# Input Module
# ============

# Module variable:
#
#	$inputLine	-- the unprocessed part of the latest line
#			   of input to have been read
#
# Exported functions:
#
#	&INInit		-- module initialization function
#	&INGetToken	-- returns one token read from the input.


# INInit -- module initialization function.
#
# Resets the module variable $inputLine to the empty string, so that
# the next call to INGetToken starts by reading a fresh line from STDIN
# instead of continuing with text left over from an earlier scan.
# Takes no arguments.
sub INInit
{
    return ($inputLine = q{});
}


# INGetToken scans text read from the STDIN input stream and
# returns one token.
#
# A token is implemented by a (tokentype, tokentext) pair, where:
#
#   1.  The possible tokentypes are: 'EOF', 'TAG', 'SPACE' or 'TEXT',
#       which indicate an end-of-file, an HTML tag, a block of
#       white-space characters or a block of other text, respectively.
#
#   2.  The tokentext component is the sequence of characters that
#       comprise the token.  These characters either form a complete
#       HTML tag (including the opening and closing angle brackets),
#       or a group of consecutive white-space characters,
#       or a group of non-white-space and non-tag characters.
#       (The tokentext component is undefined if the tokentype
#       is EOF.)
#
# Input is buffered one line at a time in the module variable
# $inputLine; each call removes the returned token from the front of
# that buffer.  Because patterns are only ever applied to the current
# line, a tag is recognized only when its '<' and '>' are on the same
# line.  A '<' with no closing '>' on the rest of the line is returned
# on its own as a one-character TEXT token.
#
# Dies if the front of the buffer matches none of the token patterns.

sub INGetToken {
    my ($token, $tokenType);

    # Refill the buffer once the previous line has been used up.
    # The buffer is undefined after a read at end-of-file, so test
    # definedness before asking for its length.
    if (!defined($inputLine) || length($inputLine) == 0) {
        $inputLine = <STDIN>;
    }

    # Decide end-of-file by definedness/length, never by truthiness:
    # a final line consisting of "0" with no trailing newline is
    # false in boolean context but is still text to be returned.
    if (!defined($inputLine) || length($inputLine) == 0) {
        return ('EOF', undef);
    }

    # Each substitution strips the token from the front of the buffer
    # and captures it in $1.
    if ($inputLine =~ s/^(\s+)//) {
        # leading white space is extracted as the token
        $token     = $1;
        $tokenType = 'SPACE';
    } elsif ($inputLine =~ s/^(<[^>]*>)//) {
        # an HTML tag is extracted as the token
        $token     = $1;
        $tokenType = 'TAG';
    } elsif ($inputLine =~ s/^(<)//) {
        # an unmatched left angle bracket is returned by itself as
        # text; whatever follows it is tokenized by later calls
        $token     = $1;
        $tokenType = 'TEXT';
    } elsif ($inputLine =~ s/^([^ \t\n\r\f<]+)//) {
        # leading non-spaces & non-tag are returned
        $token     = $1;
        $tokenType = 'TEXT';
    } else {
        die "Input line failed match any pattern\n";
    }

    return ($tokenType, $token);
}

1;
