#include	<string>

#include	"global.h"
#include	"source.h"
#include	"lex.h"

// For the purpose of error reporting, it is convenient to have
// representations of the various tokens.

// Printable spellings of the tokens, one string per entry.
// Presumably indexed by Lexer::symbols value; the enumeration is
// declared elsewhere, so the order must be verified against it.
// NOTE(review): the % entries below (%sets, %ignore, %tokens) differ
// from the marks scan() looks up (ignore, prelude, postlude, filter,
// tokens) -- confirm this table is still in sync.
char *Lexer::symbolSpelling[] = {
	// token classes
	"<identifier>",
	"<charconst>",

	// brackets
	"{",
	"}",
	"[",
	"]",
	"(",
	")",

	// operators and punctuation
	"=",
	"|",
	",",
	"..",
	".",
	"+",
	"*",

	// % marks
	"%sets",
	"%ignore",
	"%tokens",

	// quotes
	"'",
	"\"",

	// end of input and the catch-all
	"<eof>",
	"??"
};

// scan() recognises the next token of the input and returns its symbol
// value.  Whitespace and comments in front of the token are skipped,
// tokenPosition is set from charPosition() at the start of the token,
// and the token text is stored in _spelling[].  On return, lookahead
// holds the first character that follows the token.
Lexer::symbols Lexer::scan()
{

// The member function scan() relies on two lookup tables for character
// to token mappings.  The charLookUp[] table maps single characters to
// token values.

	static const struct charLookUpType {
		int ch; symbols value;
	} charLookUp[] = {
		{ '=', EQUAL },
		{ ',', COMMA },
		{ '|', ALTERNATE },
		{ '{', LEFTBRACE },
		{ '}', RIGHTBRACE },
		{ '[', LEFTBRACKET },
		{ ']', RIGHTBRACKET },
		{ '(', LEFTPARENT },
		{ ')', RIGHTPARENT },
		{ '*', STAR },
		{ '+', PLUS },
		{ '\'', SQUOTE },
		{ '"', DQUOTE },
		{ EOF, EOFTOKEN }
	};
	static const int charLookUpNumbers =
		sizeof(charLookUp)/sizeof(charLookUp[0]);

// The markLookUp[] table maps % strings to token values.  The names
// are string literals, hence the pointer to const.

	static const struct markLookUpType {
		const char *m; symbols value;
	} markLookUp[] = {
		{ "ignore", IGNOREMARK },
		{ "prelude", PRELUDEMARK },
		{ "postlude", POSTLUDEMARK },
		{ "filter", FILTERMARK },
		{ "tokens", TOKENMARK }
	};
	static const int markLookUpNumbers =
		sizeof(markLookUp)/sizeof(markLookUp[0]);

// The processing for scan() first involves ignoring whitespace and
// comments; subsequently, the token position is noted.
//
// Comment conventions follow that of C++ and C.
// Input between double slashes (//) till the end of the line,
// and those between /* and */ are ignored.  Comments are skipped in a
// loop (rather than by calling scan() again) so that a long run of
// comments does not deepen the call stack.

	for (;;) {
		skipSpaces();
		tokenPosition = charPosition();
		if (lookahead != '/')
			break;

		lookahead = nextChar();
		if (lookahead == '/') {
			// Stop at the newline, or at the end of input when the
			// last line has no newline.  The newline itself is
			// consumed by skipSpaces() on the next iteration.
			do {
				lookahead = nextChar();
			} while (lookahead != '\n' && lookahead != EOF);
		} else if (lookahead == '*') {
			// The comment ends only at a '*' immediately followed
			// by a '/'; a '*' or '/' on its own is comment text.
			int previous = 0;
			lookahead = nextChar();
			while (lookahead != EOF &&
					!(previous == '*' && lookahead == '/')) {
				previous = lookahead;
				lookahead = nextChar();
			}
			if (lookahead == EOF)
				error(ERROR, "comment is not terminated\n");
			else
				lookahead = nextChar();
		} else {
			// A '/' that does not start a comment is not a token.
			// It has already been consumed, so report it as such
			// instead of silently dropping it.
			_spelling[0] = '/';
			_spelling[1] = '\0';
			return UNKNOWN;
		}
	}

// If the starting character falls into the alphanumeric
// category for identifiers, the function getSpelling() is
// invoked to read it.

	if ((lookahead >= 'a' && lookahead <= 'z') ||
		(lookahead >= 'A' && lookahead <= 'Z') ||
		lookahead == '_') {
		getSpelling();
		return IDENTIFIER;
	}

// Character constants are prefixed with the '$' literal or '#' ASCII
// value markers.  In the former case, the next character is merely
// read, whereas in the latter case, a string to integer conversion is
// necessary.

	if (lookahead == '$') {
		_spelling[0] = nextChar();
		_spelling[1] = '\0';
		lookahead = nextChar();
		return CHARCONST;
	}
	if (lookahead == '#') {
		// The value is stored in a single char of _spelling[], so
		// anything above 255 cannot be represented.  Accumulation
		// stops once that limit is passed, which also keeps a long
		// run of digits from overflowing the int.
		const int maxCode = 255;
		int value = 0;
		lookahead = nextChar();
		while (lookahead >= '0' && lookahead <= '9') {
			if (value <= maxCode)
				value = value*10 + (lookahead - '0');
			lookahead = nextChar();
		}
		if (value > maxCode)
			error(ERROR, "character code is out of range\n");
		_spelling[0] = value;
		_spelling[1] = '\0';
		return CHARCONST;
	}

// Single character tokens are recognised by scanning the
// charLookUp[] table.  The end of input is handled here as well,
// through the EOF entry.

	for (int i=0; i<charLookUpNumbers; i++)
		if (charLookUp[i].ch == lookahead) {
			_spelling[0] = lookahead;
			_spelling[1] = '\0';
			lookahead = nextChar();
			return(charLookUp[i].value);
		}

// Tokens involving two or more characters however require
// specific attention.  Here, we differentiate between '.' and '..'.

	if (lookahead == '.') {
		_spelling[0] = '.';
		lookahead = nextChar();
		if (lookahead == '.') {
			_spelling[1] = '.';
			_spelling[2] = '\0';
			lookahead = nextChar();
			return(RANGE);
		} else {
			_spelling[1] = '\0';
			return(PERIOD);
		}
	}

// % markers are checked against the markLookUp table.  An unknown
// mark is reported and returned as UNKNOWN with the mark name left
// in _spelling[]; its characters have already been consumed.

	if (lookahead == '%') {
		lookahead = nextChar();
		getSpelling();
		for (int i=0; i<markLookUpNumbers; i++)
			if (strcmp(markLookUp[i].m, _spelling) == 0)
				return(markLookUp[i].value);
		error(ERROR, "does not recognise mark `%s'\n", _spelling);
		return UNKNOWN;
	}

// If tokens cannot be recognised by this stage, it is
// erroneous and the UNKNOWN symbol is returned instead.  The
// offending character is consumed, like every other token, so that
// repeated calls make progress through the input.

	_spelling[0] = lookahead;
	_spelling[1] = '\0';
	lookahead = nextChar();
	return UNKNOWN;
}

// The getSpelling() member function scans identifier spellings into
// the character block _spelling[].

void Lexer::getSpelling()
{
	int length = 0;

	// The current lookahead is always taken as the first character;
	// after that, reading continues for as long as letters, digits
	// or underscores follow.
	for (;;) {
		// Characters beyond MAXSPELLING are consumed but not stored.
		if (length < MAXSPELLING) {
			_spelling[length] = lookahead;
			length++;
		}
		lookahead = nextChar();

		const bool isLower = (lookahead >= 'a' && lookahead <= 'z');
		const bool isUpper = (lookahead >= 'A' && lookahead <= 'Z');
		const bool isDigit = (lookahead >= '0' && lookahead <= '9');
		const bool isUnderscore = (lookahead == '_');
		if (!(isLower || isUpper || isDigit || isUnderscore))
			break;
	}

	// NOTE(review): length can equal MAXSPELLING here, so this assumes
	// _spelling[] has at least MAXSPELLING+1 elements -- confirm
	// against its declaration.
	_spelling[length] = 0;
}

// Advances lookahead past any run of blanks, tabs and newlines.
void Lexer::skipSpaces()
{
	for (;;) {
		switch (lookahead) {
		case ' ':
		case '\n':
		case '\t':
			lookahead = nextChar();
			break;
		default:
			// first character that is not whitespace: leave it
			// in lookahead
			return;
		}
	}
}

// Constructs a lexer over the file named fn, which is handed to the
// Source base class.  A source that could not be opened is reported
// through error() with ABORT severity.  The first character and the
// first symbol are then read in advance.
Lexer::Lexer(const char *fn) : Source(fn)
{
	const bool unopened = (status() == CANNOTOPEN);
	if (unopened)
		error(ABORT, "cannot open '%s'\n", fileName());

	// prime the one-character lookahead before the first symbol is read
	lookahead = nextChar();
	nextSymbol();
}

// scanCode() reads a verbatim code block, starting at the current
// lookahead, and returns it in a block obtained from allocate().  The
// returned text is the code prefixed with a '#line' directive naming
// the line (from currentLine()) and file where the block started.
char *Lexer::scanCode()
{
	// The block is gathered in a growable string, so its length is
	// not limited by a fixed-size buffer.
	std::string code;
	bool terminated = false;

	tokenPosition = currentLine();

// A code block is terminated with a double percentage (%%).  A single
// '%' is part of the code.  Reading also stops at the end of input, so
// that a missing %% cannot make the loop run forever.
// NOTE(review): no diagnostic is issued for a missing %%; lookahead is
// left at EOF for the caller to see.

	while (lookahead != EOF) {
		if (lookahead == '%') {
			lookahead = nextChar();
			if (lookahead == '%') {
				terminated = true;
				break;
			}
			code.push_back('%');
			// the character after the lone '%' is examined by
			// the next iteration (it may be the end of input)
			continue;
		}
		code.push_back(static_cast<char>(lookahead));
		lookahead = nextChar();
	}

	// Replace the second '%' of the terminator with a blank so that it
	// is not scanned as the start of a token.
	if (terminated)
		lookahead = ' ';

// The code block is allocated an appropriate memory block.  It is
// prefixed with a '#line' directive to enable any compilation errors
// to be reported with respect to the original specification file
// rather than the generated file produced by LexGen.
//
// Besides the code and the file name, the formatted text holds 11
// fixed characters, at most 11 characters for the line number and the
// terminating NUL; 32 bytes of slack cover all of them.

	char *p = allocate(code.size() + strlen(fileName()) + 32);
	sprintf(p, "\n#line %d \"%s\"\n%s", tokenPosition, fileName(),
		code.c_str());
	return p;
}


// syntax highlighted by Code2HTML, v. 0.9.1