#include "global.h"

#include <ctype.h>

#include "source.h"
#include "lex.h"

// Printable spellings of the scanner's tokens, kept for error reporting.
//
// NOTE(review): presumably indexed by Lexer::symbols value, in which case the
// order here must match that enum (declared outside this file) -- confirm.
// NOTE(review): some entries differ from what the scanner in this file
// accepts: identifyMarker() matches "tokens" and "eval" where this table
// lists "%token" and "%action", and identifyChar() recognises '-' (MINUS),
// which has no spelling here, while "/" is listed but never recognised.
// Verify before relying on these strings in diagnostics.

char *Lexer::symbolSpelling[] = {
	"<identifier>", "<charconst>",
	"{", "}", "[", "]", "(", ")",
	"=", "|", ",", "..", ".", "+", "*", "/",
	"%rules", "%token", "%inh", "%left", "%right", "%noassoc",
	"%alias", "%action", "%cond", "%attribute", "%prelude", "%postlude",
	"<eof>", "??"
};


// Scans the next token from the input and returns its symbol code.
//
// Leading whitespace and comments are skipped.  tokenPosition is set to the
// position at which the token starts.  For most tokens the text is left in
// _spelling, and lookahead is left on the first character after the token.
// Input that cannot be recognised is reported through error() and yields
// UNKNOWN.
Lexer::symbols Lexer::scan() {

	// Skip whitespace first, then note where the token begins.
	skipSpaces();
	tokenPosition = charPosition();

	// A leading letter starts an identifier; getSpelling() reads all of it.
	if (isalpha(lookahead)) {
		getSpelling();
		return IDENTIFIER;
	}

	// Character constants are introduced by '$' (the next character is
	// taken literally) or by '#' (a decimal character code follows and is
	// converted to the character it denotes).
	if (lookahead == '$') {
		_spelling[0] = nextChar();
		_spelling[1] = '\0';
		lookahead = nextChar();
		return CHARCONST;
	}
	if (lookahead == '#') {
		int value = 0;
		lookahead = nextChar();
		while (lookahead >= '0' && lookahead <= '9') {
			value = value*10 + lookahead - '0';
			lookahead = nextChar();
		}
		_spelling[0] = value;
		_spelling[1] = '\0';
		return CHARCONST;
	}

	// Comments follow the C++ and C conventions: "//" up to the end of the
	// line, and everything between "/*" and "*/", are ignored.
	if (lookahead == '/') {
		lookahead = nextChar();
		if (lookahead == '/') {
			// Discard the rest of the line.  Stopping at EOF as well keeps
			// a comment on an unterminated last line from looping forever.
			do {
				lookahead = nextChar();
			} while (lookahead != '\n' && lookahead != EOF);
			if (lookahead == '\n')
				lookahead = nextChar();
			return(scan());
		}
		if (lookahead == '*') {
			// Slide a two-character window over the input until it holds
			// exactly "*/".  Both characters must match; testing them
			// independently would end the comment at the first '*' or '/'.
			int prev = nextChar();
			int cur = nextChar();
			while (!(prev == '*' && cur == '/') && cur != EOF) {
				prev = cur;
				cur = nextChar();
			}
			if (cur == EOF) {
				error(ERROR, "unterminated comment\n");
				lookahead = cur;	// let the rescan see end of input
			} else {
				lookahead = nextChar();
			}
			return(scan());
		}

		// A '/' that does not open a comment has already been consumed.
		// Report it instead of silently dropping it and scanning on from
		// the following character.
		error(ERROR, "cannot recognize token starting '%c'\n", '/');
		_spelling[0] = '/';
		_spelling[1] = '\0';
		return UNKNOWN;
	}

	// Single character tokens are recognised here.
	Lexer::symbols s;
	if (identifyChar(s))
		return s;

	// A 'quoted' character is mapped to an identifier by char2id().
	if (lookahead == '\'') {
		const char c = nextChar();
		lookahead = nextChar();
		if (lookahead == '\'') {
			lookahead = nextChar();
			return char2id(c);
		}
		return UNKNOWN;
	}

	// A "quoted" keyword becomes an identifier spelled with a "kw_" prefix.
	if (lookahead == '"') {
		lookahead = nextChar();
		getSpelling("kw_");
		if (lookahead == '"') {
			lookahead = nextChar();
			return IDENTIFIER;
		}
		return UNKNOWN;
	}

	// Tokens of two or more characters need specific attention.  Here we
	// differentiate between '.' and '..'.
	if (lookahead == '.') {
		_spelling[0] = '.';
		lookahead = nextChar();
		if (lookahead == '.') {
			_spelling[1] = '.';
			_spelling[2] = '\0';
			lookahead = nextChar();
			return(RANGE);
		} else {
			_spelling[1] = '\0';
			return(PERIOD);
		}
	}

	// %-marker?
	if (identifyMarker(s))
		return s;

	// Anything not recognised by this stage is erroneous, and the UNKNOWN
	// symbol is returned instead.
	// NOTE(review): lookahead is not advanced on this path, so a caller
	// that keeps scanning sees the same character again unless it is
	// whitespace -- confirm that the caller copes with that.
	error(ERROR, "cannot recognize token starting '%c'\n", lookahead);

	_spelling[0] = lookahead;
	_spelling[1] = '\0';
	return UNKNOWN;
}

// The getSpelling() member function scans identifier spellings into
// the character block _spelling[].

void Lexer::getSpelling(const char *prefix)
{
	int i = 0;

	// copy prefix
	while (*prefix) _spelling[i++] = *prefix++;

	do {
		_spelling[i++] = lookahead;
		lookahead = nextChar();
	} while (isalnum(lookahead) || lookahead == '_' || lookahead == '-');

	_spelling[i] = 0;
}

// Advances lookahead past any run of whitespace characters, leaving it on
// the first character for which isspace() is false.
void Lexer::skipSpaces()
{
	for (; isspace(lookahead); lookahead = nextChar())
		;
}

// Constructs a lexer over the file named fn.
//
// The Source base is initialised with fn; a CANNOTOPEN status is reported
// through error() at ABORT severity.  The scanner is then primed: the first
// character is read into lookahead and nextSymbol() is called once.
Lexer::Lexer(char *fn) : Source(fn)
{
	const bool unopened = (status() == CANNOTOPEN);
	if (unopened) {
		error(ABORT, "cannot open '%s'\n", fileName());
	}

	// lookahead must be loaded before nextSymbol() runs.
	lookahead = nextChar();
	nextSymbol();
}

char *Lexer::scanCode()
{
	char buf[20480];
	char *p;
	int i = 0;

	tokenPosition = currentLine();

// A code block is terminated with an double percentage (%%).

	for (;;) {
		if (lookahead == '%') {
			lookahead = nextChar();
			if (lookahead == '%')
				break;
			buf[i++] = '%';
		}
		buf[i++] = lookahead;
		lookahead = nextChar();
	}
	lookahead = ' ';
	buf[i++] = '\0';

// The code block is allocated an appropriate memory block.  It is
// prefixed with a '#line' directive to enable any compilation errors
// to be reported with respect to the original specification file
// rather than the generated file produced by LexGen.

	p = allocate(i+20+strlen(fileName()));
	sprintf(p, "\n#line %d \"%s\"\n%s", tokenPosition, fileName(), buf);
	return p;
}


// Recognises single character tokens by looking lookahead up in a table.
//
// On a match the character is stored as a one-character string in
// _spelling, lookahead is advanced, the token code is stored in val and
// true is returned.  Otherwise nothing is changed and false is returned.
bool Lexer::identifyChar(Lexer::symbols &val) {

	struct SingleCharToken {
		int ch;
		symbols value;
	};

	static const SingleCharToken table[] = {
		{ '=', EQUAL },
		{ ',', COMMA },
		{ '|', ALTERNATE },
		{ '{', LEFTBRACE },
		{ '}', RIGHTBRACE },
		{ '[', LEFTBRACKET },
		{ ']', RIGHTBRACKET },
		{ '(', LEFTPARENT },
		{ ')', RIGHTPARENT },
		{ '*', STAR },
		{ '+', PLUS },
		{ '-', MINUS },
		{ EOF, EOFTOKEN }
	};
	static const SingleCharToken *const tableEnd =
		table + sizeof(table)/sizeof(table[0]);

	for (const SingleCharToken *entry = table; entry != tableEnd; ++entry) {
		if (entry->ch != lookahead)
			continue;

		// Record the spelling before lookahead is overwritten.
		_spelling[0] = lookahead;
		_spelling[1] = '\0';
		lookahead = nextChar();
		val = entry->value;
		return true;
	}
	return false;
}


// Recognises a % marker by checking the word after '%' against a table.
//
// If lookahead is not '%', nothing is consumed and false is returned.
// Otherwise the '%' is consumed and the following word is read into
// _spelling with getSpelling().  When the word is in the table its token
// code is stored in val and true is returned; when it is not, the word is
// reported through error() and false is returned, with the word already
// consumed.
bool Lexer::identifyMarker(Lexer::symbols &val) {

	// The markLookUp[] table maps % strings to token values.  The strings
	// are literals, so they are held through pointers to const.
	static const struct markLookUpType {
		const char *m; symbols value;
	} markLookUp[] = {
		{ "rules", RULESMARK },
		{ "left", LEFTASSOCMARK },
		{ "right", RIGHTASSOCMARK },
		{ "noassoc", NOASSOCMARK },
		{ "tokens", TOKENMARK },
		{ "alias", ALIASMARK },
		{ "inh", INHMARK },
		{ "attribute", ATTRIBUTEMARK },
		{ "prelude", PRELUDEMARK },
		{ "postlude", POSTLUDEMARK },
		{ "cond", CONDMARK },
		{ "eval", ACTIONMARK }
	};
	static const int markLookUpNumbers =
		sizeof(markLookUp)/sizeof(markLookUp[0]);

	if (lookahead != '%')
		return false;

	lookahead = nextChar();
	getSpelling();
	for (int i = 0; i < markLookUpNumbers; i++) {
		if (strcmp(markLookUp[i].m, _spelling) == 0) {
			val = markLookUp[i].value;
			return true;
		}
	}

	error(ERROR, "unknown mark `%s'\n", _spelling);
	return false;
}

// Converts a punctuation character to a named identifier.
//
// When c is in the table below, its name is copied into _spelling and
// IDENTIFIER is returned.  Any other character is reported through error()
// and UNKNOWN is returned.
Lexer::symbols Lexer::char2id(char c) {
	struct NamedChar {
		int ch;
		const char *name;
	};

	static NamedChar names[] = {
		{ '=', "ASGN" },
		{ ',', "COMMA" },
		{ ':', "COLON" },
		{ ';', "SEMICOLON" },
		{ '.', "PERIOD" },
		{ '#', "POUND" },
		{ '|', "ALTERNATE" },
		{ '{', "LEFTBRACE" },
		{ '}', "RIGHTBRACE" },
		{ '[', "LEFTBRACKET" },
		{ ']', "RIGHTBRACKET" },
		{ '(', "LEFTPARENT" },
		{ ')', "RIGHTPARENT" },
		{ '*', "MUL" },
		{ '+', "PLUS" },
		{ '-', "MINUS" },
		{ '/', "DIV" },
		{ '^', "POWER" },
		{ '!', "NOT" },
		{ '%', "MOD" }
	};
	static const int nameCount = sizeof(names)/sizeof(names[0]);

	// Find the first entry for c, if there is one.
	int k = 0;
	while (k < nameCount && names[k].ch != c)
		++k;

	if (k == nameCount) {
		error(ERROR, "unknown character `%c', cannot convert to an identifier\n", c);
		return UNKNOWN;
	}

	strncpy(_spelling, names[k].name, sizeof(_spelling));
	return IDENTIFIER;
}


// syntax highlighted by Code2HTML, v. 0.9.1