#include "global.h"
#include <ctype.h>
#include "source.h"
#include "lex.h"
// For the purpose of error reporting, it is convenient to have
// printable representations of the various tokens.
//
// NOTE(review): presumably indexed by Lexer::symbols value, so the order
// here has to match the enum declared in the header -- confirm before
// adding or reordering entries.
// NOTE(review): a few spellings disagree with the lookup tables later in
// this file: "%token" here vs. "tokens" in identifyMarker(), "%action" here
// vs. "eval" there, and "/" appears here while identifyChar() recognises
// '-' (MINUS) but not '/'. Verify which side is intended.
char *Lexer::symbolSpelling[] = {
"<identifier>", "<charconst>",
"{", "}", "[", "]", "(", ")",
"=", "|", ",", "..", ".", "+", "*", "/",
"%rules", "%token", "%inh", "%left", "%right", "%noassoc",
"%alias", "%action", "%cond", "%attribute", "%prelude", "%postlude",
"<eof>", "??"
};
// Reads the next token from the input and returns its symbol code.
//
// On return, _spelling holds the token's text where one is produced
// (identifiers, character constants, single-character tokens, '.' and
// '..'), tokenPosition holds the position at which the token started, and
// lookahead holds the first character after the token.
// Comments are skipped transparently. Input that cannot be tokenised
// yields UNKNOWN.
Lexer::symbols Lexer::scan() {
    // Ignore leading whitespace, then note where the token starts.
    skipSpaces();
    tokenPosition = charPosition();

    // If the starting character falls into the alphabetic category,
    // getSpelling() is invoked to read the whole identifier.
    if (isalpha(lookahead)) {
        getSpelling();
        return IDENTIFIER;
    }

    // Character constants are prefixed with the '$' literal or '#' ASCII
    // value markers. In the former case the next character is taken
    // verbatim, whereas in the latter case a decimal string to integer
    // conversion is necessary.
    if (lookahead == '$') {
        _spelling[0] = nextChar();
        _spelling[1] = '\0';
        lookahead = nextChar();
        return CHARCONST;
    }
    if (lookahead == '#') {
        int value = 0;
        lookahead = nextChar();
        while (lookahead >= '0' && lookahead <= '9') {
            value = value*10 + lookahead - '0';
            lookahead = nextChar();
        }
        _spelling[0] = value;
        _spelling[1] = '\0';
        return CHARCONST;
    }

    // Comment conventions follow those of C++ and C: input from a double
    // slash (//) to the end of the line, and input between /* and */, is
    // ignored. End of input is detected by comparing against EOF, the same
    // way identifyChar() does.
    if (lookahead == '/') {
        lookahead = nextChar();
        if (lookahead == '/') {
            // Stop at end of input too, so that a comment on the last line
            // without a trailing newline cannot loop forever.
            do {
                lookahead = nextChar();
            } while (lookahead != '\n' && lookahead != EOF);
            if (lookahead == '\n')
                lookahead = nextChar();
            return scan();
        } else if (lookahead == '*') {
            // Slide a two-character window over the input until it holds
            // exactly "*/". Both characters must match: stopping on either
            // one alone would end the comment at the first '*' or '/'.
            int previous = nextChar();
            int current = (previous == EOF) ? EOF : nextChar();
            while (current != EOF && !(previous == '*' && current == '/')) {
                previous = current;
                current = nextChar();
            }
            if (current == EOF) {
                error(ERROR, "unterminated comment\n");
                lookahead = EOF;
            } else {
                lookahead = nextChar();
            }
            return scan();
        }
        // NOTE(review): a '/' that does not start a comment has already
        // been consumed at this point and is dropped; scanning carries on
        // with the character that followed it. Confirm whether a separate
        // token was intended for '/'.
    }

    // Single character tokens are recognised here.
    Lexer::symbols s;
    if (identifyChar(s))
        return s;

    // A 'quoted' character is converted to a named identifier.
    if (lookahead == '\'') {
        const char c = nextChar();
        lookahead = nextChar();
        if (lookahead == '\'') {
            lookahead = nextChar();
            return char2id(c);
        }
        return UNKNOWN;
    }

    // A "quoted" keyword becomes an identifier whose spelling carries the
    // "kw_" prefix.
    if (lookahead == '"') {
        lookahead = nextChar();
        getSpelling("kw_");
        if (lookahead == '"') {
            lookahead = nextChar();
            return IDENTIFIER;
        }
        return UNKNOWN;
    }

    // Tokens involving two or more characters require specific attention.
    // Here we differentiate between '.' and '..'.
    if (lookahead == '.') {
        _spelling[0] = '.';
        lookahead = nextChar();
        if (lookahead == '.') {
            _spelling[1] = '.';
            _spelling[2] = '\0';
            lookahead = nextChar();
            return RANGE;
        }
        _spelling[1] = '\0';
        return PERIOD;
    }

    // %-marker?
    if (identifyMarker(s))
        return s;

    // If the token cannot be recognised by this stage, it is erroneous and
    // the UNKNOWN symbol is returned instead.
    error(ERROR, "cannot recognize token starting '%c'\n", lookahead);
    _spelling[0] = lookahead;
    _spelling[1] = '\0';
    return UNKNOWN;
}
// The getSpelling() member function scans identifier spellings into
// the character block _spelling[].
void Lexer::getSpelling(const char *prefix)
{
int i = 0;
// copy prefix
while (*prefix) _spelling[i++] = *prefix++;
do {
_spelling[i++] = lookahead;
lookahead = nextChar();
} while (isalnum(lookahead) || lookahead == '_' || lookahead == '-');
_spelling[i] = 0;
}
// Advances lookahead past any run of whitespace characters (as classified
// by isspace()), leaving it on the first non-space character.
void Lexer::skipSpaces()
{
    for (;;) {
        if (!isspace(lookahead)) {
            return;
        }
        lookahead = nextChar();
    }
}
// Constructs a lexer over the source file named fn.
// Reports an ABORT-level error if the file could not be opened, then
// primes the scanner by loading the first character into lookahead and
// fetching the first symbol.
Lexer::Lexer(char *fn)
    : Source(fn)
{
    const bool openFailed = (status() == CANNOTOPEN);
    if (openFailed) {
        error(ABORT, "cannot open '%s'\n", fileName());
    }

    lookahead = nextChar();
    nextSymbol();
}
char *Lexer::scanCode()
{
char buf[20480];
char *p;
int i = 0;
tokenPosition = currentLine();
// A code block is terminated with an double percentage (%%).
for (;;) {
if (lookahead == '%') {
lookahead = nextChar();
if (lookahead == '%')
break;
buf[i++] = '%';
}
buf[i++] = lookahead;
lookahead = nextChar();
}
lookahead = ' ';
buf[i++] = '\0';
// The code block is allocated an appropriate memory block. It is
// prefixed with a '#line' directive to enable any compilation errors
// to be reported with respect to the original specification file
// rather than the generated file produced by LexGen.
p = allocate(i+20+strlen(fileName()));
sprintf(p, "\n#line %d \"%s\"\n%s", tokenPosition, fileName(), buf);
return p;
}
// Tries to recognise the lookahead character as a single-character token
// (end of input, EOF, is treated as one of them).
//
// On a match the character is stored in _spelling, lookahead is advanced,
// val receives the token's symbol and true is returned. Otherwise nothing
// is changed and false is returned.
bool Lexer::identifyChar(Lexer::symbols &val) {
    struct SingleCharToken {
        int ch;
        symbols value;
    };
    static SingleCharToken tokens[] = {
        { '=', EQUAL },
        { ',', COMMA },
        { '|', ALTERNATE },
        { '{', LEFTBRACE },
        { '}', RIGHTBRACE },
        { '[', LEFTBRACKET },
        { ']', RIGHTBRACKET },
        { '(', LEFTPARENT },
        { ')', RIGHTPARENT },
        { '*', STAR },
        { '+', PLUS },
        { '-', MINUS },
        { EOF, EOFTOKEN }
    };
    static SingleCharToken *const tokensEnd =
        tokens + sizeof(tokens)/sizeof(tokens[0]);

    for (SingleCharToken *entry = tokens; entry != tokensEnd; ++entry) {
        if (entry->ch != lookahead)
            continue;

        _spelling[0] = lookahead;
        _spelling[1] = '\0';
        lookahead = nextChar();
        val = entry->value;
        return true;
    }
    return false;
}
// % markers are checked against the markLookUp table.
//
// If lookahead is not '%', nothing is consumed and false is returned.
// Otherwise the '%' is skipped and the name after it is read into
// _spelling. When the name is a known marker, val receives its symbol and
// true is returned. When it is not, an error is reported and false is
// returned.
//
// NOTE(review): in the unknown-marker case scan() goes on to report a
// second "cannot recognize token" error for the character that follows the
// name -- confirm whether that double report is intended.
bool Lexer::identifyMarker(Lexer::symbols &val) {
    // The markLookUp[] table maps % strings to token values. The names are
    // string literals, hence pointers to const.
    static const struct markLookUpType {
        const char *m; symbols value;
    } markLookUp[] = {
        { "rules", RULESMARK },
        { "left", LEFTASSOCMARK },
        { "right", RIGHTASSOCMARK },
        { "noassoc", NOASSOCMARK },
        { "tokens", TOKENMARK },
        { "alias", ALIASMARK },
        { "inh", INHMARK },
        { "attribute", ATTRIBUTEMARK },
        { "prelude", PRELUDEMARK },
        { "postlude", POSTLUDEMARK },
        { "cond", CONDMARK },
        { "eval", ACTIONMARK }
    };
    static const int markLookUpNumbers =
        sizeof(markLookUp)/sizeof(markLookUp[0]);

    if (lookahead != '%')
        return false;

    lookahead = nextChar();
    getSpelling();
    for (int i = 0; i < markLookUpNumbers; i++) {
        if (strcmp(markLookUp[i].m, _spelling) == 0) {
            val = markLookUp[i].value;
            return true;
        }
    }
    error(ERROR, "unknown mark `%s'\n", _spelling);
    return false;
}
// Converts a quoted punctuation character into a named identifier.
//
// When c is found in the table below, its name is copied into _spelling and
// IDENTIFIER is returned. Otherwise an error is reported, _spelling is left
// untouched and UNKNOWN is returned.
Lexer::symbols Lexer::char2id(char c) {
    struct PunctuationName {
        int ch;
        const char * value;
    };
    static PunctuationName names[] = {
        { '=', "ASGN" },
        { ',', "COMMA" },
        { ':', "COLON" },
        { ';', "SEMICOLON" },
        { '.', "PERIOD" },
        { '#', "POUND" },
        { '|', "ALTERNATE" },
        { '{', "LEFTBRACE" },
        { '}', "RIGHTBRACE" },
        { '[', "LEFTBRACKET" },
        { ']', "RIGHTBRACKET" },
        { '(', "LEFTPARENT" },
        { ')', "RIGHTPARENT" },
        { '*', "MUL" },
        { '+', "PLUS" },
        { '-', "MINUS" },
        { '/', "DIV" },
        { '^', "POWER" },
        { '!', "NOT" },
        { '%', "MOD" }
    };
    static const int nameCount = sizeof(names)/sizeof(names[0]);

    // Linear search: stop at the first entry whose character equals c.
    int found = 0;
    while (found < nameCount && names[found].ch != c)
        ++found;

    if (found == nameCount) {
        error(ERROR, "unknown character `%c', cannot convert to an identifier\n", c);
        return UNKNOWN;
    }

    strncpy(_spelling, names[found].value, sizeof(_spelling));
    return IDENTIFIER;
}
// syntax highlighted by Code2HTML, v. 0.9.1