#include "global.h"
// NOTE(review): a bare `#include` with no header name stood here (the name
// appears to have been lost).  The three standard headers below cover the
// library names this file uses (isalpha/isalnum/isspace, EOF/sprintf,
// strcmp/strlen/strncpy) -- confirm against the original source.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include "source.h"
#include "lex.h"

// For the purpose of error reporting, it is convenient to have
// representations of the various tokens.
// NOTE(review): the empty entries may originally have carried text that was
// lost together with the header name above -- confirm against the original.
// NOTE(review): binding string literals to `char *` is rejected by C++11 and
// later; making this `const char *` requires changing the declaration in the
// class header as well.

char *Lexer::symbolSpelling[] = {
    "", "", "{", "}", "[", "]", "(", ")", "=", "|", ",", "..", ".",
    "+", "*", "/",
    "%rules", "%token", "%inh", "%left", "%right", "%noassoc", "%alias",
    "%action", "%cond", "%attribute", "%prelude", "%postlude",
    "", "??"
};

// scan() reads the next token from the input.
//
// Leading whitespace and comments are skipped, tokenPosition is set to the
// position of the token, and the token's text is left in _spelling[] where
// the token kind has a spelling.  On return, lookahead holds the first
// character that follows the token.
//
// Returns the symbol code of the token, or UNKNOWN if the input cannot be
// recognised (an error is reported in that case).

Lexer::symbols Lexer::scan()
{
    // The processing for scan() first involves ignoring whitespace.
    // Subsequently, the token position is noted.

    skipSpaces();
    tokenPosition = charPosition();

    // If the starting character falls into the alphabetic category for
    // identifiers, the function getSpelling() is invoked to read it.

    if (isalpha(lookahead)) {
        getSpelling();
        return IDENTIFIER;
    }

    // Character constants are prefixed with the '$' literal or '#' ASCII
    // value markers.  In the former case, the next character is merely
    // read, whereas in the latter case, a string to integer conversion is
    // necessary.

    if (lookahead == '$') {
        _spelling[0] = nextChar();
        _spelling[1] = '\0';
        lookahead = nextChar();
        return CHARCONST;
    }

    if (lookahead == '#') {
        int value = 0;
        lookahead = nextChar();
        while (lookahead >= '0' && lookahead <= '9') {
            value = value*10 + lookahead - '0';
            lookahead = nextChar();
        }
        _spelling[0] = value;
        _spelling[1] = '\0';
        return CHARCONST;
    }

    // Comment conventions follow that of C++ and C.
    // Input between double slashes (//) till the end of the line,
    // and those between /* and */ are ignored.

    if (lookahead == '/') {
        lookahead = nextChar();

        if (lookahead == '/') {
            // Stop at end of input as well as at end of line, so that a
            // comment on an unterminated last line cannot loop forever.
            do {
                lookahead = nextChar();
            } while (lookahead != '\n' && lookahead != EOF);
            if (lookahead == '\n')
                lookahead = nextChar();
            return scan();
        }

        if (lookahead == '*') {
            // prev/cur form a two-character window over the comment body.
            // The comment ends only when the window holds exactly "*/";
            // the loop must therefore continue while that is NOT the case.
            int prev = nextChar();
            int cur = (prev == EOF) ? EOF : nextChar();
            while (cur != EOF && !(prev == '*' && cur == '/')) {
                prev = cur;
                cur = nextChar();
            }
            if (cur == EOF) {
                error(ERROR, "unterminated comment\n");
                lookahead = EOF;
            } else {
                lookahead = nextChar();
            }
            return scan();
        }

        // A '/' that does not open a comment is not a token of its own.
        // Report it instead of silently dropping it; lookahead already
        // holds the character after the slash.
        error(ERROR, "cannot recognize token starting '%c'\n", '/');
        _spelling[0] = '/';
        _spelling[1] = '\0';
        return UNKNOWN;
    }

    // Single character tokens are recognised here

    Lexer::symbols s;
    if (identifyChar(s))
        return s;

    // see if it is a 'quoted' char
    if (lookahead == '\'') {
        const char c = nextChar();
        lookahead = nextChar();
        if (lookahead == '\'') {
            lookahead = nextChar();
            return char2id(c);
        }
        return UNKNOWN;
    }

    // see if it is a "quoted" keyword
    if (lookahead == '"') {
        lookahead = nextChar();
        getSpelling("kw_");
        if (lookahead == '"') {
            lookahead = nextChar();
            return IDENTIFIER;
        }
        return UNKNOWN;
    }

    // Tokens involving two or more characters however require
    // specific attention.  Here, we differentiate between '.' and '..'.

    if (lookahead == '.') {
        _spelling[0] = '.';
        lookahead = nextChar();
        if (lookahead == '.') {
            _spelling[1] = '.';
            _spelling[2] = '\0';
            lookahead = nextChar();
            return(RANGE);
        } else {
            _spelling[1] = '\0';
            return(PERIOD);
        }
    }

    // %-marker?  Remember whether one was attempted: identifyMarker()
    // consumes the marker text even when it does not recognise it.

    const bool markerAttempted = (lookahead == '%');
    if (identifyMarker(s))
        return s;

    // If tokens cannot be recognised by this stage, it is
    // erroneous and the UNKNOWN symbol is returned instead.

    error(ERROR, "cannot recognize token starting '%c'\n", lookahead);
    _spelling[0] = lookahead;
    _spelling[1] = '\0';

    // Step over the offending character; otherwise every following call
    // would see it again and report UNKNOWN forever.  After a failed marker
    // the input has already advanced, so nothing more is skipped.
    if (!markerAttempted)
        lookahead = nextChar();
    return UNKNOWN;
}

// The getSpelling() member function scans identifier spellings into
// the character block _spelling[].

void Lexer::getSpelling(const char *prefix) { int i = 0; // copy prefix while (*prefix) _spelling[i++] = *prefix++; do { _spelling[i++] = lookahead; lookahead = nextChar(); } while (isalnum(lookahead) || lookahead == '_' || lookahead == '-'); _spelling[i] = 0; } void Lexer::skipSpaces() { while (isspace(lookahead)) lookahead = nextChar(); } Lexer::Lexer(char *fn) : Source(fn) { if (status() == CANNOTOPEN) error(ABORT, "cannot open '%s'\n", fileName()); lookahead = nextChar(); nextSymbol(); } char *Lexer::scanCode() { char buf[20480]; char *p; int i = 0; tokenPosition = currentLine(); // A code block is terminated with an double percentage (%%). for (;;) { if (lookahead == '%') { lookahead = nextChar(); if (lookahead == '%') break; buf[i++] = '%'; } buf[i++] = lookahead; lookahead = nextChar(); } lookahead = ' '; buf[i++] = '\0'; // The code block is allocated an appropriate memory block. It is // prefixed with a '#line' directive to enable any compilation errors // to be reported with respect to the original specification file // rather than the generated file produced by LexGen. p = allocate(i+20+strlen(fileName())); sprintf(p, "\n#line %d \"%s\"\n%s", tokenPosition, fileName(), buf); return p; } // Single character tokens are recognised by scanning the // charLookUp[] table. 
bool Lexer::identifyChar(Lexer::symbols &val) { static struct charLookUpType { int ch; symbols value; } charLookUp[] = { { '=', EQUAL }, { ',', COMMA }, { '|', ALTERNATE }, { '{', LEFTBRACE }, { '}', RIGHTBRACE }, { '[', LEFTBRACKET }, { ']', RIGHTBRACKET }, { '(', LEFTPARENT }, { ')', RIGHTPARENT }, { '*', STAR }, { '+', PLUS }, { '-', MINUS }, { EOF, EOFTOKEN } }; static int charLookUpNumbers = sizeof(charLookUp)/sizeof(struct charLookUpType); for (int i = 0; i < charLookUpNumbers; i++) { if (charLookUp[i].ch == lookahead) { _spelling[0] = lookahead; _spelling[1] = '\0'; lookahead = nextChar(); val = charLookUp[i].value; return true; } } return false; } // % markers are checked against the markLookUp table bool Lexer::identifyMarker(Lexer::symbols &val) { // The markLookUp[] table maps % strings to token values. static struct markLookUpType { char *m; symbols value; } markLookUp[] = { { "rules", RULESMARK }, { "left", LEFTASSOCMARK }, { "right", RIGHTASSOCMARK }, { "noassoc", NOASSOCMARK }, { "tokens", TOKENMARK }, { "alias", ALIASMARK }, { "inh", INHMARK }, { "attribute", ATTRIBUTEMARK }, { "prelude", PRELUDEMARK }, { "postlude", POSTLUDEMARK }, { "cond", CONDMARK }, { "eval", ACTIONMARK } }; static int markLookUpNumbers = sizeof(markLookUp)/sizeof(struct markLookUpType); if (lookahead == '%') { lookahead = nextChar(); getSpelling(); for (int i = 0; i < markLookUpNumbers; i++) { if (strcmp(markLookUp[i].m, _spelling) == 0) { val = markLookUp[i].value; return true; } } error(ERROR, "unknown mark `%s'\n", _spelling); } return false; } Lexer::symbols Lexer::char2id(char c) { static struct charLookUpType { int ch; const char * value; } charLookUp[] = { { '=', "ASGN" }, { ',', "COMMA" }, { ':', "COLON" }, { ';', "SEMICOLON" }, { '.', "PERIOD" }, { '#', "POUND" }, { '|', "ALTERNATE" }, { '{', "LEFTBRACE" }, { '}', "RIGHTBRACE" }, { '[', "LEFTBRACKET" }, { ']', "RIGHTBRACKET" }, { '(', "LEFTPARENT" }, { ')', "RIGHTPARENT" }, { '*', "MUL" }, { '+', "PLUS" }, { '-', 
"MINUS" }, { '/', "DIV" }, { '^', "POWER" }, { '!', "NOT" }, { '%', "MOD" } }; static const int charLookUpNumbers = sizeof(charLookUp)/sizeof(struct charLookUpType); for (int i = 0; i < charLookUpNumbers; ++i) { if (charLookUp[i].ch == c) { strncpy(_spelling, charLookUp[i].value, sizeof(_spelling)); return IDENTIFIER; } } error(ERROR, "unknown character `%c', cannot convert to an identifier\n", c); return UNKNOWN; }