#include "global.h" #include "source.h" #include "lex.h" #include "list.h" #include "irep.h" #include "parser.h" #include "lexparse.h" // The LexGen parser expects a specification as follows and builds an // internal representation for subsequent processing. // // specification ::= [ '%prelude' code '%%' ] // { [ '%tokens' | '%ignore' ] rule } // [ '%postlude' code '%%' ]. // rule ::= ruleHead '=' ruleBody [ %filter' ... '%%' ] '.'. // ruleHead ::= 'identifier'. // ruleBody ::= expression. // expression ::= termSeq [ '|' expression ]. // termSeq ::= term [ termSeq ] // term ::= char [ '-' char ] // | identifier // | '{' expression '}' // | '[' expression ']' // | '(' expression ')' . // The member functions specification() and rule() have structures // consistent with that derived from the language grammar. void LexGenParser::specification() { if (symbol() == PRELUDEMARK) { prelude = scanCode(); nextSymbol(); } while (symbol() == IDENTIFIER || symbol() == IGNOREMARK || symbol() == TOKENMARK) rule(); if (symbol() == POSTLUDEMARK) { postlude = scanCode(); nextSymbol(); } } void LexGenParser::rule() { LexRule *head; if (symbol() == Lexer::IGNOREMARK) { group = IGNORETYPE; nextSymbol(); } else if (symbol() == Lexer::TOKENMARK) { group = TOKENTYPE; nextSymbol(); } if (!(head = LexRule::ruleSet.find(spelling()))) head = new LexRule(spelling()); if (group == TOKENTYPE) head->token(); else if (group == IGNORETYPE) head->ignored(); nextSymbol(); expect(EQUAL); head->defn(expression()); if (symbol() == FILTERMARK) { // The filter mark allows for the execution of code hooks before the // generated lexer returns a token. The scanCode() member function is // invoked to scan such code fragments. head->filterIs(scanCode()); nextSymbol(); } expect(Lexer::PERIOD); } // The member functions term(), termSeq() and expression() contain code // to construct the appropriate internal representation for subsequent // conversion to an appropriate Finite State Acceptor. // Lexical items, nonterminal rules, alternatives and repetitive rules // have their respective representation: LexCharNode, LexNonTermNode, // LexSelectNode and LexRepeatNode. LexNode *LexGenParser::term() { switch (symbol()) { // A lexical term returns a LexCharNode. Where a character range is // present, it is appropriately recorded. case Lexer::CHARCONST: { char c = spelling()[0]; nextSymbol(); if (symbol() == Lexer::RANGE) { nextSymbol(); mustBe(Lexer::CHARCONST); char range = spelling()[0]; nextSymbol(); return(new LexCharNode(c, range)); } else return(new LexCharNode(c)); } // An identifier term returns a LexNonTermNode with a reference to the // original definition. However, names may be forward defined. In this // case, a stub definition is inserted and its body will be elaborated // when a definition is ultimately encountered. case Lexer::IDENTIFIER: { LexNode *n; LexRule *rule; if (!(rule = LexRule::ruleSet.find(spelling()))) rule = new LexRule(spelling()); n = new LexNonTermNode(rule); nextSymbol(); return n; } // A left brace indicates a repetitive rule. It returns a LexRepeatNode // with a reference to the repeated expression. As with most recursive // descent parsers, this is a recursive call and it accommodates all // nesting levels. case Lexer::LEFTBRACE: { LexNode *n; bool atLeastOne = FALSE; LexNode *sep = 0; nextSymbol(); n = expression(); if (symbol() == Lexer::COMMA) { nextSymbol(); sep = expression(); } expect(RIGHTBRACE); if (symbol() == Lexer::PLUS) { atLeastOne = TRUE; nextSymbol(); } else if (symbol() == Lexer::STAR) nextSymbol(); return(new LexRepeatNode(n, sep, atLeastOne)); } // A left bracket indicates an optional rule. It returns a // LexSelectNode with a reference to the optional expression. The other // option is left as 0. As with the previous repetition rule, this // is a recursive call and it accommodates all nesting levels. case Lexer::LEFTBRACKET: { LexNode *n; nextSymbol(); n = expression(); expect(RIGHTBRACKET); return(new LexSelectNode(n, 0)); } // A left parenthesis merely indicates precedence and no special // processing is required. case Lexer::LEFTPARENT: { LexNode *n; nextSymbol(); n = expression(); expect(Lexer::RIGHTPARENT); return n; } // double quoted string case Lexer::DQUOTE: { return qwString(); } // The following symbols indicate the end of an expression sequence // and thus merely result 0. case Lexer::RIGHTBRACE: case Lexer::RIGHTBRACKET: case Lexer::RIGHTPARENT: case Lexer::ALTERNATE: case Lexer::COMMA: case Lexer::FILTERMARK: case Lexer::PERIOD: return 0; } // Other symbols not covered in the above list is erroneous and is // reported as such: parseError("term expected"); nextSymbol(); return 0; } // Recognising a sequence of terms involves processing the first term, // and subsequently checking for subsequent terms via a recursive call // and linking via the next pointer of LexNode instances. LexNode *LexGenParser::termSeq() { LexNode *n; n = term(); if (!n) return 0; switch (symbol()) { // These symbols continue a terminal sequence, and in turn, continue // processing. case Lexer::IDENTIFIER: case Lexer::CHARCONST: case Lexer::LEFTBRACE: case Lexer::LEFTBRACKET: case Lexer::LEFTPARENT: while (n->next != 0) n = n->next; n->next = termSeq(); } return n; } LexNode *LexGenParser::qwString() { expect(Lexer::DQUOTE); // skip opening quote LexNode *first = 0; LexNode *last = 0; for (int i = 0; spelling()[i]; ++i) { LexNode *n = new LexCharNode(spelling()[i]); if (last) last->next = n; last = n; if (!first) first = n; } nextSymbol(); // skip closing quote expect(Lexer::DQUOTE); if (!first) parseError("empty quoted strings are not allowed"); return first; } // Since the alternative operator has the weakest precedence, it is // handled separately. LexNode *LexGenParser::expression() { LexNode *n; n = termSeq(); if (symbol() == Lexer::ALTERNATE) { nextSymbol(); return(new LexSelectNode(n, expression())); } return n; } char *LexGenParser::prelude = 0; char *LexGenParser::postlude = 0; const char *LexGenParser::currentfile = 0;