#!/usr/bin/env python

"""Toy Parser Generator is a lexical and syntactic parser generator for Python.

This generator was born from a simple statement: YACC is too complex to use
in simple cases (calculators, configuration files, small programming
languages, ...).

With TPG you can very simply write parsers that are useful for most everyday
needs (even if it can't make your coffee). With a very clear and simple
syntax, you can write an attributed grammar that is translated into a
recursive descent parser. The code generated by TPG stays very close to the
original grammar. This means that the parser works "like" the grammar. A
grammar rule can be seen as a method of the parser class, symbols as method
calls, attributes as method parameters and semantic values as return values.
You can also add Python code directly into grammar rules and build abstract
syntax trees while parsing.
"""

# Toy Parser Generator: A Python parser generator
# Copyright (C) 2002 Christophe Delord
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# For further information about TPG you can visit
# http://christophe.delord.free.fr/en/tpg

# TODO:
#   - indent and dedent preprocessor
#

from __future__ import generators

__tpgname__ = 'TPG'
__version__ = '3.1.1'
__date__ = '2006-10-07'
__description__ = "A Python parser generator"
__long_description__ = __doc__
__license__ = 'LGPL'
__author__ = 'Christophe Delord'
__email__ = 'christophe.delord@free.fr'
__url__ = 'http://christophe.delord.free.fr/en/tpg/'

import parser
import re
import sre_parse
import sys

# Compatibility shim: provide enumerate on Python versions that lack the builtin
try:
    enumerate
except NameError:
    enumerate = lambda seq: zip(xrange(sys.maxint), seq)

_id = lambda x: x

tab = " "*4

class Error(Exception):
    """ Error((line, column), msg)

    Error is the base class for TPG exceptions.

    Attributes:
        line   : line number from where the error has been raised
        column : column number from where the error has been raised
        msg    : message associated to the error
    """
    def __init__(self, (line, column), msg):
        self.line, self.column = line, column
        self.msg = msg
    def __str__(self):
        return "%s at line %s, column %s: %s"%(self.__class__.__name__, self.line, self.column, self.msg)

class WrongToken(Error):
    """ WrongToken()

    WrongToken is raised when the parser can not continue, so that it can backtrack.
    """
    def __init__(self):
        Exception.__init__(self)

class LexicalError(Error):
    """ LexicalError((line, column), msg)

    LexicalError is raised by lexers when a lexical error is encountered.

    Attributes:
        line   : line number from where the error has been raised
        column : column number from where the error has been raised
        msg    : message associated to the error
    """
    pass

class SyntacticError(Error):
    """ SyntacticError((line, column), msg)

    SyntacticError is raised by parsers when they fail.
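    Example (informal sketch; `my_parser' stands for any generated parser
    instance and is not defined in this module):

        try:
            result = my_parser(some_input)
        except tpg.SyntacticError, e:
            print e     # "SyntacticError at line L, column C: Syntax error near ..."
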
Attributes: line : line number from where the error has been raised column : column number from where the error has been raised msg : message associated to the error """ pass class SemanticError(Error): """ SemanticError(msg) SemanticError is raised by user actions when an error is detected. Attributes: msg : message associated to the error """ def __init__(self, msg): Exception.__init__(self) self.msg = msg def __str__(self): return "%s: %s"%(self.__class__.__name__, self.msg) class LexerOptions: """ LexerOptions(word_bounded, compile_options) LexerOptions is a base class for lexers holding lexers' options. Parameters: word_bounded : if True identifier like regular expressions are added word boundaries compile_options : options given to re.compile to compile regular expressions """ word_re = re.compile(r"^\w+$") def __init__(self, wb, compile_options): if not wb: self.word_bounded = self.not_word_bounded self.compile_options = compile_options def re_compile(self, expr): """ compile expr using self.compile_options as re.compile options """ return re.compile(expr, self.compile_options) def word_bounded(self, expr): """ add word boundaries (\\b) to expr if it looks like an identifier """ if self.word_re.match(expr): return r"\b%s\b"%expr else: return expr def not_word_bounded(self, expr): """ return expr without change. Used to replace word_bounded when wb is False """ return expr class NamedGroupLexer(LexerOptions): r""" NamedGroupLexer(word_bounded, compile_options) NamedGroupLexer is a TPG lexer: - use named group regular expressions (faster but limited to 100 tokens) Attributes: token_re : regular expression containing the whole lexer tokens : dictionnary name -> (value, is_real_token) name is a token name value is a function that compute the value of a token from its text is_real_token is a boleean. True for tokens, False for separators Once the lexer is started more attributes are defined: input : input string being parsed max_pos : maximum position reached in the input string last_token : last token reached in the input string pos : position in the input string of the current token line : line of the current token column : column of the current token cur_token : current token """ def __init__(self, wb, compile_options): LexerOptions.__init__(self, wb, compile_options) self.token_re = [] # [named_regexp] and then regexp self.tokens = {} # name -> value, is_real_token def def_token(self, name, expr, value=_id): """ add a new token to the lexer Parameters: name : name of the token expr : regular expression of the token value : function to compute the token value from its text The default for value is the identity function. If value is not callable it is returned whatever the text of the token. """ if not callable(value): value = lambda _, value=value: value if name not in self.tokens: self.token_re.append("(?P<%s>%s)"%(name, self.word_bounded(expr))) self.tokens[name] = value, True else: raise SemanticError("Duplicate token definition (%s)"%name) def def_separator(self, name, expr, value=_id): """ add a new separator to the lexer Parameters: name : name of the separator expr : regular expression of the separator value : function to compute the separator value from its text The default for value is the identity function. If value is not callable it is returned whatever the text of the separator. Note that separator values are ignored. 
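
        Example (informal sketch of the def_token/def_separator calls defined
        above; the token names and input are only illustrative):

            lexer = NamedGroupLexer(True, 0)
            lexer.def_token('number', r'\d+', int)
            lexer.def_separator('spaces', r'\s+')
            lexer.start("1 2 3")    # spaces are skipped, number values are ints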
""" if not callable(value): value = lambda _, value=value: value if name not in self.tokens: self.token_re.append("(?P<%s>%s)"%(name, self.word_bounded(expr))) self.tokens[name] = value, False else: raise SemanticError("Duplicate token definition (%s)"%name) def build(self): """ build the token_re attribute from the tokens and separators """ if isinstance(self.token_re, list): self.token_re = self.re_compile("|".join(self.token_re)) def start(self, input): """ start a lexical analysis Parameters: input : input string to be parsed """ self.input = input self.max_pos = 0 self.last_token = None self.build() self.back(None) self.next() def eof(self): """ True if the current position of the lexer is the end of the input string """ return self.pos >= len(self.input) and isinstance(self.cur_token, EOFToken) def back(self, token): """ change the current token to token (used for backtracking) """ if token is None: self.pos = 0 self.line, self.column = 1, 1 self.cur_token = None else: self.pos = token.stop self.line, self.column = token.end_line, token.end_column self.cur_token = token def next(self): """ return the next token Tokens are Token instances. Separators are ignored. """ if self.cur_token is None: prev_stop = 0 else: prev_stop = self.cur_token.stop while True: if self.pos >= len(self.input): self.cur_token = EOFToken(self.line, self.column, self.pos, prev_stop) return self.cur_token tok = self.token_re.match(self.input, self.pos) if tok: name = tok.lastgroup text = tok.group() value, real_token = self.tokens[name] try: value = value(text) except WrongToken: raise LexicalError((self.line, self.column), "Lexical error in %s"%text) start, stop = tok.span() self.pos = stop tok_line, tok_column = self.line, self.column if '\n' in text: self.line += text.count('\n') self.column = len(text) - text.rfind('\n') else: self.column += len(text) if real_token: self.cur_token = Token(name, text, value, tok_line, tok_column, self.line, self.column, start, stop, prev_stop) if self.pos > self.max_pos: self.max_pos = self.pos self.last_token = self.cur_token return self.cur_token else: w = 20 nl = self.input.find('\n', self.pos, self.pos+w) if nl > -1: err = self.input[self.pos:nl] else: err = self.input[self.pos:self.pos+w] raise LexicalError((self.line, self.column), "Lexical error near %s"%err) def token(self): """ return the current token """ return self.cur_token def extract(self, start, stop): """ extract text from the input string Parameters: start : token from which the extraction starts stop : token where the extraction stops """ return self.input[start.start:stop.prev_stop] class Lexer(NamedGroupLexer): r""" Lexer(word_bounded, compile_options) Lexer is a TPG lexer: - based on NamedGroupLexer - doesn't use named group regular expressions (slower but not limited to 100 tokens) - select the longuest match so the order of token definitions doesn't mater Attributes: tokens : list (name, regexp, value, is_real_token) name is a token name regexp is the regular expression of the token value is a function that computes the value of a token from its text is_real_token is a boleean. 
True for tokens, False for separators Once the lexer is started more attributes are defined: input : input string being parsed max_pos : maximum position reached in the input string last_token : last token reached in the input string pos : position in the input string of the current token line : line of the current token column : column of the current token cur_token : current token """ def __init__(self, wb, compile_options): LexerOptions.__init__(self, wb, compile_options) self.tokens = [] # [(name, regexp, value, is_real_token)] def def_token(self, name, expr, value=_id): """ adds a new token to the lexer Parameters: name : name of the token expr : regular expression of the token value : function to compute the token value from its text The default for value is the identity function. If value is not callable it is returned whatever the text of the token. """ if not callable(value): value = lambda _, value=value: value if name not in self.tokens: self.tokens.append((name, self.re_compile(self.word_bounded(expr)), value, True)) else: raise SemanticError("Duplicate token definition (%s)"%name) def def_separator(self, name, expr, value=_id): """ add a new separator to the lexer Parameters: name : name of the separator expr : regular expression of the separator value : function to compute the separator value from its text The default for value is the identity function. If value is not callable it is returned whatever the text of the separator. Note that separator values are ignored. """ if not callable(value): value = lambda _, value=value: value if name not in self.tokens: self.tokens.append((name, self.re_compile(self.word_bounded(expr)), value, False)) else: raise SemanticError("Duplicate token definition (%s)"%name) def start(self, input): """ start a lexical analysis Parameters: input : input string to be parsed """ self.input = input self.max_pos = 0 self.last_token = None self.back(None) self.next() def next(self): """ return the next token Tokens are Token instances. Separators are ignored. 
""" if self.cur_token is None: prev_stop = 0 else: prev_stop = self.cur_token.stop while True: if self.pos >= len(self.input): self.cur_token = EOFToken(self.line, self.column, self.pos, prev_stop) return self.cur_token tok = None text = "" for _name, _regexp, _value, _is_real_token in self.tokens: _tok = _regexp.match(self.input, self.pos) if _tok: _text = _tok.group() if len(_text) > len(text): tok = _tok name = _name text = _text value = _value real_token = _is_real_token if tok: try: value = value(text) except WrongToken: raise LexicalError((self.line, self.column), "Lexical error in %s"%text) start, stop = tok.span() self.pos = stop tok_line, tok_column = self.line, self.column if '\n' in text: self.line += text.count('\n') self.column = len(text) - text.rfind('\n') else: self.column += len(text) if real_token: self.cur_token = Token(name, text, value, tok_line, tok_column, self.line, self.column, start, stop, prev_stop) if self.pos > self.max_pos: self.max_pos = self.pos self.last_token = self.cur_token return self.cur_token else: w = 20 nl = self.input.find('\n', self.pos, self.pos+w) if nl > -1: err = self.input[self.pos:nl] else: err = self.input[self.pos:self.pos+w] raise LexicalError((self.line, self.column), "Lexical error near %s"%err) class CacheNamedGroupLexer(NamedGroupLexer): r""" CacheNamedGroupLexer(word_bounded, compile_options) CacheNamedGroupLexer is a TPG lexer: - based on NamedGroupLexer - the complete token list is built before parsing (faster with very ambigous grammars but needs more memory) Attributes: token_re : regular expression containing the whole lexer tokens : dictionnary name -> (value, is_real_token) name is a token name value is a function that computes the value of a token from its text is_real_token is a boleean. True for tokens, False for separators cache : token list Once the lexer is started more attributes are defined: input : input string being parsed max_pos : maximum position reached in the input string last_token : last token reached in the input string pos : position in the input string of the current token line : line of the current token column : column of the current token cur_token : current token """ def __init__(self, wb, compile_options): NamedGroupLexer.__init__(self, wb, compile_options) def start(self, input): """ start a lexical analysis Parameters: input : input string to be parsed """ self.cache = [] self.input = input self.max_pos = 0 self.last_token = None self.build() self.back(None) while True: token = NamedGroupLexer.next(self) token.index = len(self.cache) self.cache.append(token) if isinstance(token, EOFToken): break self.max_pos = 0 self.last_token = None self.back(None) self.next() def next(self): """ return the next token Tokens are Token instances. Separators are ignored. 
""" if self.cur_token is None: index = 0 else: index = self.cur_token.index+1 token = self.cache[index] self.pos = token.stop self.line, self.column = token.line, token.column self.cur_token = token if self.pos > self.max_pos: self.max_pos = self.pos self.last_token = self.cur_token return self.cur_token class CacheLexer(Lexer): r""" CacheLexer(word_bounded, compile_options) CacheLexer is a TPG lexer: - based on Lexer - doesn't use named group regular expressions (slower but not limited to 100 tokens) - select the longuest match so the order of token definitions doesn't mater - the complete token list is built before parsing (faster with very ambigous grammars but needs more memory) Attributes: tokens : list (name, regexp, value, is_real_token) name is a token name regexp is the regular expression of the token value is a function that computes the value of a token from its text is_real_token is a boleean. True for tokens, False for separators cache : token list Once the lexer is started more attributes are defined: input : input string being parsed max_pos : maximum position reached in the input string last_token : last token reached in the input string pos : position in the input string of the current token line : line of the current token column : column of the current token cur_token : current token """ def __init__(self, wb, compile_options): Lexer.__init__(self, wb, compile_options) def start(self, input): """ start a lexical analysis Parameters: input : input string to be parsed """ self.cache = [] self.input = input self.max_pos = 0 self.last_token = None self.back(None) while True: token = Lexer.next(self) token.index = len(self.cache) self.cache.append(token) if isinstance(token, EOFToken): break self.max_pos = 0 self.last_token = None self.back(None) self.next() def next(self): """ return the next token Tokens are Token instances. Separators are ignored. """ if self.cur_token is None: index = 0 else: index = self.cur_token.index+1 token = self.cache[index] self.pos = token.stop self.line, self.column = token.line, token.column self.cur_token = token if self.pos > self.max_pos: self.max_pos = self.pos self.last_token = self.cur_token return self.cur_token class ContextSensitiveLexer(LexerOptions): r""" ContextSensitiveLexer(word_bounded, compile_options) ContextSensitiveLexer is a TPG lexer: - context sensitive means that each regular expression is matched when required by the parser. Different tokens can be found at the same position if the parser uses different grammar rules. 
Attributes: tokens : dictionnary name -> (regexp, value) name is a token name regexp is the regular expression of the token value is a function that computes the value of a token from its text separators : list (name, regexp, value) name is a token name regexp is the regular expression of the token value is a function that computes the value of a token from its text Once the lexer is started more attributes are defined: input : input string being parsed max_pos : maximum position reached in the input string last_token : last token reached in the input string pos : position in the input string of the current token line : line of the current token column : column of the current token cur_token : current token """ def __init__(self, wb, compile_options): LexerOptions.__init__(self, wb, compile_options) self.tokens = {} # name -> (regexp, value) self.separators = [] # [(name, regexp, value)] def def_token(self, name, expr, value=_id): """ add a new token to the lexer Parameters: name : name of the token expr : regular expression of the token value : function to compute the token value from its text The default for value is the identity function. If value is not callable it is returned whatever the text of the token. """ if not callable(value): value = lambda _, value=value: value if name not in self.tokens and name not in self.separators: self.tokens[name] = self.re_compile(self.word_bounded(expr)), value else: raise SemanticError("Duplicate token definition (%s)"%name) def def_separator(self, name, expr, value=_id): """ add a new separator to the lexer Parameters: name : name of the separator expr : regular expression of the separator value : function to compute the separator value from its text The default for value is the identity function. If value is not callable it is returned whatever the text of the separator. Note that separator values are ignored. 
""" if not callable(value): value = lambda _, value=value: value if name not in self.tokens and name not in self.separators: self.separators.append((name, self.re_compile(self.word_bounded(expr)), value)) else: raise SemanticError("Duplicate token definition (%s)"%name) def start(self, input): """ start a lexical analysis Parameters: input : input string to be parsed """ self.input = input self.max_pos = 0 self.last_token = None self.back(None) def eof(self): """ True if the current position of the lexer is the end of the input string """ return self.pos >= len(self.input) def back(self, token): """ change the current token to token (used for backtracking) """ if token is None: self.pos = 0 self.line, self.column = 1, 1 self.cur_token = SOFToken() else: self.pos = token.stop self.line, self.column = token.end_line, token.end_column self.cur_token = token self.eat_separators() self.cur_token.next_start = self.pos def eat_separators(self): """ skip separators in the input string from the current position """ done = False while not done: done = True for name, regexp, value in self.separators: sep = regexp.match(self.input, self.pos) if sep: start, stop = sep.span() text = self.input[start:stop] value = value(text) self.pos = stop if '\n' in text: self.line += text.count('\n') self.column = len(text) - text.rfind('\n') else: self.column += len(text) done = False def eat(self, name): """ return the next token value if it matches the expected token name """ regexp, value = self.tokens[name] tok = regexp.match(self.input, self.pos) if tok is None: raise WrongToken else: if self.cur_token is None: prev_stop = 0 else: prev_stop = self.cur_token.stop start, stop = tok.span() text = self.input[start:stop] value = value(text) self.pos = stop tok_line, tok_column = self.line, self.column if '\n' in text: self.line += text.count('\n') self.column = len(text) - text.rfind('\n') else: self.column += len(text) self.cur_token = Token(name, text, value, tok_line, tok_column, self.line, self.column, start, stop, prev_stop) if self.pos > self.max_pos: self.max_pos = self.pos self.last_token = self.cur_token self.eat_separators() self.cur_token.next_start = self.pos return self.cur_token def token(self): """ return the current token """ return self.cur_token def extract(self, start, stop): """ extract text from the input string Parameters: start : the token from which the extraction starts stop : the token where the extraction stops """ start = start and start.next_start or 0 stop = stop and stop.stop or -1 return self.input[start:stop] class Token: """ Token(name, text, value, line, column, end_line, end_column, start, stop, prev_stop) Token object used by lexers Attributes: name : name of the token text : text matched by the regular expression value : value computed from the text line : line of the token in the input string column : column of the token in the input string end_line : line of the end of the token end_column : column of the end of the token start : position of the start in the input string of the token stop : position of the end in the input string of the token prev_stop : position of the end of the previous token """ def __init__(self, name, text, value, line, column, end_line, end_column, start, stop, prev_stop): self.name = name self.text = text self.value = value self.line, self.column = line, column self.end_line, self.end_column = end_line, end_column self.start, self.stop = start, stop self.prev_stop = prev_stop ####################################################### # row is deprecated : 
to be removed in a future version def warn_row(self): import warnings; warnings.warn("row is deprecated, please use column instead", category=DeprecationWarning) def get_row(self): self.warn_row(); return self.column def set_row(self): self.warn_row(); return self.column def del_row(self): self.warn_row(); del self.column row = property(fget=get_row, fset=set_row, fdel=del_row, doc="row") ####################################################### def match(self, name): """ return True is the token name is the name of the expected token Parameters: name : name of the expected token """ return name == self.name def __str__(self): return "line %s, column %s: %s %s %s"%(self.line, self.column, self.name, self.text, self.value) class EOFToken(Token): """ EOFToken(line, column, pos, prev_stop) Token for the end of file (end of the input string). EOFToken is a Token object. Attributes: name : name of the token text : text matched by the regular expression value : value computed from the text line : line of the token in the input string column : column of the token in the input string end_line : line of the end of the token end_column : column of the end of the token start : position of the start in the input string of the token stop : position of the end in the input string of the token prev_stop : position of the end of the previous token """ def __init__(self, line, column, pos, prev_stop): Token.__init__(self, "EOF", "EOF", None, line, column, line, column, pos, pos, prev_stop) class SOFToken(Token): """ SOFToken() Token for the start of file (start of the input string). SOFToken is a Token object. Attributes: name : name of the token text : text matched by the regular expression value : value computed from the text line : line of the token in the input string column : column of the token in the input string end_line : line of the end of the token end_column : column of the end of the token start : position of the start in the input string of the token stop : position of the end in the input string of the token prev_stop : position of the end of the previous token """ def __init__(self): Token.__init__(self, "SOF", "SOF", None, 1, 1, 1, 1, 0, 0, 0) class Py: def __init__(self, level=0): frame = sys._getframe(1+level) self.globals = frame.f_globals self.locals = frame.f_locals def __getitem__(self, item): return eval(item%self, self.globals, self.locals) class ParserMetaClass(type): """ ParserMetaClass is the metaclass of Parser objects. When a ParserMetaClass class in defined, its doc string should contain a grammar. This grammar is parsed by TPGParser and the generated code is added to the class. If the class doesn't have a doc string, nothing is generated """ def __init__(cls, name, bases, dict): super(ParserMetaClass, cls).__init__(name, bases, dict) try: grammar = dict['__doc__'] except KeyError: pass else: parser = TPGParser(sys._getframe(1).f_globals) for attribute, source, code in parser(grammar): setattr(cls, attribute, code) class Parser: # Parser is the base class for parsers. # # This class can not have a doc string otherwise it would be considered as a grammar. # The metaclass of this class is ParserMetaClass. # # Attributes: # lexer : lexer build from the grammar # # Methods added to the generated parsers: # init_lexer(self) : return a lexer object to scan the tokens defined by the grammar # : each rule is translated into a method with the same name __metaclass__ = ParserMetaClass def __init__(self): """ Parser is the base class for parsers. 
This class can not have a doc string otherwise it would be considered as a grammar. The metaclass of this class is ParserMetaClass. Attributes: lexer : lexer build from the grammar Methods added to the generated parsers: init_lexer(self) : return a lexer object to scan the tokens defined by the grammar : each rule is translated into a method with the same name """ self.lexer = self.init_lexer() def eat(self, name): """ eat the current token if it matches the expected token Parameters: name : name of the expected token """ token = self.lexer.token() if token.match(name): self.lexer.next() return token.value else: raise WrongToken def eatCSL(self, name): """ eat the current token if it matches the expected token This method replaces eat for context sensitive lexers. Parameters: name : name of the expected token """ token = self.lexer.eat(name) return token.value def __call__(self, input, *args, **kws): """ parse a string starting from the default axiom The default axiom is START. Parameters: input : input string to parse *args : argument list to pass to START **kws : argument dictionnary to pass to START """ return self.parse('START', input, *args, **kws) def parse(self, axiom, input, *args, **kws): """ parse a string starting from a given axiom Parameters: axiom : rule name where the parser starts input : input string to parse *args : argument list to pass to START **kws : argument dictionnary to pass to START """ try: self.lexer.start(input) value = getattr(self, axiom)(*args, **kws) if not self.lexer.eof(): raise WrongToken except WrongToken: if self.lexer.last_token is None: last_token = "" line, column = 1, 1 else: last_token = self.lexer.last_token.text line, column = self.lexer.last_token.line, self.lexer.last_token.column raise SyntacticError((line, column), "Syntax error near %s"%last_token) return value def line(self, token=None): """ return the line number of a token Parameters: token : token object. If None, the current token line is returned. """ if token is None: token = self.lexer.token() if token is None: return 1 return token.line def column(self, token=None): """ return the column number of a token Parameters: token : token object. If None, the current token column is returned. """ if token is None: token =self.lexer.token() if token is None: return 1 return token.column ####################################################### # row is deprecated : to be removed in a future version def row(self, token=None): import warnings warnings.warn("row is deprecated, please use column instead.", category=DeprecationWarning) return self.column(token) ####################################################### def mark(self): """ return the current token This can be used to get the line or column number of a token or to extract text between two tokens. """ return self.lexer.token() def extract(self, start, stop): """ return the text found between two tokens Parameters : start : token object as returned by mark stop : token object as returned by mark """ return self.lexer.extract(start, stop) def check(self, cond): """ check a condition and backtrack when it is False Parameters: cond : condition to be checked """ if not cond: raise WrongToken return cond def error(self, msg): """ stop the parser and raise a SemanticError exception Parameters: msg : error message to raise """ raise SemanticError(msg) class VerboseParser(Parser): # VerboseParser is the base class for debugging parsers. # # This class can not have a doc string otherwise it would be considered as a grammar. 
# The metaclass of this class is ParserMetaClass. # It extends the Parser class to log the activity of the lexer. # # Attributes: # lexer : lexer build from the grammar # verbose : level of information # 0 : no information # 1 : print tokens successfully matched # 2 : print tokens matched and not matched # # Methods added to the generated parsers: # init_lexer(self) : return a lexer object to scan the tokens defined by the grammar # : each rule is translated into a method with the same name verbose = 1 def __init__(self): """ VerboseParser is the base class for debugging parsers. This class can not have a doc string otherwise it would be considered as a grammar. The metaclass of this class is ParserMetaClass. It extends the Parser class to log the activity of the lexer. Attributes: lexer : lexer build from the grammar verbose : level of information 0 : no information 1 : print tokens successfully matched 2 : print tokens matched and not matched Methods added to the generated parsers: init_lexer(self) : return a lexer object to scan the tokens defined by the grammar : each rule is translated into a method with the same name """ Parser.__init__(self) self.eatcnt = 0 def eat(self, name): """ eat the current token if it matches the expected token Parameters: name : name of the expected token """ self.eatcnt += 1 token = self.lexer.token() try: value = Parser.eat(self, name) if self.verbose >= 1: print self.token_info(token, "==", name) return value except WrongToken: if self.verbose >= 2: print self.token_info(token, "!=", name) raise def eatCSL(self, name): """ eat the current token if it matches the expected token This method replaces eat for context sensitive lexers. Parameters: name : name of the expected token """ self.eatcnt += 1 try: value = Parser.eatCSL(self, name) if self.verbose >= 1: token = self.lexer.token() print self.token_info(token, "==", name) return value except WrongToken: if self.verbose >= 2: token = Token("???", self.lexer.input[self.lexer.pos:self.lexer.pos+10].replace('\n', ' '), "???", self.lexer.line, self.lexer.column, self.lexer.line, self.lexer.column, self.lexer.pos, self.lexer.pos, self.lexer.pos) print self.token_info(token, "!=", name) raise def parse(self, axiom, input, *args, **kws): """ parse a string starting from a given axiom Parameters: axiom : rule name where the parser starts input : input string to parse *args : argument list to pass to START **kws : argument dictionnary to pass to START """ self.axiom = axiom return Parser.parse(self, axiom, input, *args, **kws) def token_info(self, token, op, expected): """ return information about a token Parameters: token : token read by the lexer op : result of the comparison made by the lexer (== or !=) expected : name of the expected token """ eatcnt = self.eatcnt callernames = [] stackdepth = 0 name = None while name != self.axiom: stackdepth += 1 name = sys._getframe(stackdepth+1).f_code.co_name if len(callernames) < 10: callernames.insert(0, name) callernames = '.'.join(callernames) found = "(%d,%d) %s %s"%(token.line, token.column, token.name, token.text) return "[%3d][%2d]%s: %s %s %s"%(eatcnt, stackdepth, callernames, found, op, expected) blank_line_re = re.compile("^\s*$") indent_re = re.compile("^\s*") class tpg: """ This class contains some TPG classes to make the parsers usable inside and outside the tpg module """ NamedGroupLexer = NamedGroupLexer Lexer = Lexer CacheNamedGroupLexer = CacheNamedGroupLexer CacheLexer = CacheLexer ContextSensitiveLexer = ContextSensitiveLexer Parser = Parser 
WrongToken = WrongToken re = re class TPGParser(tpg.Parser): r""" # This class parses TPG grammar # and generate the Python source and compiled code for the parser set lexer = NamedGroupLexer set lexer_verbose separator spaces '\s+' ; separator comment '\#.*' ; token string ''' "{3} [^"\\]* (?: (?: \\. | "(?!"") ) [^"\\]* )* "{3} | " [^"\\\n]* (?: \\. [^"\\\n]* )* " | '{3} [^'\\]* (?: (?: \\. | '(?!'') ) [^'\\]* )* '{3} | ' [^'\\\n]* (?: \\. [^'\\\n]* )* ' ''' ; token code ''' \{\{ ( \}? [^\}]+ )* \}\} | \$ [^\$\n]* \$ | \$ .*\n ( [ \t]* \$ .*\n )* ''' $ self.Code token ident '\w+' ; token lcbra '\{' ; token rcbra '\}' ; token star2 '\*\*' ; token star '\*' ; START/$self.gen(options, tokens, rules)$ -> OPTIONS/options TOKENS/tokens RULES/rules ; OPTIONS/options -> $ options = self.Options(self) ( 'set' ident/name ( '=' ident/value $ options.set(name, value) | $ options.set(name, 'True') ) )* ; TOKENS/ts -> $ ts = [] ( TOKEN/t $ ts.append(t) )* ; TOKEN/$token_type(name, expr, code)$ -> ( 'separator' $ token_type = self.DefSeparator | 'token' $ token_type = self.DefToken ) ident/name ':'? @t string/expr $ self.re_check(expr, t) ( PY_EXPR/code ';'? | ';' $ code = None ) ; RULES/rs -> $ rs = self.Rules() ( RULE/r $ rs.append(r) )* ; RULE/$self.Rule(head, body)$ -> HEAD/head '->' OR_EXPR/body ';' ; #HEAD/$self.Symbol(name, args, ret)$ -> ident/name OPT_ARGS/args RET/ret ; HEAD/$self.Symbol(name, args, ret)$ -> ident/name OPT_ARGS/args RET<$self.PY_Ident(name)$>/ret ; OR_EXPR/$self.balance(or_expr)$ -> AND_EXPR/a $ or_expr = [a] ( check $ not or_expr[-1].empty() $ '\|' AND_EXPR/a $ or_expr.append(a) )* ; AND_EXPR/$and_expr$ -> $ and_expr = self.And() ( ATOM_EXPR/a REP/a $ and_expr.append(a) )* ; ATOM_EXPR/a -> SYMBOL/a | INLINE_TOKEN/a | @t code/a $ self.code_check(a, t) | '\(' OR_EXPR/a '\)' | 'check' PY_EXPR/cond $ a = self.Check(cond) | 'error' PY_EXPR/msg $ a = self.Error(msg) | '@' PY_EXPR/mark $ a = self.Mark(mark) ; REP/a -> ( '\*' $ a = self.Rep(a, 0, None) | '\+' $ a = self.Rep(a, 1, None) | '\?' $ a = self.Rep(a, 0, 1) | '\{' ( PY_EXPR/min | $ min = self.PY_Ident("0") $ ) ( ',' ( PY_EXPR/max | $ max = self.PY_Ident("None") $ ) | $ max = min $ ) '\}' $ a = self.Rep(a, min, max) )? ; #SYMBOL/$self.Symbol(name, args, ret)$ -> ident/name OPT_ARGS/args RET/ret ; SYMBOL/$self.Symbol(name, args, ret)$ -> ident/name OPT_ARGS/args RET<$self.PY_Ident(name)$>/ret ; INLINE_TOKEN/$self.InlineToken(expr, ret)$ -> @t string/expr $ self.re_check(expr, t) RET/ret ; OPT_ARGS/args -> ARGS/args | $ args = self.Args() $ ; ARGS/args -> '<' $ args = self.Args() ( ARG/arg $ args.append(arg) ( ',' ARG/arg $ args.append(arg) )* ','? )? '>' ; ARG/a -> ident/name '=' PY_EXPR/a $ a = self.PY_KeywordArgument(name, a) | PY_EXPR/a $ a = self.PY_PositionArgument(a) | '\*' ident/name $ a = self.PY_PositionArgumentList(name) | '\*\*' ident/name $ a = self.PY_KeywordArgumentList(name) ; #RET/ret -> '/' PY_EXPR/ret | $ ret = None $ ; RET/ret -> ( '/' PY_EXPR/ret )? 
; PY_EXPR/expr -> ident/name $ expr = self.PY_Ident(name) | string/st $ expr = self.PY_Ident(st) | code/expr | ARGS/expr ; """ def __init__(self, _globals=None): Parser.__init__(self) if _globals is not None: self.env = _globals else: self.env = {} def re_check(self, expr, tok): try: sre_parse.parse(eval('r'+expr)) except Exception, e: raise LexicalError((tok.line, tok.column), "Invalid regular expression: %s (%s)"%(expr, e)) def code_check(self, code, tok): try: parser.suite(code.code) except Exception, e: erroneous_code = "\n".join([ "%2d: %s"%(i+1, l) for (i, l) in enumerate(code.code.splitlines()) ]) raise LexicalError((tok.line, tok.column), "Invalid Python code (%s): \n%s"%(e, erroneous_code)) class Options: option_dict = { # Option name Accepted values Default value 'lexer': ({'NamedGroupLexer': NamedGroupLexer, 'Lexer': Lexer, 'CacheNamedGroupLexer': CacheNamedGroupLexer, 'CacheLexer': CacheLexer, 'ContextSensitiveLexer': ContextSensitiveLexer, }, 'NamedGroupLexer'), 'word_boundary': ({'True': True, 'False': False}, 'True'), #'indent': ({'True': True, 'False': False}, 'False'), 'lexer_ignorecase': ({'True': "IGNORECASE", 'False': False}, 'False'), 'lexer_locale': ({'True': "LOCALE", 'False': False}, 'False'), 'lexer_multiline': ({'True': "MULTILINE", 'False': False}, 'False'), 'lexer_dotall': ({'True': "DOTALL", 'False': False}, 'False'), 'lexer_verbose': ({'True': "VERBOSE", 'False': False}, 'False'), 'lexer_unicode': ({'True': "UNICODE", 'False': False}, 'False'), } def __init__(self, parser): self.parser = parser for name, (values, default) in TPGParser.Options.option_dict.items(): self.set(name, default) def set(self, name, value): try: options, default = TPGParser.Options.option_dict[name] except KeyError: opts = TPGParser.Options.option_dict.keys() opts.sort() self.parser.error("Unknown option (%s). Valid options are %s"%(name, ', '.join(opts))) try: value = options[value] except KeyError: values = options.keys() values.sort() self.parser.error("Unknown value (%s). 
Valid values for %s are %s"%(value, name, ', '.join(values))) setattr(self, name, value) def lexer_compile_options(self): options = [ self.lexer_ignorecase, self.lexer_locale, self.lexer_multiline, self.lexer_dotall, self.lexer_verbose, self.lexer_unicode, ] return "+".join([ "tpg.re.%s"%opt for opt in options if opt ]) or 0 class Empty: def empty(self): return True class NotEmpty: def empty(self): return False class Code(NotEmpty): def __init__(self, code): if code.startswith('$'): if code.endswith('$'): lines = code[1:-1].splitlines() else: lines = [line.split('$', 1)[1] for line in code.splitlines()] elif code.startswith('{{') and code.endswith('}}'): lines = code[2:-2].splitlines() else: raise WrongToken while lines and blank_line_re.match(lines[0]): lines.pop(0) while lines and blank_line_re.match(lines[-1]): lines.pop(-1) if lines: indents = [len(indent_re.match(line).group(0)) for line in lines] indent = indents[0] if min(indents) < indent: # Indentation incorrecte raise WrongToken lines = [line[indent:] for line in lines] self.code = "".join([line+"\n" for line in lines]) def get_inline_tokens(self): return yield None def gen_code(self, indent=None, counters=None, pos=None): if indent is None: return self.code.strip() else: return [indent+line for line in self.code.splitlines()] def links_symbols_to_tokens(self, tokens): pass def gen_doc(self, parent): return "" class DefToken: def_method = "def_token" def __init__(self, name, expr, code=None): self.name = name self.expr = expr if code is not None and code.gen_code().count('\n') > 1: raise WrongToken self.code = code def gen_def(self): expr = self.expr if self.code is None: return "lexer.%s('%s', r%s)"%(self.def_method, self.name, expr) else: code = self.code.gen_code().strip() return "lexer.%s('%s', r%s, %s)"%(self.def_method, self.name, expr, code) class DefSeparator(DefToken): def_method = "def_separator" class Rules(list): def get_inline_tokens(self): for rule in self: for token in rule.get_inline_tokens(): yield token def links_symbols_to_tokens(self, tokens): for rule in self: rule.links_symbols_to_tokens(tokens) def gen_code(self): for rule in self: yield rule.gen_code() class Rule: class Counters(dict): def __call__(self, name): n = self.get(name, 1) self[name] = n+1 return "_%s%s"%(name, n) def __init__(self, head, body): self.head = head self.body = body def get_inline_tokens(self): for token in self.body.get_inline_tokens(): yield token def links_symbols_to_tokens(self, tokens): if self.head.name in tokens: raise SemanticError("%s is both a token and a symbol"%self.head.name) else: self.body.links_symbols_to_tokens(tokens) def gen_code(self): counters = self.Counters() return self.head.name, [ self.head.gen_def(), tab + 'r""" %s -> %s """'%(self.head.gen_doc(self), self.body.gen_doc(self)), self.head.gen_init_ret(tab), self.body.gen_code(tab, counters, None), self.head.gen_ret(tab), ] class Symbol(NotEmpty): def __init__(self, name, args, ret): self.name = name self.args = args self.ret = ret def get_inline_tokens(self): return yield None def links_symbols_to_tokens(self, tokens): self.token = tokens.get(self.name, None) if self.token is not None and self.args: raise SemanticError("Token %s can not have arguments"%self.name) def gen_def(self): return "def %s(self, %s):"%(self.name, self.args.gen_code()) def gen_init_ret(self, indent): return self.ret.gen_code() == self.name and indent + "%s = None"%(self.name) or () def gen_ret(self, indent): return self.ret and indent + "return %s"%self.ret.gen_code() or () def 
gen_code(self, indent, counters, pos): if self.token is not None: if self.ret is not None: return indent + "%s = self.eat('%s')"%(self.ret.gen_code(), self.token.name) else: return indent + "self.eat('%s')"%(self.token.name) else: if self.ret is not None: return indent + "%s = self.%s(%s)"%(self.ret.gen_code(), self.name, self.args.gen_code()) else: return indent + "self.%s(%s)"%(self.name, self.args.gen_code()) def gen_doc(self, parent): return self.name class InlineToken(NotEmpty): def __init__(self, expr, ret): self.expr = expr self.ret = ret def get_inline_tokens(self): yield self def set_explicit_token(self, token): self.explicit_token = token def gen_def(self): return self.explicit_token.gen_def() def links_symbols_to_tokens(self, tokens): pass def gen_code(self, indent, counters, pos): if self.ret is not None: return indent + "%s = self.eat('%s') # %s"%(self.ret.gen_code(), self.explicit_token.name, self.expr) else: return indent + "self.eat('%s') # %s"%(self.explicit_token.name, self.expr) def gen_doc(self, parent): return self.expr class Args(list): def gen_code(self): return ", ".join([a.gen_code() for a in self]) class PY_PositionArgument: def __init__(self, arg): self.arg = arg def gen_code(self): return self.arg.gen_code() class PY_KeywordArgument: def __init__(self, name, arg): self.name = name self.arg = arg def gen_code(self): return "%s=%s"%(self.name, self.arg.gen_code()) class PY_PositionArgumentList: def __init__(self, name): self.name = name def gen_code(self): return "*%s"%self.name class PY_KeywordArgumentList: def __init__(self, name): self.name = name def gen_code(self): return "**%s"%self.name class And(list): def empty(self): for a in self: if not a.empty(): return False return True def get_inline_tokens(self): for a in self: for token in a.get_inline_tokens(): yield token def links_symbols_to_tokens(self, tokens): for a in self: a.links_symbols_to_tokens(tokens) def gen_code(self, indent, counters, pos): return self and [ self[0].gen_code(indent, counters, pos), [a.gen_code(indent, counters, None) for a in self[1:]], ] def gen_doc(self, parent): docs = [] for a in self: doc = a.gen_doc(self) if doc: docs.append(doc) return " ".join(docs) class Or(NotEmpty): def __init__(self, a, b): self.a = a self.b = b def get_inline_tokens(self): for token in self.a.get_inline_tokens(): yield token for token in self.b.get_inline_tokens(): yield token def links_symbols_to_tokens(self, tokens): self.a.links_symbols_to_tokens(tokens) self.b.links_symbols_to_tokens(tokens) def gen_code(self, indent, counters, pos): p = pos or counters("p") return [ pos is None and indent + "%s = self.lexer.token()"%p or (), indent + "try:", self.a.gen_code(indent+tab, counters, p), indent + "except tpg.WrongToken:", indent + tab + "self.lexer.back(%s)"%p, self.b.gen_code(indent+tab, counters, p), ] def gen_doc(self, parent): doc = "%s | %s"%(self.a.gen_doc(self), self.b.gen_doc(self)) if isinstance(parent, TPGParser.And) and len(parent) > 1: doc = "(%s)"%doc return doc def balance(self, xs): if len(xs) == 1: return xs[0] else: m = len(xs)//2 return self.Or(self.balance(xs[:m]), self.balance(xs[m:])) class Rep(NotEmpty): def __init__(self, a, min, max): self.a = a self.min = min self.max = max def get_inline_tokens(self): for token in self.a.get_inline_tokens(): yield token def links_symbols_to_tokens(self, tokens): self.a.links_symbols_to_tokens(tokens) def gen_code(self, indent, counters, pos): # A? 
if (self.min, self.max) == (0, 1): p = pos or counters("p") return [ pos is None and indent + "%s = self.lexer.token()"%p or (), indent + "try:", self.a.gen_code(indent+tab, counters, p), indent + "except tpg.WrongToken:", indent + tab + "self.lexer.back(%s)"%p, ] # A* elif (self.min, self.max) == (0, None): p = pos or counters("p") return [ indent + "while True:", indent + tab + "%s = self.lexer.token()"%p, indent + tab + "try:", self.a.gen_code(indent+tab+tab, counters, p), indent + tab + "except tpg.WrongToken:", indent + tab + tab + "self.lexer.back(%s)"%p, indent + tab + tab + "break", ] # A+ elif (self.min, self.max) == (1, None): p = pos or counters("p") n = counters("n") return [ indent + "%s = 0"%n, indent + "while True:", indent + tab + "%s = self.lexer.token()"%p, indent + tab + "try:", self.a.gen_code(indent+tab+tab, counters, p), indent + tab + tab + "%s += 1"%n, indent + tab + "except tpg.WrongToken:", indent + tab + tab + "if %s < 1: raise"%n, indent + tab + tab + "self.lexer.back(%s)"%p, indent + tab + tab + "break", ] # A{min, max} else: p = pos or counters("p") n = counters("n") min = self.min.gen_code() max = self.max.gen_code() return [ indent + "%s = 0"%n, indent + "while %s:"%(max=="None" and "True" or "%s < %s"%(n, max)), indent + tab + "%s = self.lexer.token()"%p, indent + tab + "try:", self.a.gen_code(indent+tab+tab, counters, p), indent + tab + tab + "%s += 1"%n, indent + tab + "except tpg.WrongToken:", indent + tab + tab + "if %s < %s: raise"%(n, min), indent + tab + tab + "self.lexer.back(%s)"%p, indent + tab + tab + "break", ] def gen_doc(self, parent): doc = self.a.gen_doc(self) if isinstance(self.a, (TPGParser.And, TPGParser.Or)): doc = "(%s)"%doc if (self.min, self.max) == (0, 1): rep = "?" elif (self.min, self.max) == (0, None): rep = "*" elif (self.min, self.max) == (1, None): rep = "+" else: min = self.min.gen_code() max = self.max.gen_code() if min == max: rep = "{%s}"%min else: if min == "0": min = "" if max == "None": max = "" rep = "{%s,%s}"%(min, max) return "%s%s"%(doc, rep) class Check(NotEmpty): def __init__(self, cond): self.cond = cond def get_inline_tokens(self): return yield None def links_symbols_to_tokens(self, tokens): pass def gen_doc(self, parent): return "" def gen_code(self, indent, counters, pos): return indent + "self.check(%s)"%self.cond.gen_code() class Error(NotEmpty): def __init__(self, msg): self.msg = msg def get_inline_tokens(self): return yield None def links_symbols_to_tokens(self, tokens): pass def gen_doc(self, parent): return "" def gen_code(self, indent, counters, pos): return indent + "self.error(%s)"%self.msg.gen_code() class Mark(NotEmpty): def __init__(self, mark): self.mark = mark def get_inline_tokens(self): return yield None def links_symbols_to_tokens(self, tokens): pass def gen_doc(self, parent): return "" def gen_code(self, indent, counters, pos): return indent + "%s = self.mark()"%self.mark.gen_code() class PY_Ident(str): def gen_code(self): return str(self) def flatten_nl(self, *lines): for sublines in lines: if isinstance(sublines, (list, tuple)): for line in self.flatten_nl(*sublines): yield line else: yield sublines + "\n" def make_code(self, attribute, *source): source = "".join(self.flatten_nl(*source)) local_namespace = {} exec source in self.env, local_namespace code = local_namespace[attribute] return attribute, source, code def gen(self, options, tokens, rules): # building the lexer lexer = options.lexer word_bounded = options.word_boundary lexer_options = options.lexer_compile_options() 
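        # The init_lexer method generated below looks roughly like this
        # (informal sketch for a grammar using the default NamedGroupLexer,
        # one `number' token and a `spaces' separator; names are illustrative):
        #
        #     def init_lexer(self):
        #         lexer = tpg.NamedGroupLexer(True, 0)
        #         lexer.def_token('number', r'\d+')
        #         lexer.def_separator('spaces', r'\s+')
        #         return lexer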
        explicit_tokens = {}
        for token in tokens:
            explicit_tokens[token.expr[1:-1]] = token
        token_number = 0
        inline_tokens = []
        for token in rules.get_inline_tokens():
            try:
                # If the token was already defined just link it to the first definition
                token.set_explicit_token(explicit_tokens[token.expr[1:-1]])
            except KeyError:
                # Otherwise create an explicit definition for the new inline token
                token_number += 1
                token.set_explicit_token(self.DefToken("_tok_%s"%token_number, token.expr))
                explicit_tokens[token.expr[1:-1]] = token.explicit_token
                inline_tokens.append(token)
        yield self.make_code("init_lexer",
            "def init_lexer(self):",
            lexer is ContextSensitiveLexer and [tab + "self.eat = self.eatCSL"] or (),
            tab + "lexer = tpg.%s(%s, %s)"%(lexer.__name__, word_bounded, lexer_options),
            [ tab + tok.gen_def() for tok in inline_tokens ],
            [ tab + tok.gen_def() for tok in tokens ],
            tab + "return lexer",
        )
        # building the parser
        tokens_from_name = {}
        for token in inline_tokens:
            tokens_from_name[token.explicit_token.name] = token
        for token in tokens:
            tokens_from_name[token.name] = token
        rules.links_symbols_to_tokens(tokens_from_name)
        for name, code in rules.gen_code():
            yield self.make_code(name, *code)
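
# ---------------------------------------------------------------------------
# Usage example (informal sketch, kept as a comment so that importing this
# module is unchanged).  The grammar below is a tiny calculator; the class
# and rule names are only illustrative:
#
#   import tpg
#
#   class Calc(tpg.Parser):
#       r"""
#       separator spaces '\s+' ;
#       token number '\d+' int ;
#
#       START/e -> Term/e ( '\+' Term/t  $ e = e + t $ )* ;
#       Term/f  -> number/f ( '\*' number/n  $ f = f * n $ )* ;
#       """
#
#   calc = Calc()
#   print calc("1 + 2 * 3")     # parsing starts at the START rule and prints 7
# ---------------------------------------------------------------------------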