"""Object-oriented tag-table generator objects The objectgenerator module is the core of the SimpleParse system, the various element token classes defined here implement transitions from EBNF-style abstractions into the low-level (assembly-like) instructions to the TextTools engine. Each class within the module is a sub-class of ElementToken, which provides a number of common facilities, the most obvious of which is the permute method, which takes care of the negative, optional, and repeating flags for the normal case (with character ranges and literals being non-normal). """ from simpleparse.stt.TextTools.TextTools import * ### Direct use of BMS is deprecated now... try: TextSearch except NameError: TextSearch = BMS from simpleparse.error import ParserSyntaxError import copy class ElementToken: """Abstract base class for all ElementTokens Common Attributes: negative -- the element token should match a character if the "base" definition would not match at the current position optional -- the element token will match even if the base definition would not match at the current position repeating -- if the element is successfully matched, attempt to match it again. lookahead -- if true, the scanning position of the engine will be reset after the element matches errorOnFail -- if true, the engine will call the object stored in errorOnFail as a text- matching object iff the element token fails to match. This is used to signal SyntaxErrors. Attributes only used for top-level Productions: report -- if true, the production's results will be added to the result tree expanded -- if true, the production's children's results will be added to the result tree but the production's own result will be ignored """ negative = 0 optional = 0 repeating = 0 report = 1 # note that optional and errorOnFail are mutually exclusive errorOnFail = None # any item may be marked as expanded, # which says that it's a top-level declaration # and that links to it should automatically expand # as if the name wasn't present... expanded = 0 lookahead = 0 def __init__( self, **namedarguments ): """Initialize the object with named attributes This method simply takes the named attributes and updates the object's dictionary with them """ self.__dict__.update( namedarguments ) def toParser( self, generator, noReport=0 ): """Abstract interface for implementing the conversion to a text-tools table generator -- an instance of generator.Generator which provides various facilities for discovering other productions. noReport -- if true, we're being called recursively for a terminal grammar fragment where one of our parents has explicitly suppressed all reporting. This method is called by the generator or by another element-token's toParser method. """ raise NotImplementedError( '''Element token generator abstract function called''' ) def permute( self, basetable ): '''Given a positive, required, non-repeating table, convert to appropriately configured table This method applies generic logic for applying the operational flags to a basic recipe for an element. It is normally called from the elements-token's own toParser method. ''' flags = 0 if self.lookahead: flags = flags + LookAhead assert len(basetable) == 3, '''Attempt to permute a base table that already has fail flag set, can only permute unadorned tables''' if self.negative: # negative "matches" if it fails # we add in the flags while we're at it... basetable = (None, SubTable+flags, ( basetable + (1,2), (None, EOF, Here,2,1), # if we hit eof, this didn't match, otherwise, we matched (None, Fail, Here),# either hit eof or matched the client (None,Skip,1), )) elif flags: # unpack, add the flags, and repack tag, command, arg = basetable basetable = ( tag, command+flags, arg) if self.repeating: ### There are a number of problems with repetition that we'd like to solve ### via recursive table calls, but those are very expensive in the current ### implementation, so we need to use something a little more hacky... if self.optional: return [ ## this would be the "simplistic" implementation... ## basetable + (1,0) ## it doesn't work because of cases ## where all-optional children "succeed" without consuming ## when within a repeating parent ## the EOF test isn't enough to fix the problem, ## as it's only checking a common case, not the underlying failure basetable +(2,1), # fail, done, succeed, check for eof and if not, try matching again # if we hit eof, no chance of further matches, # consider ourselves done (None, EOF, Here,-1,1), ] elif self.errorOnFail: return [ basetable+(1,2), (None, Call, self.errorOnFail), # as for optional... basetable +(2,1), (None, EOF, Here,-1,1), ] else: return [ basetable, # as for optional... basetable +(2,1), (None, EOF, Here,-1,1), ] else: # single if self.optional: return [ basetable +(1,1) ] elif self.errorOnFail: return [ basetable+(1,2), (None, Call, self.errorOnFail), ] else: # not optional return [ basetable ] def __repr__( self): """Return a readily recognisable version of ourself""" from simpleparse import printers return printers.asObject( self ) def terminal (self, generator): """Determine if this element is terminal for the generator""" return 0 class Literal( ElementToken ): """Literal string value to be matched Literals are one of the most common elements within any grammar. The implementation tries to use the most efficient mechanism available for matching/searching for a literal value, so the Literal class does not use the permute method, instead defining explicit parsing methodologies for each flag and value combination Literals in the SimpleParse EBNF grammar are defined like so: "test", "test"?, "test"*, "test"+ -"test", -"test"?, -"test"*, -"test"+ Attributes: value -- a string storing the literal's value Notes: Currently we don't support Unicode literals See also: CILiteral -- case-insensitive Literal values """ value = "" def toParser( self, generator=None, noReport=0 ): """Create the parser for the element token""" flags = 0 if self.lookahead: flags = flags + LookAhead base = self.baseToParser( generator ) if flags or self.errorOnFail: if self.errorOnFail: return [(None, SubTable+flags, tuple(base),1,2),(None, Call, self.errorOnFail)] else: return [(None, SubTable+flags, tuple(base))] else: return base def baseToParser( self, generator=None ): """Parser generation without considering flag settings""" svalue = self.value if self.negative: if self.repeating: # a repeating negative value, a "search" in effect if self.optional: # if fails, then go to end of file return [ (None, sWordStart, TextSearch( svalue ),1,2), (None, Move, ToEOF ) ] else: # must first check to make sure the current position is not the word, then the same return [ (None, Word, svalue, 2,1), (None, Fail, Here), (None, sWordStart, TextSearch( svalue ),1,2), (None, Move, ToEOF ) ] #return [ (None, Word, svalue, 2,1),(None, Fail, Here),(None, WordStart, svalue,1,2), (None, Move, ToEOF ) ] else: # a single-character test saying "not a this" if self.optional: # test for a success, move back if success, move one forward if failure if len(svalue) > 1: return [ (None, Word, svalue, 2,1), (None, Skip, -len(svalue), 2,2), # backup if this was the word to start of word, succeed (None, Skip, 1 ) ] # else just move one character and succeed else: # Uses Is test instead of Word test, should be faster I'd imagine return [ (None, Is, svalue, 2,1), (None, Skip, -1, 2,2), # backtrack (None, Skip, 1 ) ] # else just move one character and succeed else: # must find at least one character not part of the word, so if len(svalue) > 1: return [ (None, Word, svalue, 2,1), (None, Fail, Here), (None, Skip, 1 ) ] # else just move one character and succeed else: #must fail if it finds or move one forward return [ (None, Is, svalue, 2,1), (None, Fail, Here), (None, Skip, 1 ) ] # else just move one character and succeed else: # positive if self.repeating: if self.optional: if len(svalue) > 1: return [ (None, Word, svalue, 1,0) ] else: return [ (None, Is, svalue, 1,0) ] else: # not optional if len(svalue) > 1: return [ (None, Word, svalue),(None, Word, svalue,1,0) ] else: return [ (None, Is, svalue),(None, Is, svalue,1,0) ] else: # not repeating if self.optional: if len(svalue) > 1: return [ (None, Word, svalue, 1,1) ] else: return [ (None, Is, svalue, 1,1) ] else: # not optional if len(svalue) > 1: return [ (None, Word, svalue) ] else: return [ (None, Word, svalue) ] def terminal (self, generator): """Determine if this element is terminal for the generator""" return 1 class _Range( ElementToken ): """Range of character values where any one of the characters may match The Range token allows you to define a set of characters (using a mini-grammar) of which any one may match. By using the repetition flags, it is possible to easily create such common structures as "names" and "numbers". For example: name := [a-zA-Z]+ number := [0-9.eE]+ (Note: those are not beautifully defined examples :) ). The mini-grammar for the simpleparsegrammar is defined as follows: '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']' that is, if a literal ']' character is wanted, you must define the character as the first item in the range. A literal '-' character must appear as the first character after any literal ']' character (or the beginning of the range) or as the last character in the range. Note: The expansion from the mini-grammar occurs before the Range token is created (the simpleparse grammar does the expansion), so the value attribute of the token is actually the expanded string of characters. """ value = "" requiresExpandedSet = 1 def toParser( self, generator=None, noReport=0 ): """Create the parser for the element token""" flags = 0 if self.lookahead: flags = flags + LookAhead base = self.baseToParser( generator ) if flags or self.errorOnFail: if self.errorOnFail: return [(None, SubTable+flags, tuple(base),1,2),(None, Call, self.errorOnFail)] else: return [(None, SubTable+flags, tuple(base))] else: return base # this should be a faster and more generic character set # approach, but there's a bug with mxTextTools b3 which makes # it non-functional, so for now I'm using the old version. # Eventually this should also support the Unicode character sets ##try: ## CharSet ## class Range( _Range ): ## """Range type using the CharSet feature of mx.TextTools 2.1.0 ## ## The CharSet type allows for both Unicode and 256-char strings, ## so we can use it as our 2.1.0 primary parsing mechanism. ## It also allows for simpler definitions (doesn't require that ## we pre-exand the character set). That's going to require support ## in the SimpleParse grammar, of course. ## """ ## requiresExpandedSet = 0 ## def baseToParser( self, generator=None ): ## """Parser generation without considering flag settings""" ## svalue = self.value ## print 'generating range for ', repr(svalue) ## if not svalue: ## raise ValueError( '''Range defined with no member values, would cause infinite loop %s'''%(self)) ## if self.negative: ## svalue = '^' + svalue ## print ' generated', repr(svalue) ## svalue = CharSet(svalue) ## if self.repeating: ## if self.optional: ## return [ (None, AllInCharSet, svalue, 1 ) ] ## else: # not optional ## #return [ (None, AllInSet, svalue ) ] ## return [ (None, AllInCharSet, svalue ) ] ## else: # not repeating ## if self.optional: ## #return [ (None, IsInSet, svalue, 1 ) ] ## return [ (None, IsInCharSet, svalue, 1 ) ] ## else: # not optional ## #return [ (None, IsInSet, svalue ) ] ## return [ (None, IsInCharSet, svalue ) ] ##except NameError: class Range( _Range ): """Range type which doesn't use the CharSet features in mx.TextTools This is likely to be much slower than the CharSet version (below), and is unable to handle unicode character sets. However, it will work with TextTools 2.0.3, which may be needed in some cases. """ def baseToParser( self, generator=None ): """Parser generation without considering flag settings""" svalue = self.value if not svalue: raise ValueError( '''Range defined with no member values, would cause infinite loop %s'''%(self)) if self.negative: if self.repeating: if self.optional: #return [ (None, AllInSet, svalue, 1 ) ] return [ (None, AllNotIn, svalue, 1 ) ] else: # not optional #return [ (None, AllInSet, svalue ) ] return [ (None, AllNotIn, svalue ) ] else: # not repeating if self.optional: #return [ (None, IsInSet, svalue, 1 ) ] return [ (None, IsNotIn, svalue, 1 ) ] else: # not optional #return [ (None, IsInSet, svalue ) ] return [ (None, IsNotIn, svalue ) ] else: if self.repeating: if self.optional: #return [ (None, AllInSet, svalue, 1 ) ] return [ (None, AllIn, svalue, 1 ) ] else: # not optional #return [ (None, AllInSet, svalue ) ] return [ (None, AllIn, svalue ) ] else: # not repeating if self.optional: #return [ (None, IsInSet, svalue, 1 ) ] return [ (None, IsIn, svalue, 1 ) ] else: # not optional #return [ (None, IsInSet, svalue ) ] return [ (None, IsIn, svalue ) ] def terminal (self, generator): """Determine if this element is terminal for the generator""" return 1 class Group( ElementToken ): """Abstract base class for all group element tokens The primary feature of a group is that it has a set of element tokens stored in the attribute "children". """ children = () terminalValue = None def terminal (self, generator): """Determine if this element is terminal for the generator""" if self.terminalValue in (0,1): return self.terminalValue self.terminalValue = 0 for item in self.children: if not item.terminal( generator): return self.terminalValue self.terminalValue = 1 return self.terminalValue class SequentialGroup( Group ): """A sequence of element tokens which must match in a particular order A sequential group must match each child in turn and all children must be satisfied to consider the group matched. Within the simpleparsegrammar, the sequential group is defined like so: ("a", b, c, "d") i.e. a series of comma-separated element token definitions. """ def toParser( self, generator=None, noReport=0 ): elset = [] for child in self.children: elset.extend( child.toParser( generator, noReport ) ) basic = self.permute( (None, SubTable, tuple( elset)) ) if len(basic) == 1: first = basic[0] if len(first) == 3 and first[0] is None and first[1] == SubTable: return tuple(first[2]) return basic class CILiteral( SequentialGroup ): """Case-insensitive Literal values The CILiteral is a sequence of literal and character-range values, where each element is positive and required. Literal values are composed of those characters which are not upper-case/lower-case pairs, while the ranges are all two-character ranges with the upper and lower forms. CILiterals in the SimpleParse EBNF grammar are defined like so: c"test", c"test"?, c"test"*, c"test"+ -c"test", -c"test"?, -c"test"*, -c"test"+ Attributes: value -- a string storing the literal's value Notes: Currently we don't support Unicode literals A CILiteral will be *much* slower than a regular literal or character range """ value = "" def toParser( self, generator=None, noReport=0 ): elset = self.ciParse( self.value ) if len(elset) == 1: # XXX should be compressing these out during optimisation... # pointless declaration of case-insensitivity, # or a single-character value pass basic = self.permute( (None, SubTable, tuple( elset)) ) if len(basic) == 1: first = basic[0] if len(first) == 3 and first[0] is None and first[1] == SubTable: return tuple(first[2]) return basic def ciParse( self, value ): """Break value into set of case-dependent groups...""" def equalPrefix( a,b ): for x in range(len(a)-1): if a[x] != b[x]: return x result = [] a,b = value.upper(), value.lower() while a and b: # is there an equal literal run at the start? stringPrefix = equalPrefix( a,b ) if stringPrefix: result.append( (None, Word, a[:stringPrefix]) ) a,b = a[stringPrefix:],b[stringPrefix:] # if we hit the end of the string, that's fine, just return if not a and b: break # otherwise, the next character must be a case-differing pair result.append( (None, IsIn, a[0]+b[0]) ) a,b = a[1:], b[1:] return result class ErrorOnFail(ElementToken): """When called as a matching function, raises a SyntaxError Attributes: expected -- list of strings describing expected productions production -- string name of the production that's failing to parse message -- overrides default message generation if non-null (something,something)+! (something,something)! (something,something)+!"Unable to parse somethings in my production" (something,something)!"Unable to parse somethings in my production" if string -> give an explicit message (with optional % values) else -> use a default string """ production = "" message = "" expected = "" def __call__( self, text, position, end ): """Method called by mxTextTools iff the base production fails""" error = ParserSyntaxError( self.message ) error.message = self.message error.production = self.production error.expected= self.expected error.buffer = text error.position = position raise error def copy( self ): import copy return copy.copy( self ) class FirstOfGroup( Group ): """Set of tokens that matches (and stops searching) with the first successful child A FirstOf group attempts to match each child in turn, declaring success with the first successful child, or failure if none of the children match. Within the simpleparsegrammar, the FirstOf group is defined like so: ("a" / b / c / "d") i.e. a series of slash-separated element token definitions. """ def toParser( self, generator=None, noReport=0 ): elset = [] # should catch condition where a child is optional # and we are repeating (which causes a crash during # parsing), but doing so is rather complex and # requires analysis of the whole grammar. for el in self.children: assert not el.optional, """Optional child of a FirstOf group created, this would cause an infinite recursion in the engine, child was %s"""%el dataset = el.toParser( generator, noReport ) if len( dataset) == 1:# and len(dataset[0]) == 3: # we can alter the jump states with impunity elset.append( dataset[0] ) else: # for now I'm eating the inefficiency and doing an extra SubTable for all elements to allow for easy calculation of jumps within the FO group elset.append( (None, SubTable, tuple( dataset )) ) procset = [] for i in range( len( elset) -1): # note that we have to treat last el specially procset.append( elset[i] + (1,len(elset)-i) ) # if success, jump past end procset.append( elset[-1] ) # will cause a failure if last element doesn't match procset = tuple(procset) basetable = (None, SubTable, procset ) return self.permute( basetable ) class Prebuilt( ElementToken ): """Holder for pre-built TextTools tag tables You can pass in a Pre-built tag table when creating your grammar, doing so creates Prebuilt element tokens which can be referenced by the other element tokens in your grammar. """ value = () def toParser( self, generator=None, noReport=0 ): return self.value class LibraryElement( ElementToken ): """Holder for a prebuilt item with it's own generator""" generator = None production = "" methodSource = None def toParser( self, generator=None, noReport=0 ): if self.methodSource is None: source = generator.methodSource else: source = self.methodSource basetable = self.generator.buildParser( self.production, source ) try: if type(basetable[0]) == type(()): if len(basetable) == 1 and len(basetable[0]) == 3: basetable = basetable[0] else: # this is a table that got returned! basetable = (None, SubTable, basetable) return self.permute( basetable ) except: print basetable raise class Name( ElementToken ): """Reference to another rule in the grammar The Name element token allows you to reference another production within the grammar. There are three major sub-categories of reference depending on both the Name element token and the referenced table's values. if the Name token's report attribute is false, or the target table's report attribute is false, or the Name token negative attribute is true, the Name reference will report nothing in the result tree if the target's expand attribute is true, however, the Name reference will report the children of the target production without reporting the target production's results (SubTable match) finally: if the target is not expanded and the Name token should report something, the generator object is asked to supply the tag object and flags for processing the results of the target. See the generator.MethodSource documentation for details. Notes: expanded and un-reported productions won't get any methodsource methods called when they are finished, that's just how I decided to do it, not sure if there's some case where you'd want it. As a result, it's possible to have a method getting called for one instance (where a name ref is reporting) and not for another (where the name ref isn't reporting). """ value = "" # following two flags are new ideas in the rewrite... report = 1 def toParser( self, generator, noReport=0 ): """Create the table for parsing a name-reference Note that currently most of the "compression" optimisations occur here. """ sindex = generator.getNameIndex( self.value ) command = TableInList target = generator.getRootObjects()[sindex] reportSelf = ( (not noReport) and # parent hasn't suppressed reporting self.report and # we are not suppressing ourselves target.report and # target doesn't suppress reporting (not self.negative) and # we aren't a negation, which doesn't report anything by itself (not target.expanded) # we don't report the expanded production ) reportChildren = ( (not noReport) and # parent hasn't suppressed reporting self.report and # we are not suppressing ourselves target.report and # target doesn't suppress reporting (not self.negative) # we aren't a negation, which doesn't report anything by itself ) if reportSelf: svalue = self.value else: svalue = None flags = 0 if target.expanded: # the target is the root of an expandedname declaration # so we need to do special processing to make sure that # it gets properly reported... command = SubTableInList tagobject = None # check for indirected reference to another name... elif not reportSelf: tagobject = svalue else: flags, tagobject = generator.getObjectForName( svalue ) if flags: command = command | flags if tagobject is None and not flags: if self.terminal(generator): if extractFlags(self,reportChildren) != extractFlags(target): composite = compositeFlags(self,target, reportChildren) partial = generator.getCustomTerminalParser( sindex,composite) if partial is not None: return partial partial = tuple(copyToNewFlags(target, composite).toParser( generator, not reportChildren )) generator.cacheCustomTerminalParser( sindex,composite, partial) return partial else: partial = generator.getTerminalParser( sindex ) if partial is not None: return partial partial = tuple(target.toParser( generator, not reportChildren )) generator.setTerminalParser( sindex, partial) return partial # base, required, positive table... if ( self.terminal( generator ) and (not flags) and isinstance(target, (SequentialGroup,Literal,Name,Range)) ): partial = generator.getTerminalParser( sindex ) if partial is None: partial = tuple(target.toParser( generator, #not reportChildren )) generator.setTerminalParser( sindex, partial) if len(partial) == 1 and len(partial[0]) == 3 and ( partial[0][0] is None or tagobject is None ): # there is a single child # it doesn't report anything, or we don't partial = (partial[0][0] or tagobject,)+ partial[0][1:] else: partial = (tagobject, Table, tuple(partial)) return self.permute( partial ) basetable = ( tagobject, command, ( generator.getParserList (), sindex, ) ) return self.permute( basetable ) terminalValue = None def terminal (self, generator): """Determine if this element is terminal for the generator""" if self.terminalValue in (0,1): return self.terminalValue self.terminalValue = 0 target = generator.getRootObject( self.value ) if target.terminal( generator): self.terminalValue = 1 return self.terminalValue def extractFlags( item, report=1 ): """Extract the flags from an item as a tuple""" return ( item.negative, item.optional, item.repeating, item.errorOnFail, item.lookahead, item.report and report, ) def compositeFlags( first, second, report=1 ): """Composite flags from two items into overall flag-set""" result = [] for a,b in map(None, extractFlags(first, report), extractFlags(second, report)): result.append( a or b ) return tuple(result) def copyToNewFlags( target, flags ): """Copy target using combined flags""" new = copy.copy( target ) for name,value in map(None, ("negative","optional","repeating","errorOnFail","lookahead",'report'), flags, ): setattr(new, name,value) return new