# -*-python-*- # # Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. # # By using this file, you agree to the terms and conditions set forth in # the LICENSE.html file which can be found at the top level of the ViewVC # distribution or at http://viewvc.org/license-1.html. # # For more information, visit http://viewvc.org/ # # ----------------------------------------------------------------------- import string # note: this will raise an ImportError if it isn't available. the rcsparse # package will recognize this and switch over to the default parser. from mx import TextTools import common # for convenience _tt = TextTools _idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256)) _idchar_list.remove('$') _idchar_list.remove(',') #_idchar_list.remove('.') # leave as part of 'num' symbol _idchar_list.remove(':') _idchar_list.remove(';') _idchar_list.remove('@') _idchar = string.join(_idchar_list, '') _idchar_set = _tt.set(_idchar) _onechar_token_set = _tt.set(':;') _not_at_set = _tt.invset('@') _T_TOKEN = 30 _T_STRING_START = 40 _T_STRING_SPAN = 60 _T_STRING_END = 70 _E_COMPLETE = 100 # ended on a complete token _E_TOKEN = 110 # ended mid-token _E_STRING_SPAN = 130 # ended within a string _E_STRING_END = 140 # ended with string-end ('@') (could be mid-@@) _SUCCESS = +100 _EOF = 'EOF' _CONTINUE = 'CONTINUE' _UNUSED = 'UNUSED' # continuation of a token over a chunk boundary _c_token_table = ( (_T_TOKEN, _tt.AllInSet, _idchar_set), ) class _mxTokenStream: # the algorithm is about the same speed for any CHUNK_SIZE chosen. # grab a good-sized chunk, but not too large to overwhelm memory. # note: we use a multiple of a standard block size CHUNK_SIZE = 192 * 512 # about 100k # CHUNK_SIZE = 5 # for debugging, make the function grind... def __init__(self, file): self.rcsfile = file self.tokens = [ ] self.partial = None self.string_end = None def _parse_chunk(self, buf, start=0): "Get the next token from the RCS file." buflen = len(buf) assert start < buflen # construct a tag table which refers to the buffer we need to parse. table = ( #1: ignore whitespace. with or without whitespace, move to the next rule. (None, _tt.AllInSet, _tt.whitespace_set, +1), #2 (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS), #3: accumulate token text and exit, or move to the next rule. (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2), #4 (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS), #5: single character tokens exit immediately, or move to the next rule (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2), #6 (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS), #7: if this isn't an '@' symbol, then we have a syntax error (go to a # negative index to indicate that condition). otherwise, suck it up # and move to the next rule. (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'), #8 (None, _tt.Is, '@', +4, +1), #9 (buf, _tt.Is, '@', +1, -1), #10 (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1), #11 (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS), #12 (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS), #13: suck up everything that isn't an AT. go to next rule to look for EOF (buf, _tt.AllInSet, _not_at_set, 0, +1), #14: go back to look for double AT if we aren't at the end of the string (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS), ) # Fast, texttools may be, but it's somewhat lacking in clarity. # Here's an attempt to document the logic encoded in the table above: # # Flowchart: # _____ # / /\ # 1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11 # | \/ \/ \/ /\ \/ # \ 4 6 12 14 / # \_______/_____/ \ / / # \ 13 / # \__________________________________________/ # # #1: Skip over any whitespace. # #2: If now EOF, exit with code _E_COMPLETE. # #3: If we have a series of characters in _idchar_set, then: # #4: Output them as a token, and go back to #1. # #5: If we have a character in _onechar_token_set, then: # #6: Output it as a token, and go back to #1. # #7: If we do not have an '@', then error. # If we do, then log a _T_STRING_START and continue. # #8: If we have another '@', continue on to #9. Otherwise: # #12: If now EOF, exit with code _E_STRING_SPAN. # #13: Record the slice up to the next '@' (or EOF). # #14: If now EOF, exit with code _E_STRING_SPAN. # Otherwise, go back to #8. # #9: If we have another '@', then we've just seen an escaped # (by doubling) '@' within an @-string. Record a slice including # just one '@' character, and jump back to #8. # Otherwise, we've *either* seen the terminating '@' of an @-string, # *or* we've seen one half of an escaped @@ sequence that just # happened to be split over a chunk boundary - in either case, # we continue on to #10. # #10: Log a _T_STRING_END. # #11: If now EOF, exit with _E_STRING_END. Otherwise, go back to #1. success, taglist, idx = _tt.tag(buf, table, start) if not success: ### need a better way to report this error raise common.RCSIllegalCharacter() assert idx == buflen # pop off the last item last_which = taglist.pop() i = 0 tlen = len(taglist) while i < tlen: if taglist[i] == _T_STRING_START: j = i + 1 while j < tlen: if taglist[j] == _T_STRING_END: s = _tt.join(taglist, '', i+1, j) del taglist[i:j] tlen = len(taglist) taglist[i] = s break j = j + 1 else: assert last_which == _E_STRING_SPAN s = _tt.join(taglist, '', i+1) del taglist[i:] self.partial = (_T_STRING_SPAN, [ s ]) break i = i + 1 # figure out whether we have a partial last-token if last_which == _E_TOKEN: self.partial = (_T_TOKEN, [ taglist.pop() ]) elif last_which == _E_COMPLETE: pass elif last_which == _E_STRING_SPAN: assert self.partial else: assert last_which == _E_STRING_END self.partial = (_T_STRING_END, [ taglist.pop() ]) taglist.reverse() taglist.extend(self.tokens) self.tokens = taglist def _set_end(self, taglist, text, l, r, subtags): self.string_end = l def _handle_partial(self, buf): which, chunks = self.partial if which == _T_TOKEN: success, taglist, idx = _tt.tag(buf, _c_token_table) if not success: # The start of this buffer was not a token. So the end of the # prior buffer was a complete token. self.tokens.insert(0, string.join(chunks, '')) else: assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \ and taglist[0][1] == 0 and taglist[0][2] == idx if idx == len(buf): # # The whole buffer was one huge token, so we may have a # partial token again. # # Note: this modifies the list of chunks in self.partial # chunks.append(buf) # consumed the whole buffer return len(buf) # got the rest of the token. chunks.append(buf[:idx]) self.tokens.insert(0, string.join(chunks, '')) # no more partial token self.partial = None return idx if which == _T_STRING_END: if buf[0] != '@': self.tokens.insert(0, string.join(chunks, '')) return 0 chunks.append('@') start = 1 else: start = 0 self.string_end = None string_table = ( (None, _tt.Is, '@', +3, +1), (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1), (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS), (None, _tt.EOF, _tt.Here, +1, _SUCCESS), # suck up everything that isn't an AT. move to next rule to look # for EOF (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1), # go back to look for double AT if we aren't at the end of the string (None, _tt.EOF, _tt.Here, -5, _SUCCESS), ) success, unused, idx = _tt.tag(buf, string_table, start, len(buf), chunks) # must have matched at least one item assert success if self.string_end is None: assert idx == len(buf) self.partial = (_T_STRING_SPAN, chunks) elif self.string_end < len(buf): self.partial = None self.tokens.insert(0, string.join(chunks, '')) else: self.partial = (_T_STRING_END, chunks) return idx def _parse_more(self): buf = self.rcsfile.read(self.CHUNK_SIZE) if not buf: return _EOF if self.partial: idx = self._handle_partial(buf) if idx is None: return _CONTINUE if idx < len(buf): self._parse_chunk(buf, idx) else: self._parse_chunk(buf) return _CONTINUE def get(self): try: return self.tokens.pop() except IndexError: pass while not self.tokens: action = self._parse_more() if action == _EOF: return None return self.tokens.pop() # _get = get # def get(self): token = self._get() print 'T:', `token` return token def match(self, match): if self.tokens: token = self.tokens.pop() if token != match: raise RuntimeError, ('Unexpected parsing error in RCS file.\n' 'Expected token: %s, but saw: %s' % (match, token)) else: token = self.get() if token != match: raise RuntimeError, ('Unexpected parsing error in RCS file.\n' 'Expected token: %s, but saw: %s' % (match, token)) def unget(self, token): self.tokens.append(token) def mget(self, count): "Return multiple tokens. 'next' is at the end." while len(self.tokens) < count: action = self._parse_more() if action == _EOF: ### fix this raise RuntimeError, 'EOF hit while expecting tokens' result = self.tokens[-count:] del self.tokens[-count:] return result class Parser(common._Parser): stream_class = _mxTokenStream