#!/usr/bin/env python # -*- coding: utf-8 -*- """simple parser / string tokenizer rather than returning a list of token types etc, we simple return a list of tokens... each tokenizing function takes a string as input and returns a list of tokens """ # Copyright 2002, 2003 St James Software # # This file is part of translate. # # translate is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # translate is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with translate; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA def stringeval(text): """takes away repeated quotes (escapes) and returns the string represented by the text""" stringchar = text[0] if text[-1] != stringchar or stringchar not in ("'",'"'): # scratch your head raise ValueError, "error parsing escaped string: %r" % text return text[1:-1].replace(stringchar+stringchar,stringchar) def stringquote(text): """escapes quotes as neccessary and returns a string representing the text""" if "'" in text: if '"' in text: return '"' + text.replace('"', '""') + '"' else: return '"' + text + '"' else: return "'" + text + "'" class ParserError(ValueError): """Intelligent parser error""" def __init__(self, parser, message, tokennum): """takes a message and the number of the token that caused the error""" tokenpos = parser.findtokenpos(tokennum) line, charpos = parser.getlinepos(tokenpos) ValueError.__init__(self, "%s at line %d, char %d (token %r)" % \ (message, line, charpos, parser.tokens[tokennum])) self.parser = parser self.tokennum = tokennum class SimpleParser: """this is a simple parser""" def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0): if defaulttokenlist is None: self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>'] self.defaulttokenlist.extend('(),[]:=+-') else: self.defaulttokenlist = defaulttokenlist self.whitespacechars = whitespacechars self.includewhitespacetokens = includewhitespacetokens self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens] self.quotechars = ('"', "'") self.endquotechars = {'"':'"',"'":"'"} self.stringescaping = 1 def stringtokenize(self, text): """makes strings in text into tokens...""" tokens = [] laststart = 0 instring = 0 endstringchar, escapechar = '', '\\' gotclose, gotescape = 0, 0 for pos in range(len(text)): char = text[pos] if instring: if self.stringescaping and (gotescape or char == escapechar) and not gotclose: gotescape = not gotescape elif char == endstringchar: gotclose = not gotclose elif gotclose: tokens.append(text[laststart:pos]) instring, laststart, endstringchar = 0, pos, '' if not instring: if char in self.quotechars: if pos > laststart: tokens.append(text[laststart:pos]) instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0 if laststart < len(text): tokens.append(text[laststart:]) return tokens def keeptogether(self, text): """checks whether a token should be kept together""" return self.isstringtoken(text) def isstringtoken(self, text): """checks whether a token is a string token""" return text[:1] in self.quotechars def separatetokens(self, text, tokenlist = None): """this separates out tokens in tokenlist from whitespace etc""" if self.keeptogether(text): return [text] if tokenlist is None: tokenlist = self.defaulttokenlist # loop through and put tokens into a list tokens = [] pos = 0 laststart = 0 lentext = len(text) while pos < lentext: foundtoken = 0 for token in tokenlist: lentoken = len(token) if text[pos:pos+lentoken] == token: if laststart < pos: tokens.append(text[laststart:pos]) tokens.append(token) pos += lentoken foundtoken, laststart = 1, pos break if not foundtoken: pos += 1 if laststart < lentext: tokens.append(text[laststart:]) return tokens def removewhitespace(self, text): """this removes whitespace but lets it separate things out into separate tokens""" if self.keeptogether(text): return [text] # loop through and put tokens into a list tokens = [] pos = 0 inwhitespace = 0 laststart = 0 for pos in range(len(text)): char = text[pos] if inwhitespace: if char not in self.whitespacechars: if laststart < pos and self.includewhitespacetokens: tokens.append(text[laststart:pos]) inwhitespace, laststart = 0, pos else: if char in self.whitespacechars: if laststart < pos: tokens.append(text[laststart:pos]) inwhitespace, laststart = 1, pos if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens): tokens.append(text[laststart:]) return tokens def applytokenizer(self, inputlist, tokenizer): """apply a tokenizer to a set of text, flattening the result""" tokenizedlists = [tokenizer(text) for text in inputlist] joined = [] map(joined.extend, tokenizedlists) return joined def applytokenizers(self, inputlist, tokenizers): """apply a set of tokenizers to a set of text, flattening each time""" for tokenizer in tokenizers: inputlist = self.applytokenizer(inputlist, tokenizer) return inputlist def tokenize(self, source, tokenizers=None): """tokenize the text string with the standard tokenizers""" self.source = source if tokenizers is None: tokenizers = self.standardtokenizers self.tokens = self.applytokenizers([self.source], tokenizers) return self.tokens def findtokenpos(self, tokennum): """finds the position of the given token in the text""" currenttokenpos = 0 for currenttokennum in range(tokennum+1): currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos) return currenttokenpos def getlinepos(self, tokenpos): """finds the line and character position of the given character""" sourcecut = self.source[:tokenpos] line = sourcecut.count("\n")+1 charpos = tokenpos - sourcecut.rfind("\n") return line, charpos def raiseerror(self, message, tokennum): """raises a ParserError""" raise ParserError(self, message, tokennum)