#!/usr/bin/env python # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*- """ TextParser.py $Id: TextParser.py,v 1.66 2004/04/13 03:37:16 jimj Exp $ Copyright 1999,2000 by Holger Duerer Distributable under the GNU General Public License Version 2 or newer. """ ## The following section tries to get the PyPlucker directory onto the ## system path if called as a script and if it is not yet there: try: import PyPlucker except ImportError: import os, sys file = sys.argv[0] while os.path.islink (file): file = os.readlink (file) sys.path = [os.path.split (os.path.dirname (file))[0]] + sys.path try: import PyPlucker except ImportError: print "Cannot find where module PyPlucker is located!" sys.exit (1) # and forget the temp names... del file, os del PyPlucker ## ## Now PyPlucker things should generally be importable ## import string import re try: # if the user has the new xml package installed this might be faster # as that package includes one that uses C code to parse from xml.parsers import sgmllib except ImportError: import sgmllib # Fix for the missing comma in raw attributes import re, string # # Originally, plucker overrode this to catch the "@" of # mailto links. This bugfix is already in newer pythons. # If you have a new enough version (the sgmllib distributed # with Python 2.3.3), then we should use the (newer) python # version. This checks whether or not attrfind knows that # @ can appear in unquoted attribute values. If so, use the # python version. 
# temp=sgmllib.attrfind.match('href=mailto:user@host').groups()[2] if temp is None or '@' not in temp: sgmllib.attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') import sys import struct import urllib import htmlentitydefs from PyPlucker import PluckerDocs from PyPlucker import Url from PyPlucker import DEFAULT_LOCALE_CHARSET_ENCODING from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum from PyPlucker import UtilFns message = UtilFns.message # the following constant states how big (approximately) one single # paragraphs should maximally be Max_Paragraph_Size = 1000 # how much more to allow in order not to break an anchor Max_Paragraph_Size_Anchor_Stretch = 150 ## The following are used in the parser to clean up things. _RE_WHITESPACE = re.compile ("[\n\f \t]+") _RE_NONSPACEWHITESPACE = re.compile ("[\n\f\t\t]+") _CLEANUP_TRANSTABLE = string.maketrans ("\f", "\n") ## ## There are several colorsets in use on the web. ## ## Plucker has traditionally used the HTML4 color names ## (Colornames_Strict). ## ## CSS2.1 ## adds orange ("FFA500") and several "colors" that depend on ## current system settings. It isn't clear what to do with ## system colors, except maybe defaulting to black. ## ## SVG adds several other colors (from the X windows color ## set, plus some grays). ## ## CSS3 ## color names represent the future of standards, but splits ## color names into several different profiles. Full support ## is more complicated than Plucker currently supports. ## ## Some web sites use Crayola colors. 
##
# The sixteen HTML4 colour names, mapped to upper-case RRGGBB hex strings.
Colornames_Strict = {
    'black': "000000",
    'silver': "C0C0C0",
    'gray': "808080",
    'white': "FFFFFF",
    'maroon': "800000",
    'red': "FF0000",
    'purple': "800080",
    'fuchsia': "FF00FF",
    'green': "008000",
    'lime': "00FF00",
    'olive': "808000",
    'yellow': "FFFF00",
    'navy': "000080",
    'blue': "0000FF",
    'teal': "008080",
    'aqua': "00FFFF",
    }

# The SVG colour names (a superset of the HTML4 names above).
Colornames_SVG = {
    'aliceblue': 'F0F8FF',
    'antiquewhite': 'FAEBD7',
    'aqua': '00FFFF',
    'aquamarine': '7FFFD4',
    'azure': 'F0FFFF',
    'beige': 'F5F5DC',
    'bisque': 'FFE4C4',
    'black': '000000',
    'blanchedalmond': 'FFEBCD',
    'blue': '0000FF',
    'blueviolet': '8A2BE2',
    'brown': 'A52A2A',
    'burlywood': 'DEB887',
    'cadetblue': '5F9EA0',
    'chartreuse': '7FFF00',
    'chocolate': 'D2691E',
    'coral': 'FF7F50',
    'cornflowerblue': '6495ED',
    'cornsilk': 'FFF8DC',
    'crimson': 'DC143C',
    'cyan': '00FFFF',
    'darkblue': '00008B',
    'darkcyan': '008B8B',
    'darkgoldenrod': 'B8860B',
    'darkgray': 'A9A9A9',
    'darkgreen': '006400',
    'darkgrey': 'A9A9A9',
    'darkkhaki': 'BDB76B',
    'darkmagenta': '8B008B',
    'darkolivegreen': '556B2F',
    'darkorange': 'FF8C00',
    'darkorchid': '9932CC',
    'darkred': '8B0000',
    'darksalmon': 'E9967A',
    'darkseagreen': '8FBC8F',
    'darkslateblue': '483D8B',
    'darkslategray': '2F4F4F',
    'darkslategrey': '2F4F4F',
    'darkturquoise': '00CED1',
    'darkviolet': '9400D3',
    'deeppink': 'FF1493',
    'deepskyblue': '00BFFF',
    'dimgray': '696969',
    'dimgrey': '696969',
    'dodgerblue': '1E90FF',
    'firebrick': 'B22222',
    'floralwhite': 'FFFAF0',
    'forestgreen': '228B22',
    'fuchsia': 'FF00FF',
    'gainsboro': 'DCDCDC',
    'ghostwhite': 'F8F8FF',
    'gold': 'FFD700',
    'goldenrod': 'DAA520',
    'gray': '808080',
    'green': '008000',
    'greenyellow': 'ADFF2F',
    'grey': '808080',
    'honeydew': 'F0FFF0',
    'hotpink': 'FF69B4',
    'indianred': 'CD5C5C',
    'indigo': '4B0082',
    'ivory': 'FFFFF0',
    'khaki': 'F0E68C',
    'lavender': 'E6E6FA',
    'lavenderblush': 'FFF0F5',
    'lawngreen': '7CFC00',
    'lemonchiffon': 'FFFACD',
    'lightblue': 'ADD8E6',
    'lightcoral': 'F08080',
    'lightcyan': 'E0FFFF',
    'lightgoldenrodyellow': 'FAFAD2',
    'lightgray': 'D3D3D3',
    'lightgreen': '90EE90',
    'lightgrey': 'D3D3D3',
    'lightpink': 'FFB6C1',
    'lightsalmon': 'FFA07A',
    'lightseagreen': '20B2AA',
    'lightskyblue': '87CEFA',
    'lightslategray': '778899',
    'lightslategrey': '778899',
    'lightsteelblue': 'B0C4DE',
    'lightyellow': 'FFFFE0',
    'lime': '00FF00',
    'limegreen': '32CD32',
    'linen': 'FAF0E6',
    'magenta': 'FF00FF',
    'maroon': '800000',
    'mediumaquamarine': '66CDAA',
    'mediumblue': '0000CD',
    'mediumorchid': 'BA55D3',
    'mediumpurple': '9370DB',
    'mediumseagreen': '3CB371',
    'mediumslateblue': '7B68EE',
    'mediumspringgreen': '00FA9A',
    'mediumturquoise': '48D1CC',
    'mediumvioletred': 'C71585',
    'midnightblue': '191970',
    'mintcream': 'F5FFFA',
    'mistyrose': 'FFE4E1',
    'moccasin': 'FFE4B5',
    'navajowhite': 'FFDEAD',
    'navy': '000080',
    'oldlace': 'FDF5E6',
    'olive': '808000',
    'olivedrab': '6B8E23',
    'orange': 'FFA500',
    'orangered': 'FF4500',
    'orchid': 'DA70D6',
    'palegoldenrod': 'EEE8AA',
    'palegreen': '98FB98',
    'paleturquoise': 'AFEEEE',
    'palevioletred': 'DB7093',
    'papayawhip': 'FFEFD5',
    'peachpuff': 'FFDAB9',
    'peru': 'CD853F',
    'pink': 'FFC0CB',
    'plum': 'DDA0DD',
    'powderblue': 'B0E0E6',
    'purple': '800080',
    'red': 'FF0000',
    'rosybrown': 'BC8F8F',
    'royalblue': '4169E1',
    'saddlebrown': '8B4513',
    'salmon': 'FA8072',
    'sandybrown': 'F4A460',
    'seagreen': '2E8B57',
    'seashell': 'FFF5EE',
    'sienna': 'A0522D',
    'silver': 'C0C0C0',
    'skyblue': '87CEEB',
    'slateblue': '6A5ACD',
    'slategray': '708090',
    'slategrey': '708090',
    'snow': 'FFFAFA',
    'springgreen': '00FF7F',
    'steelblue': '4682B4',
    'tan': 'D2B48C',
    'teal': '008080',
    'thistle': 'D8BFD8',
    'tomato': 'FF6347',
    'turquoise': '40E0D0',
    'violet': 'EE82EE',
    'wheat': 'F5DEB3',
    'white': 'FFFFFF',
    'whitesmoke': 'F5F5F5',
    'yellow': 'FFFF00',
    'yellowgreen': '9ACD32',
    }

# The table that _parse_color() actually consults.
Colornames = Colornames_SVG

# Exactly three hex digits: the one-digit-per-channel shorthand ("F00").
_RE_SHORT_HEX_COLOR = re.compile('^[0-9a-fA-F]{3}$')


def _parse_color(value):
    """Return 'value' as an upper-case six-digit RRGGBB hex string.

    The following interpretations are tried in order:

      1. a colour name exactly as given (e.g. 'silver'),
      2. the lower-cased colour name (e.g. 'Silver'),
      3. a three-digit hex shorthand, with or without a leading '#'
         (e.g. '#F00' -> 'FF0000'),
      4. a hex number of up to six digits, with or without a leading '#',
         zero-padded on the left (e.g. 'c0c0c0' -> 'C0C0C0').

    Anything else (including the empty string) is reported through
    message() and mapped to black ("000000").

    Earlier revisions tried the zero-padded form before the shorthand, so
    the shorthand was never reached and '#F00' came out as '000F00'; they
    also raised IndexError on an empty string.
    """
    try:
        return Colornames[value]
    except KeyError:
        pass
    # This is redundant, if the page is coded entirely properly.
    try:
        return Colornames[value.lower()]
    except KeyError:
        pass

    digits = value.strip()
    if digits[:1] == '#':
        digits = digits[1:]

    if _RE_SHORT_HEX_COLOR.match(digits):
        # One digit per channel: double each of them.
        return ''.join([digit * 2 for digit in digits.upper()])

    try:
        number = int(digits, 16)
    except ValueError:
        number = -1
    # Reject negative numbers and anything that needs more than six digits.
    if 0 <= number <= 0xFFFFFF:
        return '%06X' % number

    message(1, "Giving up on color %s, using black.", digits)
    return "000000"


def _list_to_dict(alist):
    """Convert [(attr1,val), (attr2,val) ...] to {attr1:val, attr2:val}

    The sgml parser returns attributes as a list of key-value pairs.
    This function puts them in a dictionary, for easier use.  Keys are
    lower-cased and values are passed through cleanup_attribute()."""
    assert isinstance(alist, list)
    result = {}
    for (key, val) in alist:
        # Lower-casing is done by Python's own parser already,
        # but the XML package's version does not do it.
        result[key.lower()] = cleanup_attribute(val)
    return result


# Kept for the rest of the module; cleanup_attribute() below uses the
# un-anchored _RE_ENTITY_REF instead.
_entitycharref = re.compile('^(.*)&([#a-zA-Z][-.a-zA-Z0-9]*);(.*)$')
_html_char_ref_pattern = re.compile('^&#([0-9]+);$')

# One "&name;" or "&#digits;" reference anywhere in a string.
_RE_ENTITY_REF = re.compile('&([#a-zA-Z][-.a-zA-Z0-9]*);')

# These junk "alt" attribute values are not worth showing.
junk_alt_attributes = ("img", "[img]", "spacer", "")


def _decode_entity_ref(match):
    """Return the replacement text for one match of _RE_ENTITY_REF."""
    content = match.group(1)
    if content[0] == "#":
        content = content[1:]
        try:
            n = int(content)
        except ValueError:
            # Not a decimal number: might as well pass it through.
            # No worse than a "?".
            return "&#" + content
        if 0 <= n <= 255:
            return chr(n)
        # There is no document builder at hand here, so the character
        # cannot be added as a real Unicode character; keep the reference
        # verbatim instead.
        return "&#%d;" % n
    # Unknown names usually stem from an unescaped "&" -- pass them through.
    return htmlentitydefs.entitydefs.get(content, "&" + content)


def cleanup_attribute(text):
    """Resolve the character and entity references in an attribute value.

    Named entities found in htmlentitydefs.entitydefs are replaced by
    their definition and decimal references in the range 0-255 by the
    character itself.  Decimal references above 255 are kept verbatim.
    Unknown names and non-decimal numeric references are passed through
    as "&name" / "&#text" (without the closing ";").  Text without any
    "&...;" reference is returned unchanged.

    Earlier revisions raised NameError for every decimal reference above
    255, left values that contain a newline completely undecoded and
    recursed once per reference."""
    return _RE_ENTITY_REF.sub(_decode_entity_ref, text)


def _clean_newlines(text):
    """Try to clean up newlines in source code to be UNIXy.

    We assume that CP/M derived OSes use \\r\\n as line terminator and
    MacOS uses only \\r.  Both versions are converted to use only \\n.

    XXX We should probably use python's universal newlines now."""
    return text.replace("\r\n", "\n").replace("\r", "\n")


class AttributeStack:
    """A data structure to maintain information about the current text
    attributes.

    The raw value for the plucker DB is the font to set (as stated by
    Michael Nordstrom on plucker-dev or found in os.c):

     | OS2
     | ---
     | 0: stdFont,        1: boldFont,       2: boldFont,       3: boldFont,
     | 4: boldFont,       5: stdFont,        6: stdFont,        7: stdFont
     | 8: stdFont         9: stdFont
     |
     | OS3
     | ---
     | 0: stdFont,        1: largeBoldFont,  2: largeBoldFont,  3: largeFont,
     | 4: largeFont,      5: boldFont,       6: boldFont,       7: boldFont
     | 8: fixedWidthFont  9: stdFont

    The awk parser sets:  b : 7,  hN : N  (for N = 1, 2, 3, 4, 5, 6)
    """

    def __init__(self):
        # Style tag -> numeric style code.
        self._tags = {
            "": 0,
            "b": 7,
            "th": 7,  # we also make table heads configurable
            "h1": 1,
            "h2": 2,
            "h3": 3,
            "h4": 4,
            "h5": 5,
            "h6": 6,
            "tt": 8,
            "pre": 8,
            "small": 9,
            "sub": 10,
            "sup": 11,
            }
        self._stack = [""]
        self._alignment = [0]
        self._left_margin = 0
        self._right_margin = 0
        self._italics_depth = 0
        self._underline_depth = 0
        self._strike_depth = 0
        self._forecolor = ["000000"]  # Default color for text.
        self._tableborder_forecolor = ["default"]  # Default color for table borders

    def indent(self, change_left=0, change_right=0):
        """Shift the margins by the given amounts and return the new
        (left, right) pair.  The left margin is clamped to 0..60 and the
        right margin to 0..120; calling without arguments just reads the
        current values."""
        self._left_margin = min(60, max(0, self._left_margin + change_left))
        self._right_margin = min(120, max(0, self._right_margin + change_right))
        return (self._left_margin, self._right_margin)

    def change_italics(self, increment):
        """Add 'increment' to the italics counter and return the new value"""
        self._italics_depth = self._italics_depth + increment
        return self._italics_depth

    def change_underline(self, increment):
        """Add 'increment' to the underline counter and return the new value"""
        self._underline_depth = self._underline_depth + increment
        return self._underline_depth

    def change_strike(self, increment):
        """Add 'increment' to the strike counter and return the new value"""
        self._strike_depth = self._strike_depth + increment
        return self._strike_depth

    def push_alignment(self, newvalue):
        """Push a new alignment value.  Return true if value has changed"""
        assert 0 <= newvalue <= 3, \
               "Alignment value must be >=0 and <=3 but is %d" % newvalue
        self._alignment.append(newvalue)
        return self._alignment[-1] != self._alignment[-2]

    def pop_alignment(self, value):
        """Pop some alignment value.  Return true if value has changed.

        'value' must match the top of the stack unless it is None."""
        top = self._alignment[-1]
        assert value is None or top == value, \
               "Trying to pop alignment %s but is %s" % (value, top)
        changed = top != self._alignment[-2]
        del self._alignment[-1]
        return changed

    def get_alignment(self):
        """Return the current alignment."""
        assert self._alignment != [], "Alignment stack must not be empty"
        return self._alignment[-1]

    def push_forecolor(self, value):
        """Push a new forecolor value.  Return true if value has changed"""
        self._forecolor.append(value)
        return self._forecolor[-1] != self._forecolor[-2]

    def pop_forecolor(self, value):
        """Pop some forecolor value.  Return true if value has changed.

        'value' must match the top of the stack unless it is None; the
        None case mirrors pop_alignment() and is what
        TextDocBuilder.unset_forecolor() documents."""
        top = self._forecolor[-1]
        assert value is None or top == value, \
               "Trying to pop forecolor %s but is %s" % (value, top)
        changed = top != self._forecolor[-2]
        del self._forecolor[-1]
        return changed

    def get_forecolor(self):
        """Return the current forecolor."""
        assert self._forecolor != [], "Forecolor stack must not be empty"
        return self._forecolor[-1]

    def push(self, tag):
        """Push the style for 'tag' onto the stack.  Return True if the
        style was actually changed by this."""
        tag = tag.lower()
        assert tag in self._tags, "Unknown style code %s" % tag
        self._stack.append(tag)
        assert len(self._stack) >= 2
        return self._tags[self._stack[-1]] != self._tags[self._stack[-2]]

    def pop(self, tag):
        """Pop a style from the stack and verify that it is 'tag'.
        Return true if the style now changed (i.e. new top of stack
        is different from the popped item)."""
        tag = tag.lower()
        assert tag in self._tags, "Unknown style code %s" % tag
        assert len(self._stack) >= 2, "Trying to pop from empty stack"
        top = self._stack[-1]
        changed = self._tags[top] != self._tags[self._stack[-2]]
        assert top == tag, "Expected TOS not found"
        del self._stack[-1]
        assert self._stack != [], "Stack must not be empty"
        return changed

    def get_style(self):
        """Return the numeric style code as used by Plucker for the
        current state."""
        assert self._stack != [], "Stack must not be empty"
        return self._tags[self._stack[-1]]


def _positive_option(options, key, default):
    """Return options[key] if it is present and greater than zero,
    otherwise 'default'."""
    value = options.get(key)
    if value is not None and value > 0:
        return value
    return default


class TextDocBuilder:
    """Encapsulate the knowledge of when to change styles, add paragraphs,
    etc."""

    def __init__(self, url, config, **keyword_args):
        """Start an empty document for 'url'.

        'config' may be None.  The keyword arguments 'max_paragraph_size'
        and 'max_paragraph_size_anchor_stretch' override the module
        defaults when they are greater than zero."""
        self._doc = PluckerDocs.PluckerTextDocument(url)
        self._config = config
        self._attributes = AttributeStack()
        self._paragraph = PluckerDocs.PluckerTextParagraph()
        self._is_new_paragraph = 1
        self._is_new_line = 1
        self._approximate_size = 0
        self._anchor_dict = None
        self._max_para_size = _positive_option(
            keyword_args, "max_paragraph_size", Max_Paragraph_Size)
        self._max_para_size_stretch = _positive_option(
            keyword_args, "max_paragraph_size_anchor_stretch",
            Max_Paragraph_Size_Anchor_Stretch)
        # If the document sets no color of its own, it will be drawn in the
        # device's default text color (which may not be black if the user has
        # hacked it with Kroma or a similar utility) until the first color
        # change or new paragraph, and only then go to black.  This makes
        # sure the document starts off in black.
        if config is not None:
            self._color_paragraphs = config.get_bool("color_paragraphs")
        else:
            # No configuration at all (close() allows for that as well).
            self._color_paragraphs = 0
        if self._color_paragraphs:
            self._paragraph.add_set_forecolor(self._attributes.get_forecolor())

    def _within_anchor(self):
        """Return true while an anchor is open."""
        return self._anchor_dict is not None

    def set_charset(self, charset):
        """Record the document's charset, given by name."""
        self._doc.set_charset(charset_name_to_mibenum(charset))

    def set_id_tag(self, tag):
        """Register 'tag' with the document."""
        self._doc.register_doc(tag)

    def get_doc(self):
        """Finish up and get the PluckerTextDocument that we built"""
        self.close()
        return self._doc

    def close(self):
        """Finish off: flush a pending paragraph and, if the document has
        no charset yet, try to supply a default one."""
        if not self._is_new_paragraph:
            self._doc.add_paragraph(self._paragraph)
            self._paragraph = PluckerDocs.PluckerTextParagraph()
            self._is_new_paragraph = 1
        if self._doc.get_charset():
            return
        # See if we can supply a default charset.
        url = self._doc.get_url()
        if self._config:
            userspec = self._config.get_int('default_charset', 0)
        else:
            userspec = None
        locale_default = charset_name_to_mibenum(DEFAULT_LOCALE_CHARSET_ENCODING)
        if userspec:
            # The userspec takes precedence.
            self._doc.set_charset(userspec)
        elif url[:5].lower() == 'http:' or url[:6].lower() == 'https:':
            # OK, so we have no idea.  Use the HTTP default of
            # ISO-8859-1 (4) for http: URLs ...
            self._doc.set_charset(4)
        elif locale_default:
            # ... and the environment default (if any) for others.
            self._doc.set_charset(locale_default)

    def add_name(self, name):
        """Give name to the current paragraph"""
        self._paragraph.add_name(name)

    def indent(self, change_left, change_right):
        """Shift the margins and record the new values in the paragraph."""
        (left, right) = self._attributes.indent(change_left, change_right)
        self._paragraph.add_set_margin(left, right)

    def start_italics(self):
        """Change to italics if not already so"""
        if self._attributes.change_italics(1) <= 1:
            # Only the outermost request actually switches italics on.
            self._paragraph.add_italics_start()

    def end_italics(self):
        """Change to italics off if this is the last end_italics to come"""
        if self._attributes.change_italics(-1) < 1:
            # No cascaded request is left open.
            self._paragraph.add_italics_end()

    def start_underline(self):
        """Change to underlining text if not already so"""
        if self._attributes.change_underline(1) <= 1:
            self._paragraph.add_underline_start()

    def end_underline(self):
        """Change to underline off if this is the last end_underline to come"""
        if self._attributes.change_underline(-1) < 1:
            self._paragraph.add_underline_end()

    def start_strike(self):
        """Change to strikethrough if not already so"""
        if self._attributes.change_strike(1) <= 1:
            self._paragraph.add_strike_start()

    def end_strike(self):
        """Change to strikethrough off if this is the last end_strike to come"""
        if self._attributes.change_strike(-1) < 1:
            self._paragraph.add_strike_end()

    def set_style(self, style):
        """Set current style to 'style', where style is "b", "h1", "h2", ..."""
        if self._attributes.push(style):
            # style has changed
            self._add_style_change()

    def unset_style(self, style):
        """Un-set a style change by a previous 'set_style'.
        Make sure it previously set 'style'."""
        if self._attributes.pop(style):
            # style has changed
            self._add_style_change()

    def _add_style_change(self):
        """Add info about a new style to take effect."""
        self._paragraph.add_style_change(self._attributes.get_style())

    def get_alignment(self):
        """Get current alignment (a value from 0 to 3)"""
        return self._attributes.get_alignment()

    def set_alignment(self, value):
        """Set current alignment to 'value', where value = 0, 1, 2, 3"""
        if self._attributes.push_alignment(value):
            # alignment has changed
            self._add_alignment_change()

    def unset_alignment(self, value):
        """Un-set an alignment change by a previous 'set_alignment'.
        Make sure it previously set 'value' (unless 'value' is None)."""
        if self._attributes.pop_alignment(value):
            # alignment has changed
            self._add_alignment_change()

    def get_forecolor(self):
        """Get current forecolor value (an RRGGBB hex string)."""
        return self._attributes.get_forecolor()

    def set_forecolor(self, value):
        """Set current forecolor to 'value' (a color name or hex value)."""
        rgb = _parse_color(value)
        if not rgb:
            return
        # Don't want any white text on the PDA's white form since it would
        # be invisible, so if white, just darken it a bit to silver.
        if int(rgb, 16) == 0xFFFFFF:
            rgb = "C0C0C0"
        if self._attributes.push_forecolor(rgb):
            # forecolor has changed
            self._add_forecolor_change()

    def unset_forecolor(self, value):
        """Un-set a forecolor change by a previous 'set_forecolor'.
        Make sure it previously set 'value' (unless 'value' is None).

        Note that the stack holds the parsed RRGGBB string that
        set_forecolor() pushed, so that is what 'value' is compared with."""
        if self._attributes.pop_forecolor(value):
            # forecolor has changed
            self._add_forecolor_change()

    def _add_alignment_change(self):
        """Add info about a new alignment to take effect."""
        self._paragraph.add_set_alignment(self._attributes.get_alignment())

    def _add_forecolor_change(self):
        """Add info about a new forecolor to take effect."""
        self._paragraph.add_set_forecolor(self._attributes.get_forecolor())

    def _ship_paragraph(self):
        """Finish the current paragraph and start a fresh one"""
        # Finish off the old paragraph: close an open anchor and every
        # active text decoration so that the paragraph is self-contained.
        the_anchor_dict = None
        if self._within_anchor():
            the_anchor_dict = {}
            the_anchor_dict.update(self._anchor_dict)
            self.add_document_link_end()
        attributes = self._attributes
        # change_xxx(0) leaves the counter alone and just reads it.
        if attributes.change_italics(0):
            self._paragraph.add_italics_end()
        if attributes.change_underline(0):
            self._paragraph.add_underline_end()
        if attributes.change_strike(0):
            self._paragraph.add_strike_end()

        # Now start the new paragraph ...
        self._doc.add_paragraph(self._paragraph)
        self._paragraph = PluckerDocs.PluckerTextParagraph()
        self._is_new_paragraph = 1
        self._is_new_line = 1
        self._approximate_size = 0

        # ... and re-establish the state that was in effect.
        if attributes.get_style():
            # we are in non-default style
            self._add_style_change()
        if attributes.get_alignment():
            # we are in non-default alignment
            self._add_alignment_change()
        # NOTE(review): the forecolor is a non-empty string such as "000000",
        # so this test looks always true and the color is emitted for every
        # paragraph whatever 'color_paragraphs' says -- confirm whether a
        # comparison with the default color was intended.
        if attributes.get_forecolor() or self._color_paragraphs:
            self._paragraph.add_set_forecolor(attributes.get_forecolor())
        (left, right) = attributes.indent()
        message(4, "-- New paragraph: margins %d, %d", left, right)
        if left != 0 or right != 0:
            self._paragraph.add_set_margin(left, right)
        if attributes.change_italics(0):
            self._paragraph.add_italics_start()
        if attributes.change_underline(0):
            self._paragraph.add_underline_start()
        if attributes.change_strike(0):
            self._paragraph.add_strike_start()

        # re-start the link if there was one
        if the_anchor_dict is not None:
            self.add_document_link_start(the_anchor_dict)

    def add_vspace(self, n_units=2, additional=0):
        """Make the representation to have a new line.

        Add a new paragraph, unless this one is already new and has no
        extra spacing.  'n_units' is capped at 7; if 'additional' is true
        the space is added on top of what is already there."""
        n_units = min(n_units, 7)
        if n_units == 0 and not self._is_new_paragraph:
            if not self._is_new_line:
                # special case: a plain line break is enough
                self._paragraph.add_newline()
                self._is_new_line = 1
                return
            if not additional:
                # is already newline and we don't want additional vspace
                return
            # We are on a new line, so we need to add 4 units to get an
            # additional new line.
            n_units = 4

        if not self._is_new_paragraph:
            self._ship_paragraph()
            self._paragraph.set_extra_spacing(n_units)
            return

        if n_units == 0 and additional:
            n_units = 4
        old_spacing = self._paragraph.get_extra_spacing()
        if not additional and old_spacing >= n_units:
            # already enough space
            return
        if additional:
            new_spacing = old_spacing + n_units
        else:
            new_spacing = max(old_spacing, n_units)
        # A single paragraph is given at most 7 units here; larger gaps
        # are built from filler paragraphs.
        while new_spacing > 7:
            self._paragraph.set_extra_spacing(7)
            new_spacing = new_spacing - 7
            self.add_text(" ")
            self._ship_paragraph()
        self._paragraph.set_extra_spacing(new_spacing)

    def _find_text_split(self, line, size):
        """Split line so that the first part is approx. size bytes long.

        Return (first_part, rest).  The split is made at the first
        whitespace at or after position 'size' and that whitespace is
        dropped.  If there is no space to split at, the whole line is
        returned as the first part.

        Earlier revisions glued the two neighbouring words together when
        position 'size' itself was whitespace."""
        # XXX Why do we care?  Could we use TextWrapper?
        first = line[:size]
        rest = line[size:]
        if " " not in rest:
            # No decent split found: just don't split it...
            return (line, "")
        words = rest.split(None, 1)
        if not words:
            # Nothing but whitespace is left over.
            return (line, "")
        if rest[:1].isspace():
            # The cut already falls between two words.
            return (first, rest.lstrip())
        if len(words) > 1:
            return (first + words[0], words[1])
        return (first + words[0], "")

    def add_text(self, text):
        """Add some text, maybe even many lines."""
        lines = text.split("\n")
        last_index = len(lines) - 1
        for index in range(len(lines)):
            line = lines[index]
            while 1:
                if self._within_anchor():
                    # Allow some more in order not to break an anchor.
                    max_size = self._max_para_size + self._max_para_size_stretch
                else:
                    max_size = self._max_para_size
                if self._approximate_size + len(line) < max_size:
                    break
                # The line does not fit: fill up this paragraph and ship it.
                rest_size = max(0, self._max_para_size - self._approximate_size)
                (first, line) = self._find_text_split(line, rest_size)
                if first:
                    self._paragraph.add_text(first)
                self._approximate_size = self._approximate_size + len(first)
                self._is_new_paragraph = 0
                self._is_new_line = 0
                self._ship_paragraph()
                if not line:
                    break
            if line:
                self._paragraph.add_text(line)
                self._approximate_size = self._approximate_size + len(line)
                self._is_new_paragraph = 0
                self._is_new_line = 0
            if index != last_index:
                # add the newline that was left out
                self.add_vspace(n_units=0, additional=1)

    def add_unicode_char(self, char_code, text_alternative):
        """Add a Unicode character, along with a non-Unicode text alternative."""
        self._paragraph.add_unicode_char(char_code, text_alternative)
        self._is_new_line = 0
        self._is_new_paragraph = 0
        self._approximate_size = self._approximate_size + 7 + len(text_alternative)

    def add_image(self, attributes):
        """Add an image reference"""
        self._is_new_paragraph = 0
        self._is_new_line = 0
        self._paragraph.add_image_reference(attributes)

    def add_table(self, dict_of_items):
        """Add a table"""
        self._is_new_paragraph = 0
        self._is_new_line = 0
        self._paragraph.add_table(dict_of_items)

    def add_document_link_start(self, dict_of_items):
        """Add an anchor start (ignored while another anchor is open)"""
        if self._within_anchor():
            return
        self._is_new_paragraph = 0
        self._is_new_line = 0
        self._paragraph.add_anchor_start(dict_of_items)
        self._anchor_dict = dict_of_items

    def add_document_link_end(self):
        """Add an anchor end (ignored if no anchor is open)"""
        if not self._within_anchor():
            return
        self._is_new_paragraph = 0
        self._is_new_line = 0
        self._paragraph.add_anchor_end()
        self._anchor_dict = None

    def add_hr(self, height=0, width=0, perc_width=0):
        """Add a hr"""
        self._is_new_paragraph = 0
        self._is_new_line = 0
        self._paragraph.add_hr(height, width, perc_width)


class PlainTextParser:
    """Parsing a simple Text"""

    def __init__(self, url, text, headers, config, attribs):
        """Build the document for 'text' straight away.

        The charset is taken from headers["charset"] if present, else
        from attribs["charset"] if present."""
        text = _clean_newlines(text)
        # This we use to build the document
        self._doc = TextDocBuilder(url, config)
        if "charset" in headers:
            self._doc.set_charset(headers["charset"])
        elif "charset" in attribs:
            self._doc.set_charset(attribs["charset"])
        self._url = url
        self._text = text
        # In these two lists we store tuples of (url, attributes) for
        # encountered anchors and image references.
        # Currently we don't even search for these...
        self._anchors = []
        self._images = []
        self._doc.add_text(text)
        self._doc.close()

    def get_plucker_doc(self):
        """Get the PluckerTextDocument.  Useful after a close()"""
        return self._doc.get_doc()

    def get_anchors(self):
        """Return the list of found anchors"""
        return self._anchors

    def get_images(self):
        """Return the list of found images"""
        return self._images

    def has_unknown(self):
        """Check if during parsing we found unknown things"""
        return 0

    def print_unknown(self, prefix):
        """Print a summary of the unknown things found during parsing"""
        pass

    def get_unknown(self):
        """Get a list unknown things found during parsing."""
        return {}


# the following lists are derived from the HTML 4.01 spec.  Don't change them!
# Actually, LI is not in the spec, but the spec defines it to act very much
# as a block-level element, so we put it in our list for the moment.
HTML_BLOCK_ELEMENTS = ('head', 'body', 'li', 'dl', 'div', 'center', 'dir', 'menu', 'noscript', 'blockquote', 'form', 'hr', 'table', 'fieldset', 'address', 'noframes', 'isindex', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'pre') HTML_OPTIONAL_END_ELEMENTS = ('body', 'colgroup', 'dd', 'dt', 'head', 'html', 'li', 'option', 'p', 'tbody', 'tfoot', 'thead') HTML_TABLE_ELEMENTS = ('td', 'th', 'tr') HTML_FORBIDDEN_END_ELEMENTS = ('area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param') HTML_NO_ID_ELEMENTS = ('base', 'head', 'html', 'meta', 'script', 'style', 'title') class StructuredHTMLParser (sgmllib.SGMLParser): """Parsing correct HTML, and digesting it into a PluckerTextDocument.""" def __init__ (self, url, text, headers = {}, config = None, attribs = {}): sgmllib.SGMLParser.__init__ (self) # Convert all to for XHTML compatability text = string.replace (text, "/>", " />") text = _clean_newlines (text) # This we use to build the document self._doc = TextDocBuilder (url, config, max_paragraph_size=3000) self._url = url self._base = None # use this if defined for relative URLs self._config = config self._attribs = attribs # initialize verbosity... self._verbosity_stack = [] # We use different indicator for diffent depths of ,