############################################################################## # # This software is released under the Zope Public License (ZPL) Version 1.0 # # Copyright (c) Zope Corporation. All rights reserved. # ############################################################################## """A parser for HTML and XHTML.""" # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import re import string # Regular expressions used for parsing interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\Z)') incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*);') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+);') starttagopen = re.compile('<[a-zA-Z]') piopen = re.compile(r'<\?') piclose = re.compile('>') endtagopen = re.compile(']*>') commentopen = re.compile('