"""safe_html transform: strip disallowed tags and script-bearing attributes.

The module exposes ``scrubHTML`` (a function that filters an HTML string
through ``StrippingParser``) and ``SafeHTML`` (the PortalTransforms
transform wrapping it).
"""
import logging
from sgmllib import SGMLParser
import re
from cgi import escape

from Products.PortalTransforms.interfaces import itransform
from Products.PortalTransforms.utils import log
from Products.CMFDefault.utils import bodyfinder
from Products.CMFDefault.utils import IllegalHTML
from Products.CMFDefault.utils import SimpleHTMLParser
from Products.CMFDefault.utils import VALID_TAGS
from Products.CMFDefault.utils import NASTY_TAGS
from Products.PortalTransforms.utils import safeToInt

# tag mapping: tag -> short or long tag
# Work on private copies so the additions below do not leak into CMFDefault.
VALID_TAGS = VALID_TAGS.copy()
NASTY_TAGS = NASTY_TAGS.copy()

# add some tags to allowed types. This should be fixed in CMFDefault
VALID_TAGS['ins'] = 1
VALID_TAGS['del'] = 1
VALID_TAGS['q'] = 1
VALID_TAGS['map'] = 1
VALID_TAGS['area'] = 1

# Template for the message stored as the transform result when scrubbing
# fails: first slot is the message title, second the error text.
msg_pat = """

System message: %s

%s """

# Optional (lazy) whitespace allowed in front of every character of a
# keyword, so that e.g. "java s c r i p t :" is still recognised.
_GAP = r"[\s\n]*?"


def _spaced(word):
    """Return a regex fragment matching *word* with `_GAP` before each letter."""
    return ''.join([_GAP + letter for letter in word])


# Matches "script:" or "expression", each possibly interleaved with
# whitespace, case-insensitively.  Compiled once instead of on every call.
_JAVASCRIPT_PATTERN = re.compile(
    '(' + _spaced('script') + _GAP + ':)|(' + _spaced('expression') + ')',
    re.DOTALL | re.IGNORECASE)

# Numeric character reference: "&#" followed by an optional decimal or
# x-prefixed hexadecimal value and an optional ";".
_ENTITY_PATTERN = re.compile(r"&#(?P<htmlentity>x?\w+)?;?")


def hasScript(s):
    """ Dig out evil Java/VB script inside an HTML attribute

    Numeric character references in *s* are decoded first.  Returns the
    list produced by ``findall`` for "script:" / "expression"; the list is
    empty (false) when nothing suspicious was found.
    """
    return _JAVASCRIPT_PATTERN.findall(decode_htmlentities(s))


def decode_htmlentities(s):
    """ XSS code can be hidden with htmlentities

    Return *s* with every numeric character reference replaced by the
    result of ``decode_htmlentity``.
    """
    return _ENTITY_PATTERN.sub(decode_htmlentity, s)


def decode_htmlentity(m):
    """Return the replacement text for one ``_ENTITY_PATTERN`` match.

    The reference is decoded with ``chr``.  If the value cannot be
    converted, the bare value (without "&#" and ";") is returned, and a
    "&#" that carries no value at all is returned unchanged.
    """
    entity_value = m.group('htmlentity')
    if entity_value is None:
        # The value group is optional in the pattern; a lone "&#" has
        # nothing to decode.
        return m.group(0)
    try:
        if entity_value.lower().startswith('x'):
            # Prefix "0" so that "x41" becomes the hex literal "0x41".
            code = int('0' + entity_value, 16)
        else:
            code = int(entity_value)
        return chr(code)
    except (ValueError, OverflowError):
        # Not a number, or a code point chr() cannot represent.
        return entity_value


class StrippingParser(SGMLParser):
    """Pass only allowed tags; raise exception for known-bad.

    Copied from Products.CMFDefault.utils
    Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
    """

    from htmlentitydefs import entitydefs  # replace entitydefs from sgmllib

    def __init__(self, valid, nasty, remove_javascript, raise_error):
        """Set up the filter.

        valid -- mapping of allowed tag names; a true integer value marks
                 a tag that has a closing part.
        nasty -- mapping of tags that are dropped including their content.
        remove_javascript -- when true, drop "on*" attributes and
                 attributes whose value looks like script.
        raise_error -- when true, raise IllegalHTML instead of silently
                 dropping offending markup.
        """
        SGMLParser.__init__(self)
        self.result = []
        self.valid = valid
        self.nasty = nasty
        self.remove_javascript = remove_javascript
        self.raise_error = raise_error
        # True while inside a nasty tag whose content is being discarded.
        self.suppress = False

    def handle_data(self, data):
        """Append escaped character data unless output is suppressed."""
        if self.suppress:
            return
        if data:
            self.result.append(escape(data))

    def handle_charref(self, name):
        """Re-emit a numeric character reference unchanged."""
        if self.suppress:
            return
        self.result.append('&#%s;' % name)

    def handle_comment(self, comment):
        """Drop comments."""
        pass

    def handle_decl(self, data):
        """Drop declarations."""
        pass

    def handle_entityref(self, name):
        """Re-emit an entity reference, terminating known entities with ';'."""
        if self.suppress:
            return
        if name in self.entitydefs:
            terminator = ';'
        else:
            # this breaks unstandard entities that end with ';'
            terminator = ''
        self.result.append('&%s%s' % (name, terminator))

    def unknown_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones.

        Valid tags are written out with their attributes; script-bearing
        attributes are dropped, or raise IllegalHTML when ``raise_error``
        is set.  Nasty tags switch on suppression of everything up to the
        matching end tag.  Any other tag is omitted.
        """
        if self.suppress:
            return

        if tag in self.valid:
            self.result.append('<' + tag)
            remove_script = getattr(self, 'remove_javascript', True)

            for k, v in attrs:
                if remove_script and k.strip().lower().startswith('on'):
                    if self.raise_error:
                        raise IllegalHTML(
                            'Script event "%s" not allowed.' % k)
                    continue
                if remove_script and hasScript(v):
                    if self.raise_error:
                        raise IllegalHTML(
                            'Script URI "%s" not allowed.' % v)
                    continue
                # The value is written between double quotes, so a literal
                # double quote inside it must not be emitted verbatim:
                # it would end the value and allow attribute injection.
                self.result.append(
                    ' %s="%s"' % (k, v.replace('"', '&quot;')))

            if safeToInt(self.valid.get(tag)):
                self.result.append('>')
            else:
                # Empty element (no closing part).
                self.result.append(' />')

        elif tag in self.nasty:
            self.suppress = True
            if self.raise_error:
                raise IllegalHTML('Dynamic tag "%s" not allowed.' % tag)
        # Any other tag is omitted; its content still passes through.

    def unknown_endtag(self, tag):
        """Close a valid tag, or end suppression after a nasty one."""
        if tag in self.nasty and tag not in self.valid:
            self.suppress = False
        if self.suppress:
            return
        if safeToInt(self.valid.get(tag)):
            self.result.append('</%s>' % tag)

    def getResult(self):
        """Return the filtered markup collected so far as one string."""
        return ''.join(self.result)


def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
              remove_javascript=True, raise_error=True):
    """ Strip illegal HTML tags from string text.

    Feeds *html* through a ``StrippingParser`` configured with the given
    arguments and returns the filtered string.  With ``raise_error`` set,
    IllegalHTML is raised for nasty tags and script attributes.
    """
    parser = StrippingParser(valid=valid, nasty=nasty,
                             remove_javascript=remove_javascript,
                             raise_error=raise_error)
    parser.feed(html)
    parser.close()
    return parser.getResult()


class SafeHTML:
    """Simple transform which uses CMFDefault functions to
    clean potentially bad tags.

    Tags must be explicitly allowed in valid_tags to pass. Only
    the tags themselves are removed, not their contents. If tags
    are removed and in nasty_tags, they are removed with
    all of their contents.

    Objects will not be transformed again with changed settings.
    You need to clear the cache by e.g.
    1.) restarting your zope or
    2.) empty the zodb-cache via ZMI -> Control_Panel
        -> Database Management -> main || other_used_database
        -> Flush Cache.
    """

    __implements__ = itransform

    __name__ = "safe_html"
    inputs = ('text/html',)
    output = "text/x-html-safe"

    def __init__(self, name=None, **kwargs):
        """Build the default configuration, then apply *kwargs* overrides.

        name -- optional replacement for the transform's ``__name__``.
        """
        self.config = {
            'inputs': self.inputs,
            'output': self.output,
            'valid_tags': VALID_TAGS,
            'nasty_tags': NASTY_TAGS,
            'remove_javascript': 1,
            'disable_transform': 0,
            }

        self.config_metadata = {
            'inputs': ('list',
                       'Inputs',
                       'Input(s) MIME type. Change with care.'),
            'valid_tags': ('dict',
                           'valid_tags',
                           'List of valid html-tags, value is 1 if they ' +
                           'have a closing part (e.g. <p>...</p>) and 0 for empty ' +
                           'tags (like <br />). Be carefull!',
                           ('tag', 'value')),
            'nasty_tags': ('dict',
                           'nasty_tags',
                           'Dynamic Tags that are striped with ' +
                           'everything they contain (like applet, object). ' +
                           'They are only deleted if they are not marked as valid_tags.',
                           ('tag', 'value')),
            # NOTE(review): the example after "(e.g." looks truncated;
            # restore the intended wording if it is known.
            'remove_javascript': ("int",
                                  'remove_javascript',
                                  '1 to remove javascript attributes that begin with on (e.g. onClick) ' +
                                  'and attributes where the value starts with "javascript:" ' +
                                  '(e.g. tags. 0 to leave the attributes.'),
            'disable_transform': ("int",
                                  'disable_transform',
                                  'If 1, nothing is done.'),
            }

        self.config.update(kwargs)

        if name:
            self.__name__ = name

    def name(self):
        """Return the transform's name."""
        return self.__name__

    def __getattr__(self, attr):
        """Serve ``inputs`` and ``output`` from the configuration."""
        if attr == 'inputs':
            return self.config['inputs']
        if attr == 'output':
            return self.config['output']
        raise AttributeError(attr)

    def convert(self, orig, data, **kwargs):
        """Store the scrubbed version of *orig* in *data* and return *data*.

        When ``disable_transform`` is set in the configuration, *orig* is
        stored unchanged.  If scrubbing raises IllegalHTML, an error
        message built from ``msg_pat`` is stored instead.
        """
        # note if we need an upgrade.
        if 'disable_transform' not in self.config:
            # NOTE(review): confirm the argument order expected by
            # Products.PortalTransforms.utils.log.
            log(logging.ERROR,
                'PortalTransforms safe_html transform needs to be '
                'updated. Please re-install the PortalTransforms product to fix.')

        # if we have a config that we don't want to delete
        # we need a disable option
        if self.config.get('disable_transform'):
            data.setData(orig)
            return data

        try:
            safe = scrubHTML(
                bodyfinder(orig),
                valid=self.config.get('valid_tags', {}),
                nasty=self.config.get('nasty_tags', {}),
                remove_javascript=self.config.get('remove_javascript', True),
                raise_error=False)
        except IllegalHTML as inst:
            data.setData(msg_pat % ("Error", str(inst)))
        else:
            data.setData(safe)
        return data


def register():
    """Return a new SafeHTML transform instance."""
    return SafeHTML()