#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

"""
Parser.py   $Id: Parser.py,v 1.21 2003/02/18 01:29:38 chrish Exp $

Copyright 1999,2000 by Holger Duerer

Distributable under the GNU General Public License Version 2 or newer.
"""

import sys
import traceback

from PyPlucker import TextParser, ImageParser, PluckerDocs, ConversionParser
from PyPlucker.ConversionParser import WordParser
from UtilFns import message, error


# Maps each item reported by the HTML parser's get_unknown() to the list of
# URLs of the documents in which that item was reported.
unknown_things = {}


def generic_parser(url, headers, data, config, attributes):
    """Parse a retrieved document with the parser matching its content type.

    Arguments:
      url        -- location of the document; converted with str() before use
      headers    -- mapping that must provide a 'content-type' entry
      data       -- document contents, handed unchanged to the chosen parser
      config     -- configuration object, handed unchanged to the parsers
      attributes -- mapping of extra attributes; its 'type' entry replaces
                    the content type when that is 'unknown/unknown'

    Returns the value produced by the branch selected for the content type
    (get_plucker_doc() of the HTML, plain-text or image parser, a
    PluckerDocs.PluckerMailtoDocument, or the result of WordParser), or
    None when the type is not handled or parsing failed.

    Failures are reported through error() instead of being raised.  Only
    KeyboardInterrupt and SystemExit propagate to the caller.
    """
    try:
        url = str(url)  # convert to string if this is still a Url.URL
        content_type = headers['content-type']
        if content_type == 'unknown/unknown' and 'type' in attributes:
            # note that this type is not an HTTP header, and may not
            # contain parameters
            content_type = attributes['type']

        if content_type == "text/html":
            parser = TextParser.StructuredHTMLParser(url, data, headers,
                                                     config, attributes)
            # Remember which document each unknown item came from.
            for item in parser.get_unknown():
                unknown_things.setdefault(item, []).append(url)
            return parser.get_plucker_doc()

        elif content_type == "text/plain":
            parser = TextParser.PlainTextParser(url, data, headers,
                                                config, attributes)
            return parser.get_plucker_doc()

        elif content_type == "mailto/text":
            # These are easy to handle, the document does it itself, so no
            # parsing needed as we generate the document directly
            return PluckerDocs.PluckerMailtoDocument(url)

        elif content_type.startswith("image/"):
            # this can fail, as some parsers do not recognize all image
            # types...
            parser = ImageParser.get_default_parser(config)
            parsed = parser(url, content_type, data, config, attributes)
            return parsed.get_plucker_doc()

        elif content_type.startswith("application/msword"):
            return WordParser(url, data, headers, config, attributes)

        else:
            message(0, "%s type not yet handled" % content_type)
            return None

    except (KeyboardInterrupt, SystemExit):
        # Never hide a user interrupt or a requested exit behind the
        # catch-all handler below.
        raise
    except RuntimeError:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so that the module parses on both interpreters that
        # expect "except X, e" and those that expect "except X as e".
        error("Runtime error parsing document %s: %s"
              % (url, sys.exc_info()[1]))
        return None
    except AssertionError:
        error("Assertion error parsing document %s: %s"
              % (url, sys.exc_info()[1]))
        return None
    except:
        # Deliberately broad: one broken document must not abort the whole
        # run, so log it together with the traceback and carry on.
        error("Unknown error parsing document %s:" % url)
        traceback.print_exc()
        return None