#!/usr/bin/env python """ Retriever.py $Id: Retriever.py,v 1.28 2003/12/17 03:46:01 jimj Exp $ Retrieve data identified by some URL from the appropiate location. Copyright 1999, 2000 by Holger Duerer Distributable under the GNU General Public License Version 2 or newer. """ import os, sys import string import re import urllib import types ## The following section tries to get the PyPlucker directory onto the ## system path if called as a script and if it is not yet there: try: import PyPlucker except ImportError: file = sys.argv[0] while os.path.islink (file): file = os.readlink (file) sys.path = [os.path.split (os.path.dirname (file))[0]] + sys.path try: import PyPlucker except ImportError: print "Cannot find where module PyPlucker is located!" sys.exit (1) # and forget the temp names... del file del PyPlucker ## ## Now PyPlucker things should generally be importable ## try: import gzip import StringIO _have_gzip = 1 except: _have_gzip = 0 from PyPlucker import Url, __version__ from UtilFns import error, message def GuessType (name): """Given a name, guess the mime type""" name = string.lower (name) def has_extension (ext, name=name): return name[-len(ext):] == ext known_map = { '.gif': 'image/gif', '.png': 'image/png', '.jpg': 'image/jpeg', '.jpe': 'image/jpeg', '.jpeg': 'image/jpeg', '.html': 'text/html', '.htm': 'text/html', '.txt': 'text/plain', '.asc': 'text/plain', } for ext in known_map.keys (): if has_extension (ext): return known_map[ext] return 'unknown/unknown' class PluckerFancyOpener (urllib.FancyURLopener): """A subclass of urllib.FancyURLopener, so we can remember an error code and the error text.""" def __init__(self, alias_list=None, config=None, *args): apply(urllib.FancyURLopener.__init__, (self,) + args) self._alias_list = alias_list self.remove_header ('User-agent') user_agent = (config and config.get_string('user_agent', None)) or 'Plucker/Py-%s' % __version__ self.addheader ('User-agent', user_agent) referrer = config and config.get_string('referrer', None) if referrer: self.addheader('Referer', referrer) self.addheader ('Accept', 'image/jpeg, image/gif, image/png, text/html, text/plain, text/xhtml;q=0.8, text/xml;q=0.6, text/*;q=0.4') if os.environ.has_key ('HTTP_PROXY') and (os.environ.has_key ('HTTP_PROXY_USER') and os.environ.has_key ('HTTP_PROXY_PASS')): import base64 self.addheader ('Proxy-Authorization', 'Basic %s' % string.strip(base64.encodestring("%s:%s" % (os.environ['HTTP_PROXY_USER'], os.environ['HTTP_PROXY_PASS'])))) #for header in self.addheaders: message(0, "%s", header) def remove_header (self, header): """Remove the header information 'header' if on the header list. Return if found on list. """ for i in range (len (self.addheaders)): if self.addheaders[i][0] == header: del self.addheaders[i] return 1 return 0 # Do not do this - addinfourl raised an exception on # None, because the default had already closed the file. # # Default error handling -- don't raise an exception, but remember the code # #def http_error_default(self, url, fp, errcode, errmsg, headers): # res = urllib.addinfourl(fp, headers, "http:" + url) # res.retcode = errcode # res.retmessage = errmsg # return res # Do not do this - urllib now handles redirection #def http_error_302(self, url, fp, errcode, errmsg, headers, # data=None): # # XXX The server can force infinite recursion here! # if self._alias_list: # if headers.has_key('location'): # newurl = headers['location'] # elif headers.has_key('uri'): # newurl = headers['uri'] # else: # return # old_url = Url.URL ('http:'+url) # new_url = Url.URL (newurl, old_url) # self._alias_list.add (old_url, new_url) # if headers.has_key('location'): # newurl = headers['location'] # elif headers.has_key('uri'): # newurl = headers['uri'] # return urllib.FancyURLopener.http_error_302 (self, url, fp, errcode, errmsg, headers, data) # #http_error_301 = http_error_302 #http_error_303 = http_error_302 def parse_http_header_value(headerval): mval = None parameters = [] parts = string.split (headerval, ";") if parts: mval = string.lower(parts[0]) for part0 in parts[1:]: part = string.strip(string.lower(part0)) m = re.match ('([-a-z0-9]+)=(.*)', part) if m: parameters.append(m.groups()) return mval, parameters class SimpleRetriever: """A very simple retriver. Not much of error checking, no persistent caching. Just a wrapper around urllib.""" def __init__ (self, pluckerdir, pluckerhome, configuration=None): self._plucker_dir = os.path.expanduser( os.path.expandvars (pluckerdir)) self._plucker_home = os.path.expanduser( os.path.expandvars (pluckerhome)) self._cache = {} self._configuration = configuration # without this, windows and no proxy was very slow self._urlopener = PluckerFancyOpener (config=self._configuration) def _retrieve_plucker (self, url, alias_list): path = url.get_path () if path[0] != '/': raise RuntimeError("plucker: URL must give absolute path! (%s)" % path) filename1 = os.path.join (self._plucker_dir, path[1:]) filename2 = os.path.join (self._plucker_home, path[1:]) if os.path.exists (filename1): filename = filename1 elif os.path.exists (filename2): filename = filename2 else: return ({'URL': url, 'error code': 404, 'error text': "File not found"}, None) try: file = open (filename, "rb") contents = file.read () file.close () except IOError, text: return ({'URL': url, 'error code': 404, 'error text': text}, None) return ({'URL': url, 'error code': 0, 'error text': "OK", 'content-type': GuessType (filename), 'content-length': len (contents)}, contents) def _retrieve (self, url, alias_list, post_data): """Really retrieve the url.""" if url.get_protocol () == 'plucker': return self._retrieve_plucker (url, alias_list) elif url.get_protocol () == 'mailto': # Nothing to fetch really... return ({'URL': url, 'error code': 0, 'error text': "OK", 'content-type': "mailto/text", 'content-length': 0}, "") else: # not a plucker:... URL try: real_url = str (url) webdoc = self._urlopener.open (real_url, post_data) if hasattr (webdoc, 'retcode'): headers_dict = {'URL': real_url, 'error code': webdoc.retcode, 'error text': webdoc.retmessage} doc_info = webdoc.info () if doc_info is not None: # This should always be a dict, but some people found None... :-( headers_dict.update (doc_info.dict) return (headers_dict, None) if hasattr (webdoc, 'url'): ####################################################################### # Redhat 7.x default Python installation will return # # webdoc.url without a protocol at the beginning # # (e.g. ://www.xyz.com instead of http://www.xyz.com). # # This is due to a bug in RH's /usr/lib/python1.5/urllib.py. # # -joefefifo@yahoo.com # ####################################################################### ################################################ # On Windows we wan't use # # URL(url).get_protocol to get the protokoll # # urllib.splittype(url) and all other url # # manipuling funktions are too buggy # ################################################ if sys.platform == 'win32': from PyPlucker.Url import URL webdoc_protocol = URL(webdoc.url).get_protocol else: (webdoc_protocol, webdoc_rest_of_url) = urllib.splittype(webdoc.url) # check to see we have a valid URL; if not, use one we started with if webdoc_protocol: real_url = webdoc.url headers_dict = {'URL': real_url} doc_info = webdoc.info () message(3, "doc_info is %s", doc_info); if doc_info is not None: # This should always be a dict, but some people found None... :-( headers_dict.update (doc_info.dict) if not headers_dict.has_key ('content-type'): message (1, "Guessing type for %s" % url.get_path ()) headers_dict['content-type'] = GuessType (url.get_path ()) else: ctype, parameters = parse_http_header_value(headers_dict['content-type']) headers_dict['content-type'] = ctype for parm in parameters: headers_dict[parm[0]] = parm[1] message(3, "headers_dict is %s", headers_dict); # Now get the contents contents = webdoc.read () # Check if encoded contents... if headers_dict.has_key ('content-encoding'): encoding = headers_dict['content-encoding'] if encoding == 'gzip' and _have_gzip: s = StringIO.StringIO (contents) g = gzip.GzipFile (fileobj=s) c = g.read () g.close () contents = c else: return ({'URL': real_url, 'error code': 404, 'error text': "Unhandled content-encoding '%s'" % encoding}, None) except IOError, text: return ({'URL': real_url, 'error code': 404, 'error text': text}, None) except OSError, text: return ({'URL': real_url, 'error code': 404, 'error text': text}, None) headers_dict['error code'] = 0 headers_dict['error text'] = "OK" return (headers_dict, contents) def retrieve (self, url, alias_list, post_data): """Fetch some data. Return a tuple (headers_dict, data)""" if not isinstance (url, Url.URL): url = str (url) # convert to string, if not yet so url = Url.URL (Url.CleanURL (url)) data_key = (str (url), post_data) if self._cache.has_key (data_key): # has been retrieved before, we just return the cached data return self._cache[data_key] else: result = self._retrieve (url, alias_list, post_data) self._cache[data_key] = result newurl = getattr(result, 'URL', url).as_string(with_fragment=None) alias_list.add(url,newurl) return result if __name__ == '__main__': # called as a script import sys retriever = SimpleRetriever ("~/.plucker", "~/.plucker") for name in sys.argv[1:]: print "\n\nFetching %s" % name (header, data) = retriever.retrieve (name, None, None) items = header.keys () items.sort () print "Headers:" for item in items: print " %s:\t%s" % (item, header[item]) print "Data:" text = repr (data)[1:-1] if len (text) > 80: text = text[:60] + " ... " + text[-10:] print " " + text