#!/usr/bin/env python """ Url.py $Id: Url.py,v 1.20 2003/12/17 02:28:47 jimj Exp $ Utility class to encapsulate information about an URL and the useful operations thereon. Copyright 1999, 2000 by Holger Duerer Distributable under the GNU General Public License Version 2 or newer. """ import urlparse, urllib, string, sys, os urlparse.uses_relative.append ('plucker') #urlparse.uses_netloc.append ('plucker') #urlparse.uses_params.append ('plucker') #urlparse.uses_query.append ('plucker') urlparse.uses_fragment.append ('plucker') ###################################################################### # Replacement for the urlparse lib, because this is buggy on Windows # ###################################################################### def windows_file_url_parse (url): prot='file' fragment='' i = string.rfind(url, '#') if i >= 0: fragment = url[i+1:] url = url[:i] path=url if string.lower(path[0:7]) == 'file://': path=path[7:] if string.lower(path[0:5]) == 'file:': path=path[5:] if ((string.upper(path[0:1]) >= 'A') and (string.upper(path[0:1]) <= 'Z')) and (path[1:2] == ':'): path = string.upper(path[0:1]) + path[1:] host='' params='' query='' return prot, host, path, params, query, fragment ###################################################################### # Replacement for the urlparse lib, because this is buggy on Windows # ###################################################################### def windows_file_urljoin(base, url): def add_fragment(path, frag): if frag != '': res = path + '#' + frag else: res = path return res i = string.find(url, ':') # a new http:// file:// not based to source is _not_ used if (i < 3) or (i > 10): (prot, host, path, params, query, fragment) = windows_file_url_parse (url) if path != '': ###################################### # FIX ME!!!! # # path like .\test\..\images\ # # are not work yet! # ###################################### # .\file.ext == file.ext if (path[0:2] == '.\\') or (path[0:2] == './'): path = path[2:] url = os.path.join (os.path.dirname(str (base)), add_fragment(path, fragment)) return url # one dir up if (path[0:3] == '..\\') or (path[0:3] == '../'): path = path[3:] url = os.path.join (os.path.dirname(os.path.dirname(str (base))), add_fragment(path, fragment)) return url # two dir up if (path[0:4] == '...\\') or (path[0:4] == '.../'): path = path[4:] url = os.path.join (os.path.dirname(os.path.dirname(os.path.dirname(str (base)))), add_fragment(path, fragment)) return url # Root dir if (path[0:1] == '\\') or (path[0:1] == '/'): path = path[1:] str_base = str (base) url = os.path.join ('file:' + str_base[5] + ':' , add_fragment(path, fragment)) return url # normale case else: url = os.path.join (os.path.dirname(str (base)), add_fragment(path, fragment)) return url else: url = base + '#' + fragment return url else: return url return url ###################################################################### # Replacement for the urlparse lib, because this is buggy on Windows # # And its behavior changed in Python 2.2.2 CRH ###################################################################### def plucker_file_urlunparse(protocol, host, path, params, query, fragment): text = '' if protocol != '': text = text + protocol + ':' + path if fragment != '': text = text + '#' + fragment return text class URL: """Encapsulate some useful things from urllib and urlparse""" def __init__ (self, url, base = None): if isinstance (url, URL) and base is None: # Simple copy constructor: make it more efficient self._protocol = url._protocol self._host = url._host self._path = url._path self._params = url._params self._query = url._query self._fragment = url._fragment else: url = str (url) # Sometimes an URL is interrupted by a line break. # Fetching works better if the linebreak is removed. url = string.replace(url, "\r", "") url = string.replace(url, "\n", "") if base is not None: if sys.platform == 'win32' and string.lower(str (base)[0:5]) == 'file:': url = windows_file_urljoin (str (base), url) else: url = urlparse.urljoin (str (base), url) # according to RFC 2396, this 'unquote' is inappropriate # according to the HTML 4.01 spec, this 'unquote' is unnecessary # url = urllib.unquote (url) if sys.platform == 'win32' and string.lower(url[0:5]) == 'file:': (prot, host, path, params, query, fragment) = windows_file_url_parse (url) else: (prot, host, path, params, query, fragment) = urlparse.urlparse (url) host = string.lower (host) self._protocol = prot self._host = host self._path = path self._params = params self._query = query self._fragment = fragment def as_string (self, with_fragment): if with_fragment: fragment = self._fragment else: fragment = "" if self._protocol == 'plucker' or self._protocol == 'file': text = plucker_file_urlunparse (self._protocol, self._host, self._path, self._params, self._query, fragment) else: text = urlparse.urlunparse ((self._protocol, self._host, self._path, self._params, self._query, fragment)) return text def __str__ (self): return self.as_string (with_fragment=1) def __repr__ (self): return "URL (%s)" % repr (self.as_string (with_fragment=1)) def get_protocol (self): return self._protocol def get_host (self): return self._host def get_path (self): return self._path def get_fragment (self): return self._fragment def get_full_path (self, with_fragment): if with_fragment: fragment = self._fragment else: fragment = "" if sys.platform == 'win32' and self._protocol == 'file': text = plucker_file_urlunparse ("", "", self._path, self._params, self._query, fragment) else: text = urlparse.urlunparse (("", "", self._path, self._params, self._query, fragment)) return text def remove_fragment (self): self._fragment = "" def CleanURL (url, base=None): """Remove leading and trailing white space and generally clean up this URL""" if isinstance (url, URL): # This branch is currently never taken, we get always called # with a string as 'url' if base is not None: # FIXME!! Does this make sense at all? URLs should always be # absoulte, so giving a base is moot... result = Url (url, base).as_string (with_fragment=1) else: result = url.as_string (with_fragment=1) else: url = string.strip (str (url)) url = URL (url, base) result = url.as_string (with_fragment=1) return result def CompareURL (rawurl1, rawurl2): """Compare URLs in a smart ordering method.""" def string_compare (x, y): # Plain alphabetic sort if x "defg9z". This tends to work nicely for ordering # various ebook downloads which often have urls like: # http://myserver.edu/BookTitle/Chapter2.html. # Moreover, index.html type files go first, and files # in a higher level of the hierarchy come before ones # that are lower down in the same hierarchy. # If the above rules are not enough to induce an order, # alphabetical ordering is used. # # The following order is observed: # http://xyz.com/aaaa/bbbb/cccc/hello.htm # http://xyz.com/gggg/index.html # http://xyz.com/gggg/beta.html # http://xyz.com/gggg/gamma99a.html # http://xyz.com/gggg/gamma99b.html # http://xyz.com/gggg/gamma100a.html # http://xyz.com/gggg/aaaa/index.html # http://xyz.com/gggg/aaaa/aaaalowerdown.html # http://xyz.com/gggg/z/sec1A.html # http://xyz.com/gggg/z/sec11.html # # Moreover, URLs using a \ are ordered as if they used a /, # unless the \ / difference is the only difference in which # case an alpha sort is used. # The input urls can have nulls followed by additional info. splitup1 = string.split(rawurl1, '\0') splitup2 = string.split(rawurl2, '\0') splitup1[0] = string.replace(splitup1[0], '\\', '/') splitup2[0] = string.replace(splitup2[0], '\\', '/') if splitup1[0] == splitup2[0]: # The parts before the nulls match, up to \ / interchange. if splitup1 == splitup2: # The whole urls match up to \ / interchange. return string_compare(rawurl1,rawurl2) else: # Compare the strings using /'s instead of \'s. return string_compare(string.join(splitup1),string.join(splitup2)) name1 = splitup1[0] name2 = splitup2[0] i=0 while iname2, since otherwise it would have # returned at the splitup1 == splitup2 condition. return 1 if i==len(name1): return -1 # i points to the first mismatch now # Check first whether perhaps one of these is not higher up in the # hierarchy. slash1 = string.find(name1[i:],'/') slash2 = string.find(name2[i:],'/') if -1 <> slash1 and -1 == slash2: return 1 elif -1 == slash1 and -1 <> slash2: return -1 # Now check whether the two do not differ in that one is an # index.html (or cognate) and the other is not, and then # put the index.html first, unless the other has no last # element. splitup1 = string.split(name1,'/') splitup2 = string.split(name2,'/') if splitup1[:-1] == splitup2[:-1]: splitup1 = string.split(splitup1[-1],':') splitup2 = string.split(splitup2[-1],':') if splitup1[:-1] == splitup2[:-1]: index1 = (splitup1[-1] == "index.html" or splitup1[-1] == "index.htm" or splitup1[-1] == "INDEX.HTM") index2 = (splitup2[-1] == "index.html" or splitup2[-1] == "index.htm" or splitup2[-1] == "INDEX.HTM") if index1 and not index2: return -1 elif not index1 and index2: return 1 # The remaining rule to be applied is to check for whether # the difference is not in numerical strings. if not name1[i] in string.digits and not name2[i] in string.digits: return string_compare(name1,name2) # Back up to the beginning of the number so 11 vs. 1A doesn't # become 1 vs. A. Since name1 and name2 match up to i, we # need only look at name1 when backing up. while i>0 and name1[i-1] in string.digits: i=i-1 if not name1[i] in string.digits or not name2[i] in string.digits: # The mismatch isn't just a numerical one, so do an alphabetic # compare. return string_compare(name1,name2) num1=get_integer_string(name1[i:]) num2=get_integer_string(name2[i:]) if num1==num2: #if the numbers differ in leading zeros, #just do an alphabetical comparison return string_compare(name1,name2) if len(num1)