#!/usr/bin/env python # -*- mode: python; indent-tabs-mode: nil; -*- """ Spider.py $Id: Spider.py,v 1.96 2004/04/10 23:06:39 chrish Exp $ Recursively gets documents and thus collects a document set. Copyright 1999, 2000 by Holger Duerer Distributable under the GNU General Public License Version 2 or newer. """ if __name__ == '__main__': ## The following section tries to get the PyPlucker directory onto the ## system path if called as a script and if it is not yet there: try: import PyPlucker except ImportError: import os, sys file = sys.argv[0] while os.path.islink (file): file = os.readlink (file) sys.path = [os.path.split (os.path.dirname (file))[0]] + sys.path try: import PyPlucker except ImportError: print "Cannot find where module PyPlucker is located!" sys.exit (1) # and forget the temp names... del file, os del PyPlucker ## ## Now PyPlucker things should generally be importable ## import PyPlucker from PyPlucker import Parser, ConfigFiles, __version__ from PyPlucker.Url import URL from PyPlucker.AliasList import AliasList from PyPlucker.UtilFns import message, error, show_exception import os, string, sys, types, re VALID_LINK_ATTRIBUTES = ( # from the A entity "name", "href", "hreflang", "type", "rel", "rev", "charset", "id", "class", "lang", "dir", "title", "style", "shape", "coords", "onfocus", "onblur", "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout", "onkeypress", "onkeydown", "onkeyup", "target", "tabindex", "accesskey", # from the IMG entity "src", "longdesc", "alt", "ismap", "usemap", "align", "border", "width", "height", "hspace", "vspace", # Plucker additions "bpp", "stayonhost", "stay_on_host", "stayondomain", "stay_on_domain", "staybelow", "url_pattern", "current_depth", "noimages", "maxdepth", "max_depth", "new_max_depth", "url", "max_width", "max_height", "alt_maxwidth", "alt_maxheight", "post", "post_data", # for the Plucker glyph support "glyph", "page_x", "page_y", "spacer", "vadjust", # other plucker tags "_plucker_from_image", "_plucker_from_url", "_plucker_id_tag", "_plucker_id_tag_inlineimage", "_plucker_id_tag_outoflineimage", "_plucker_alternate_image", ) LINK_ATTRIBUTES_TO_IGNORE = ( "src", "ismap", "usemap", "align", "border", "current_depth", "name", "href", "hreflang", "type", "rel", "rev", "charset", "id", "class", "lang", "dir", "title", "style", "shape", "coords", "onfocus", "onblur", "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout", "onkeypress", "onkeydown", "onkeyup", "target", "tabindex", "accesskey", ) class SpiderLink: """A class to maintain and encapsulate information about the various attributes related to links while spidering (MAXDEPTH, STAYONDOMAIN, STAYONHOST, STAYBELOW, NOIMAGES, etc.) In some cases we make a distinction, if the link has already been taken or not.""" def __init__ (self, url, dict={}): self._url = url self._dict = {} self._current_depth = None self._max_depth = None self._new_max_depth = None self._stay_on_domain = None self._stay_on_host = None self._stay_below = None self._maxwidth = None self._maxheight = None self._bpp = 1 self._url_pattern = None old = dict.copy() for key in old.keys(): if not key in VALID_LINK_ATTRIBUTES: message(2, "Ignoring invalid link attribute '%s'", key) del old[key] continue if (key[:9] == '_plucker_' or key in LINK_ATTRIBUTES_TO_IGNORE): del old[key] self._update_from_dict (old, after_taken=1) self._from_image = 0 self.set_post (None) def __str__ (self): res = "" return res def _get_domain_from_host(self, url_addy): # By definition the host has already removed the initial // markers # and the subsequent path. All that is left is in the form of # xxx.domain.type. url_len = len(url_addy) first_dot = string.rfind(url_addy, ".") if first_dot < 1: return "" temp_url = url_addy[0:first_dot] # Locate second-from-right period, from front... second_dot = string.rfind(temp_url, ".") if second_dot > 1: # have a second dot, so output this to end... second_dot = second_dot + 1 domain = url_addy[second_dot:url_len] return(domain) else: return(url_addy) def _update_from_dict (self, dict, after_taken): """Update private values from link attributes in dict. If 'after_taken' is true, means that _max_depth can be altered immediately. Otherwise the new value is stored in the helper variable _new_max_depth""" # POST processing if dict.has_key ('post'): self.set_post (dict['post']) # MAXWIDTH processing if dict.has_key ('maxwidth'): self.set_maxwidth (dict['maxwidth']) # MAXHEIGHT processing if dict.has_key ('maxheight'): self.set_maxheight (dict['maxheight']) # BPP processing if dict.has_key ('bpp'): bpp = dict['bpp'] try: bpp = int (bpp) self.set_bpp (bpp) except: pass # MAXDEPTH processing if dict.has_key ('maxdepth'): if after_taken: self.set_max_depth (int (dict['maxdepth'])) self.set_current_depth (1) else: self._new_max_depth = int (dict['maxdepth']) # NOIMAGES processing if dict.has_key ('noimages'): self.set_bpp (0) # STAYONHOST processing # This attribute is only evaluated *after* the link has been taken if after_taken and dict.has_key ('stayonhost'): self.set_stay_on_host (self._url.get_host ()) # STAYONDOMAIN processing # This attribute is only evaluated *after* the link has been taken if after_taken and dict.has_key ('stayondomain'): self.set_stay_on_domain (self._get_domain_from_host(self._url.get_host())) # STAYBELOW processing # This attribute is only evaluated *after* the link has been taken if after_taken and dict.has_key ('staybelow'): if string.lower (dict['staybelow']) == 'staybelow' or dict['staybelow'] == '': the_url = self._url.as_string (with_fragment=0) dict['staybelow'] = the_url #print "Setting STAYBELOW to %s" % the_url self.set_stay_below (dict['staybelow']) # Finally update it compleytely... self._dict.update (dict) def as_dict (self): newdict = self._dict.copy() newdict['url'] = self._url newdict['current_depth'] = self._current_depth newdict['max_depth'] = self._max_depth newdict['new_max_depth'] = self._new_max_depth newdict['stay_on_domain'] = self._stay_on_domain newdict['stay_on_host'] = self._stay_on_host newdict['stay_below'] = self._stay_below newdict['maxheight'] = self._maxheight newdict['maxwidth'] = self._maxwidth newdict['bpp'] = self._bpp newdict['_plucker_from_image'] = self._from_image newdict['post_data'] = self._post_data return newdict def make_child_attributes (self, url, dict, inline): """Generate a new SpiderLink object that represents the link from a document lead to by self. 'dict' contains the attributes specified for this link""" new = SpiderLink (url, self._dict) if self._current_depth is not None: if inline: new.set_current_depth (self._current_depth) else: new.set_current_depth (self._current_depth + 1) if self._max_depth is not None: new.set_max_depth (self._max_depth) if self._stay_on_domain is not None: new.set_stay_on_domain (self._stay_on_domain) if self._stay_on_host is not None: new.set_stay_on_host (self._stay_on_host) if self._stay_below is not None: new.set_stay_below (self._stay_below) if self._maxwidth is not None: new.set_maxwidth (self._maxwidth) if self._maxheight is not None: new.set_maxheight (self._maxheight) new.set_bpp (self._bpp) new._update_from_dict (dict, after_taken=0) new.set_post (None) return new def link_taken (self, dict_of_attributes): """Note that this link has been taken. The _max_depth is now changed to the new value (if there was one).""" if self._new_max_depth is not None: self._max_depth = self._new_max_depth self._current_depth = 1 self._update_from_dict (dict_of_attributes, after_taken=1) def get_post (self): """Return the data for the post operation or None if none was specified""" if self._post_data is not None: return str (self._post_data) else: return None def set_post (self, post_data): """Set the data for a post operation""" self._post_data = post_data def get_maxwidth (self): return self._maxwidth def set_maxwidth (self, value): self._maxwidth = value def get_maxheight (self): return self._maxheight def set_maxheight (self, value): self._maxheight = value def get_bpp (self): return self._bpp def set_bpp (self, value): self._bpp = value def set_current_depth (self, n): self._current_depth = n def set_max_depth (self, n): self._max_depth = n def set_stay_on_domain (self, host): self._stay_on_domain = self._get_domain_from_host(host) def set_stay_on_host (self, host): self._stay_on_host = host def set_stay_below (self, urlpart): self._stay_below = urlpart def check_fetch (self, as_image): """Check whether this link should be taken. If 'as_image' is true, this link points to an inline image""" # sys.stderr.write("check_fetch of %s...\n" % self._url.as_string(with_fragment=1)) # mailto: gets always included if (self._url.get_protocol() == 'mailto'): return 1 if self._max_depth is not None and self._current_depth is not None: if self._current_depth > self._max_depth: # Depth exceeded return 0 if as_image and self._bpp == 0: # No images wanted return 0 if self._stay_on_domain is not None: if self._stay_on_domain != self._get_domain_from_host(self._url.get_host()): # Got to another domain return 0 if self._stay_on_host is not None: if self._stay_on_host != self._url.get_host(): # Got to another host return 0 if self._stay_below is not None: target_url = self._url.as_string (with_fragment=0) target_part = target_url[:len (self._stay_below)] if self._stay_below != target_part: return 0 m = self._dict.get('url_pattern') if m and not m.match(self._url.as_string(with_fragment=0)): return 0 # default: fetch return 1 def get_from_image(self): return self._from_image def set_from_image(self, n): self._from_image = n self._dict['_plucker_from_image'] = n class Spider: """A class to collect web pages by spidering from the home document.""" def __init__ (self, retriever, parser, \ collection, \ exclusion_list, \ config, alias_list): """Call with a retriever and a parser function. 'retriever' gets called with a Url.URL and should return a (header-dict, data) tupe. In header-dict, the values 'error code' and 'error text' *must* be defined. If the data is valid, 'error code' must be zero. 'parser' gets called with the url and these header-dict and the body and should return a PluckerDocument or None (if failed). Implementation: We have one queue and two dictionaries in which we just store, whether some URL has been tried before (and collected or failed).""" self._retriever = retriever self._parser = parser self._config = config # _queue contains a list of (URL, attributes, key) pairs to fetch self._queue = [] # _collected contains a mapping of retrieved URLs to PluckerDocs if collection is None: self._collected = {} else: self._collected = collection # _failed contains a mapping of failed retrieval URLs to failure headers self._failed = {} self._exclusion_list = exclusion_list if alias_list is None: self._alias_list = AliasList () else: self._alias_list = alias_list self._fatal_error = 0 # whether we're going depth-first or breadth-first self._depth_first = config.get_bool('depth_first', 0) # Now initialize the first things we *do* want in the # collection of documents self._home_url = URL(config.get_string ('home_url', 'plucker:/home.html')) bpp = config.get_int ('bpp', 1) # ROB (Oct 26,2003) Commenting out this line. doing_big_imgs is never used. We check again in ImageParser.py # _related_images anyway, together with the rest of image parsing. #self._doing_big_imgs = config.get_string('alt_maxwidth', None) or config.get_string('alt_maxheight', None) attributes = {'maxdepth': "%d" % config.get_int ('home_maxdepth', 2), 'bpp': "%d" % bpp} if config.get_bool ('home_stayonhost', 0): attributes['stayonhost'] = 1 if config.get_bool ('home_stayondomain', 0): attributes['stayondomain'] = 1 tmp = config.get_string ('home_staybelow') if tmp is not None: attributes['staybelow'] = tmp tmp = config.get_string ('home_url_pattern') if tmp is not None: m = re.compile(tmp) if m: sys.stderr.write("Regexp pattern is '%s'\n" % m.pattern) attributes['url_pattern'] = m self.add_queue (self._home_url, SpiderLink (self._home_url, attributes), force=1) def _create_id_string (self, attributes): values = (type(attributes) == types.DictType and attributes.items()) or attributes.as_dict().items() valueslist = [] for value in values: if (value[0][:9] == '_plucker_' or value[0] in LINK_ATTRIBUTES_TO_IGNORE): continue valueslist.append((string.lower(value[0]), value[1],)) valueslist.sort() return str(valueslist) def _needs_processing (self, key, url, attr): if self._collected.has_key (key): # we need to register the already-collected document with the internal plucker keys that # are being held by the link under consideration, so that those keys are resolved self._register_document(attr, self._collected[key]) return 0 if self._failed.has_key (url): return 0 # We'll err on the side of adding the URL to the queue, since we already # check later to see whether or not we've already fetched it. # # for (other_url, attr, key) in self._queue: # if other_url == url: # return 0 # return 1 def add_queue (self, origurl, attr, force=0): """Maybe add url to the queue""" url = self._alias_list.get (origurl) # mailto: gets always included key = str(url) + '\0' + self._create_id_string(attr) if (url[:7] == 'mailto:'): self._queue.append ((url, attr, key)) return 1 if not force and self._exclusion_list: if not self._exclusion_list.check (url): message(2, "Excluding '%s'\n" % url) return 0 if force or self._needs_processing (key, url, attr): self._queue.append ((url, attr, key)) return 1 return 0 def done (self): """Check whether something rests to be done in the queue""" return len (self._queue) == 0 def process_all (self, verbose=0): """Process until all done""" # figure out how many we expect to read estimate = 0 statusfile = None statusfilename = self._config.get_string('status_file') if statusfilename: if os.path.exists(statusfilename): statusfile = open(statusfilename, 'r+') else: statusfile = open(statusfilename, 'w+') if statusfile: line = statusfile.readline() if line: tokens = string.split(line) if tokens: estimate = string.atoi(tokens[0]) # OK, process everything self._filenum = 1 while not self.done (): self.process (verbose, estimate, statusfile) message("---- all %d pages retrieved and parsed ----", len(self._collected)) if statusfile: statusfile.seek(0) statusfile.write("%d %d %d\n" % (len(self._collected), len(self._queue), estimate)) statusfile.flush() statusfile.close() def _register_document (self, attribs, doc): # See if the attribute dictionary has any keys that need to be bound # to this document, for links of various kinds. If so, call the # register_doc method on the doc with the specified index. attribute_dict = (type(attribs) == types.DictType and attribs) or attribs.as_dict() #sys.stderr.write(str(attribute_dict) + '\n') if attribute_dict.get('_plucker_from_image'): doc.register_doc(attribute_dict.get('_plucker_id_tag_inlineimage')) elif attribute_dict.get('_plucker_alternate_image'): #sys.stderr.write(str(attribute_dict) + '\n') doc.register_doc(attribute_dict.get('_plucker_id_tag_outoflineimage')) if attribute_dict.get('_plucker_id_tag'): #sys.stderr.write(str(attribute_dict) + '\n') doc.register_doc(attribute_dict.get('_plucker_id_tag')) def process (self, verbose, estimate, statusfile): """Process the next thing in the queue""" # We use a number of different forms of the 'URL to be fetched' # in this routine, as follows: # urltext -- the string form of the URL popped from the queue # url -- the URL object formed from "urltext" # #if verbose>1: # # To help debugging, we can write out the current queue # f = open ("spider.status", "w") # f.write ("Queue:\n") # for (key, attr) in self._queue: # if self._collected.has_key (key): # stat = "done" # elif self._failed.has_key (key): # stat = "failed" # else: # stat = "" # f.write (" %s %s %s\n" % (repr(key), stat, str(attr))) # f.write ("\nNot Collected:\n") # for key in self._failed.keys (): # attr = self._failed[key] # f.write (" %s %s\n" % (repr(key), str(attr))) # f.close () import tempfile if self._queue: if estimate: message ("---- %d expected, %d collected, %d to do ----\n", estimate, len (self._collected), len (self._queue)) else: message ("---- %d collected, %d to do ----\n", len (self._collected), len (self._queue)) if statusfile: statusfile.seek(0) statusfile.write("%d %d %d\n" % (len(self._collected), len(self._queue), estimate)) statusfile.flush() # de-queue end of queue if self._depth_first: (urltext, attributes, key) = self._queue[-1] del self._queue[-1] else: (urltext, attributes, key) = self._queue[0] del self._queue[0] attribute_dict_string = self._create_id_string(attributes) url = URL (urltext) if verbose: line_length = self._config.get_int('status_line_length', 60) urltext = str(url) if len (urltext) > line_length: urltext = urltext[:line_length - 20] + "....." + urltext[-15:] message("Processing %s...", urltext) # see if it's already been collected post_data = attributes.get_post () # urltext is the URL without any fragment urltext = url.as_string (with_fragment=0) # if there are parameters, we add them back on if post_data is not None: urltext_key = urltext + post_data else: urltext_key = urltext # OK, now that we know the URL used as the mapping key, check it key = urltext_key + '\0' + attribute_dict_string message(3, "checking " + str(key)) #sys.stderr.write('key is ' + key + '\n') if self._collected.has_key (key): # already collected message(" Already retrieved and parsed.") self._register_document(attributes, self._collected[key]) return # not collected, how about failed? if self._failed.has_key (urltext_key): # already tried, but failed message(" Already tried, but failed.") return # OK, not collected or failed -- so retrieve the contents (header, document) = self._retriever (url, alias_list=self._alias_list, \ post_data=post_data) assert header.has_key ('error code'), "Headers from retriever have no error code" assert header.has_key ('URL'), "Headers from retriever have no URL" # Check for successful fetch if header['error code'] != 0: # retrieving has failed. # self._failed[urltext_key] = header self._failed[urltext_key] = None if verbose: if header.has_key ('error code'): code = header['error code'] else: code = "No error code" if header.has_key ('error text'): text = header['error text'] else: text = "No error text" message(" Retrieval failed: %s -- %s." % (code, text)) failed_url = urltext if failed_url == self._alias_list.get (self._home_url): error("Fetching the home document failed. Aborting all!") self._queue = [] self._collected = {} self._fatal_error = 1 else: assert header.has_key ('content-type'), \ "Headers from retriever have no content-type (%s)" % repr (header) # Fetched OK message(" Retrieved ok.") # "new_url" is the URL the HTTP server sent back to us. new_url = URL (header['URL']).as_string (with_fragment=0) # We compare it with the URL we used to do the HTTP operation. #################################################################### # header['URL'] are file:C:\path\file.ext on python 1.52 and # # C:\path\file.ext om Python 2.0. Also the case of the Drive # # letter may change. So a move detected here. This should fix that.# #################################################################### if sys.platform == 'win32' and (string.lower(urltext[0:5]) == 'file:'): if new_url[0:5] != 'file:': new_url = 'file:' + new_url new_url = URL (new_url).as_string (with_fragment=0) # again, we form the mapping key to see if it's already been processed if post_data is not None: new_url_key = new_url + post_data else: new_url_key = new_url # now check to see if the new URL is different from the original, # and add an alias if that's the case if urltext != new_url: message(" Moved from '%s' to '%s'." % (urltext, new_url)) if self._alias_list.get (urltext) == urltext: # The move was not recognized by the Retriever! # This can be caused by specifying a file: URL # without the 'file:' part self._alias_list.add (urltext, new_url) # Check to see if we already have it, or if we're not fetching # this URL... if urltext != new_url: key = new_url_key + '\0' + attribute_dict_string if self._collected.has_key (key): message(" Already retrieved and parsed.") self._register_document(attributes, self._collected[key]) return if not self._exclusion_list.check (new_url): message(" Is excluded.") return # Check for a filter to run document through if header['content-type'][:5] == "text/": filter = self._config.get_string ('filter') if filter is not None: try: tmpfile = tempfile.mktemp() f = open(tmpfile, "wb") f.write(document) f.close() command = filter + " " + tmpfile pipe = os.popen(command) document = pipe.read() pipe.close() finally: try: os.unlink(tmpfile) except: pass # OK, it's fair game, so we parse it try: pluckerdoc = self._parser (new_url, header, document, self._config, attributes.as_dict()) except: show_exception(2) pluckerdoc = None # Successful parse? if pluckerdoc is None: # No. Signal error. headers = {'error code': -1, 'error text': "parsing failed"} # self._failed[new_url_key] = headers self._failed[new_url_key] = None message(" Parsing failed.") return # OK, at this point we have a valid pluckerdoc self._collected[key]=pluckerdoc #sys.stderr.write('logging ' + key + '\n') self._register_document(attributes, pluckerdoc) tables = pluckerdoc.get_tables() for i in range(0, len(tables)): attrs = tables[i].get_attrs() self._register_document(attrs, tables[i]) self._collected[attrs['href']] = tables[i] if pluckerdoc.is_multiimage_document(): pieces = pluckerdoc.get_pieces() for (piece_doc, piece_id) in pieces: piece_doc.register_doc(piece_id) tkey = piece_doc.get_url() + '\0' self._collected[tkey] = piece_doc # Now check for some extra processing, depending on the # type of page the URL pointed to # For text documents, we want to harvest any links in the text if pluckerdoc.is_text_document (): (hrefs, imagerefs) = pluckerdoc.get_external_references () doc_ref_count = 0 for (suburltext, dict) in hrefs: suburl = URL (suburltext) suburl.remove_fragment () if suburl.as_string(with_fragment=0)[:17] != "plucker:/~parts~/": # Subparts are not needed for fetching message(3, " Looking at suburl %s...", str(suburltext)) new_attr = attributes.make_child_attributes (suburl, dict, inline=0) if new_attr.check_fetch (as_image = 0): new_attr.link_taken (dict) if self.add_queue (suburl, new_attr): doc_ref_count = doc_ref_count + 1 img_ref_count = 0 for (suburltext, dict) in imagerefs: suburl = URL (suburltext) suburl.remove_fragment () new_attr = attributes.make_child_attributes (suburl, dict, inline=1) new_attr.set_from_image(1) if new_attr.check_fetch (as_image = 1): new_attr.link_taken (dict) if self.add_queue (suburl, new_attr): img_ref_count = img_ref_count + 1 else: message(2, " Not fetching image %s (already fetched)", new_attr) else: message(2, " Not fetching image %s", str (suburl)) message (" Parsed ok%s%s." % (((doc_ref_count > 0 or img_ref_count > 0) and "; ") or "", ("%s%s%s" % (((doc_ref_count > 0) and ("added %d document link%s" % (doc_ref_count, (doc_ref_count != 1 and "s") or ""))) or "", ((doc_ref_count > 0) and (img_ref_count > 0) and " and ") or "", ((img_ref_count > 0) and ("%d image%s" % (img_ref_count, (img_ref_count != 1 and "s") or ""))) or "")))) # pluckerdoc.clear_external_references() # For image documents, we want to create alternate renditions # of the image, if called for elif pluckerdoc.is_image_document() or pluckerdoc.is_multiimage_document(): alternate_count = 0 others = pluckerdoc.get_related_images() for (other_url, other_attributes) in others: message(2, " Rendering other versions of image %s...\n" % other_url) testkey = other_url + '\0' + self._create_id_string(other_attributes) if self._collected.has_key(testkey): message(3, " Reusing already-rendered image version %s.\n" % self._collected[testkey].get_url()) continue message(3, " Key is " + str(key) + ".\n") try: newdoc = self._parser(other_url, header, document, self._config, other_attributes) self._collected[testkey] = newdoc self._register_document(other_attributes, newdoc) alternate_count = alternate_count + 1 if newdoc.is_multiimage_document(): pieces = newdoc.get_pieces() for (piece_doc, piece_id) in pieces: piece_doc.register_doc(piece_id) tkey = piece_doc.get_url() + '\0' self._collected[tkey] = piece_doc except: show_exception(2) message(2, " Parsing of alternate rendition %s failed", testkey) message (" Parsed ok%s." % (((alternate_count > 0) and ("; added %d alternate rendition%s" % (alternate_count, (alternate_count != 1 and "s") or ""))) or "")) else: # all other document types message (" Parsed ok.") def get_collected (self): return self._collected def encountered_fatal_error (self): return self._fatal_error def execute_commands (item_name, config): verbosity = config.get_int ('verbosity', 1) for affix in [''] + map (lambda n: str (n), range (1,10)): command = config.get_string (item_name + affix, "") if command: message("Executing '%s': %s" % (item_name+affix, command)) try: if os.system (command): raise RuntimeError("execution of command '%s' failed" % command) except: error("Error during execution of '%s': %s" % (item_name+affix, command)) def main (config, excl_lists=[]): import os, sys from PyPlucker.Parser import generic_parser from PyPlucker.Retriever import SimpleRetriever from PyPlucker.Writer import CacheWriter, PDBWriter from PyPlucker.ExclusionList import ExclusionList import PyPlucker.PluckerDocs PyPlucker.PluckerDocs.PluckerTextDocument.seamless_fragments = config.get_bool ('seamless_fragments',1) PyPlucker.PluckerDocs.PluckerTextDocument.link_fragments = config.get_bool ('link_fragments',0) pluckerdir = config.get_string ('pluckerdir') assert pluckerdir is not None pluckerhome = config.get_string ('PLUCKERHOME') assert pluckerhome is not None if not os.path.exists (pluckerhome) or not os.path.isdir (pluckerhome): error ("Pluckerhome (%s) does not exist or isn't a directory\n" % pluckerhome) sys.exit (1) if not os.path.exists (pluckerdir) or not os.path.isdir (pluckerdir): error ("Pluckerdir (%s) does not exist or isn't a directory\n" % pluckerdir) sys.exit (1) verbosity = config.get_int ('verbosity', 1) message("Pluckerdir is '%s'..." % pluckerdir) if verbosity: if os.environ.has_key ('HTTP_PROXY') and not (os.environ.has_key ('HTTP_PROXY_USER') and os.environ.has_key ('HTTP_PROXY_PASS')): message("Using proxy '%s'..." % os.environ['HTTP_PROXY']) if os.environ.has_key ('HTTP_PROXY') and (os.environ.has_key ('HTTP_PROXY_USER') and os.environ.has_key ('HTTP_PROXY_PASS')): message("Using proxy '%s' with authentication for user '%s'..." % (os.environ['HTTP_PROXY'],os.environ['HTTP_PROXY_USER'])) # try to set a socket timeout -- thanks to lemburg for this patch socket_timeout = config.get_int('retrieval_timeout', 0) if socket_timeout: try: try: import timeoutsocket except: from PyPlucker.helper import timeoutsocket message(2, "Setting retrieval timeout to %d seconds", socket_timeout) timeoutsocket.setDefaultSocketTimeout(socket_timeout) except: message(2, "Couldn't set retrieval timeout to desired %d seconds -- will wait forever if necessary", socket_timeout) import PyPlucker.PluckerDocs PyPlucker.PluckerDocs._DOC_HEADER_SIZE = 8 PyPlucker.PluckerDocs._PARA_HEADER_SIZE = 4 alias_list = AliasList () if config.get_bool ('zlib_compression', 0): try: import zlib except ImportError: message(0, "Your Python installation does not support ZLib compression.") message(0, "We fall back to DOC compression.") config.set ('zlib_compression', 'false') if config.get_bool ('zlib_compression', 0): owner_id = config.get_string('owner_id_build') if owner_id: PyPlucker.PluckerDocs.UseZLibCompression (owner_id) else: PyPlucker.PluckerDocs.UseZLibCompression () message(2, "ZLib compression turned on") # # Load the exclusion lists.. # exclusion_list = ExclusionList (include_by_default=1) filename = os.path.join (pluckerhome, 'exclusionlist.txt') if os.path.exists (filename): message(2, "Using exclusion list %s", filename) exclusion_list.load_file (filename) filename = os.path.join (pluckerdir, 'exclusionlist.txt') if os.path.exists (filename): message(2, "Using exclusion list %s", filename) exclusion_list.load_file (filename) config_excl_list = config.get_string ('exclusion_lists') if config_excl_list is not None: config_excl_list = string.split (config_excl_list, os.pathsep) for filename in config_excl_list: if not os.path.isabs (filename): filename = os.path.join (pluckerdir, filename) if os.path.exists (filename) and \ (os.path.isfile (filename) or os.path.islink (filename)): message(2, "Adding extra exclusion list %s", filename) exclusion_list.load_file (filename) for filename in excl_lists: if os.path.exists (filename) and \ (os.path.isfile (filename) or os.path.islink (filename)): message(2, "Adding extra exclusion list %s", filename) exclusion_list.load_file (filename) # # finished loading exclusion lists # home_url = config.get_string ('home_url', 'plucker:/home.html') if not URL(home_url).get_protocol(): home_url = 'file:' + home_url config.set('home_url', home_url) if home_url != 'plucker:/home.html': alias_list.add ('plucker:/home.html', home_url) retriever = SimpleRetriever (pluckerdir, pluckerhome, config) max_depth = config.get_int ('home_maxdepth', 2) assert config.get_bool ('use_cache') is not None if config.get_bool ('use_cache'): cachedir = os.path.join (pluckerdir, config.get_string ('cache_dir_name', 'cache')) if not (os.path.exists(cachedir) and os.path.isdir(cachedir)): error("cache directory does not exist: " + cachedir + "\n") return 1 else: if not (os.path.exists(pluckerdir) and os.path.isdir(pluckerdir)): error("Plucker directory does not exist: " + cachedir + "\n") return 1 doc_file = config.get_string ('doc_file') if not doc_file: doc_file = config.get_string ('db_file') if doc_file: deprecated( "db_file", "doc_file" ) if (doc_file == ''): filename = doc_file else: extension = config.get_string("filename_extension", "pdb") filename = os.path.join (pluckerdir, doc_file + "." + extension) doc_name = config.get_string ('doc_name') if not doc_name: doc_name = config.get_string ('db_name') if doc_name: deprecated( "db_name", "doc_name" ) elif (doc_file != ''): # use basename in case only file name is given doc_name = os.path.basename (doc_file) else: # generate name based on home URL if len(home_url) > 31: doc_name = "..." + home_url[-28:] else: doc_name = home_url spider = Spider (retriever.retrieve, generic_parser, \ collection=None, \ exclusion_list=exclusion_list, \ config=config, alias_list=alias_list) spider.process_all(verbose=verbosity) if spider.encountered_fatal_error (): error("Fatal error while processing. Nothing written.") return 1 if verbosity > 2: PyPlucker.PluckerDocs.display_registrations() message("\nWriting out collected data...") collection = spider.get_collected () # at this point, we don't need anything except the collection, # so we can release all the memory used by spider del spider if config.get_bool ('use_cache'): writer = CacheWriter (collection, config, cachedir) message("Writing to cache dir %s" % cachedir) else: writer = PDBWriter (collection, config, name=doc_name, version=1, filename=filename) message("Writing document '%s' to file %s" % (doc_name, filename)) mapping = writer.write (verbose=verbosity, alias_list=alias_list) if verbosity > 2: mapping.print_mapping() if verbosity > 1: items = Parser.unknown_things.keys () if items: message(2, "Unknown items encountered:") items.sort () for item in items: message(2, " %s: %s" % (item, Parser.unknown_things[item])) message("Done!") return 0 def deprecated ( oldname, newname ): sys.stderr.write("NOTE: %s is a deprecated option. Please use the %s option instead.\n" % ( oldname, newname )) def realmain (outputstream, argv=None): import getopt, os, string, re, tempfile if not argv: argv = sys.argv if os.environ.has_key ('PLUCKERHOME'): pluckerhome = os.environ['PLUCKERHOME'] else: pluckerhome = os.path.expanduser ("~/.plucker") if os.environ.has_key ('PLUCKERDIR'): pluckerdir = os.environ['PLUCKERDIR'] else: pluckerdir = None def display_version (): message(0, "Plucker version %s\n" % __version__ ) def usage (reason=None, pluckerhome=pluckerhome): if reason: error(str(reason)) sys.stderr.write("Usage: %s [OPTIONS] [HOMEURL]\n" % sys.argv[0]) sys.stderr.write("(Type '%s --help' for more information.)\n" % sys.argv[0]) sys.exit (1) def display_help (pluckerhome=pluckerhome): message(0, "Usage: %s [OPTIONS] [HOMEURL]" % sys.argv[0]) message(0, " where HOMEURL is a 'file:' or 'http:' URL (which can alternatively") message(0, " be specified with --home-url=) and OPTIONS are:") message(0, " -c, --update-cache:") message(0, " Write a traditional cache directory in the ") message(0, " -f , --doc-file=") message(0, " Specify the name of the output file (see also --pluckerdir).") message(0, " If not specified, the Plucker doc will be written") message(0, " to stdout, if stdout is not a tty.") message(0, " -h, --help: Print this help") message(0, " -N , --doc-name ") message(0, " Specify the name of the document (NOT the filename).") message(0, " Defaults to -f's argument.") message(0, " -q, --quiet: Be quiet, i.e. set verbosity level 0") message(0, " -v: Set verbosity level 1 (which is the default)") message(0, " -V , --verbosity=:") message(0, " Set verbosity level ") message(0, " Verbosity level 0 is silent except for errors") message(0, " Verbosity level 1 gives progress status (default)") message(0, " Verbosity level 2 is used for debugging") message(0, " -P, --pluckerhome=:") message(0, " Use as plucker home instead of the default ") message(0, " %s (~/.plucker/ or $PLUCKERHOME)", pluckerhome) message(0, " -p, --pluckerdir=:") message(0, " Use as plucker dir instead of the default ") message(0, " Defaults to same as plucker home.") message(0, " --bpp=: Bits per pixel for images; defaults to 1") message(0, " = 0, 1, 2, 4, 8 or 16 (16 not on Windows)") message(0, " --noimages: Do not include images (same as --bpp=0)") message(0, " -H , --home-url=: Use as the root document.") message(0, " Defaults to plucker:/home.html (i.e. home.html in") message(0, " the plucker dir)") message(0, " -M , --maxdepth=:") message(0, " Use MAXDEPTH= on the home document. Defaults to 2") message(0, " -E , --exclusion-list : ") message(0, " Add to list of files searched for exclusion lists") message(0, " -s , --extra-section=:") message(0, " Add to the list of searched section in the config files") message(0, " --zlib-compression, --doc-compression:") message(0, " Specify which compression method to use. (For expert's use)") message(0, " --compression=:") message(0, " Use as the compression format") message(0, " for the database. Allowable options are 'doc', for") message(0, " Palm DOC compression, or 'zlib', for zlib compression.") message(0, " Zlib compression is typically better than DOC compression.") message(0, " --no-urlinfo: Do not include info about the URLs") message(0, " --category=[;;..;]:") message(0, " Put in the database as the default") message(0, " viewer category for the database.") message(0, " It is possible to assign several categories separated by ';'") message(0, " --depth-first:") message(0, " Do a depth-first pass through the web graph, rather than") message(0, " the default breadth-first traversal.") message(0, " --stayonhost: Do not follow external URLs") message(0, " --stayondomain:") message(0, " Do not follow URLs off of this domain") message(0, " --staybelow=:") message(0, " Automatically exclude any URL that doesn't begin with .") message(0, " --maxheight=:") message(0, " Set maximum height of images to pixels.") message(0, " --maxwidth=:") message(0, " Set maximum width of images to pixels.") message(0, " --alt-maxheight=:") message(0, " Set alternative maximum height of images to pixels. This value") message(0, " is used for 'big' versions of inline images that had to be scaled") message(0, " down in size to obey the MAXWIDTH and MAXHEIGHT parameters.") message(0, " Special values that can be used are 0 and -1, which do the following:") message(0, " Set --alt-maxheight=0 and --alt-maxheight=0 to link to original sized") message(0, " 'big' versions.") message(0, " Set --alt-maxheight=-1 and --alt-maxheight=-1 to not link to any") message(0, " 'big' versions at all.") message(0, " --alt-maxwidth=:") message(0, " Set alternative maximum width of images to pixels. This value") message(0, " is used for 'big' versions of inline images that had to be scaled") message(0, " down in size to obey the MAXWIDTH and MAXHEIGHT parameters.") message(0, " --launchable, --not-launchable:") message(0, " Set (or unset) the launchable bit in the output file.") message(0, " --backup, --no-backup:") message(0, " Set or clear the backup bit in the output file.") message(0, " --beamable, --not-beamable:") message(0, " Set or clear the beamable bit in the output file.") message(0, " --charset=:") message(0, " Set the default charset to that specified by .") message(0, " --owner-id=:") message(0, " Set owner-id of the output document to .") message(0, " --url-pattern=:") message(0, " Only fetch URLs if they match the regular expression specified") message(0, " by .") message(0, " --title=:") message(0, " Set the title of the document to .") message(0, " --author=:") message(0, " Set the author of the document to .") message(0, " --status-file=:") message(0, " Use as the status file.") message(0, " --referrer=:") message(0, " Send the specified when fetching remote pages.") message(0, " --user-agent=:") message(0, " Send the specified as the identification of") message(0, " of the 'browser' when fetching remote pages. Defaults to") message(0, " Plucker/Py-%s." % __version__) message(0, " --http-proxy=:") message(0, " HTTP proxy used on machine. Example is http://proxy.aol.com:8080") message(0, " --http-proxy-user=:") message(0, " HTTP proxy username") message(0, " --http-proxy-pass=:") message(0, " HTTP proxy password") message(0, " --version:") message(0, " Print out the version of Plucker that is being run and then exit.") message(0, " --tables:") message(0, " Generate real tables instead of vertical lists of cells.") message(0, " --fragments=") message(0, " Set the way the 30K fragments of a page are joined to") message(0, " :") message(0, " seamless: seamless connection for 1.5+ viewer") message(0, " link: insert links to previous/next fragment") message(0, " both: insert links and support seamless connection") message(0, " Defaults to seamless.") message(0, " --creator-id=") message(0, " Sets 4 character Palm creator ID of output database. Example is Plkr") message(0, " Plucker viewer loads documents with creator ID of Plkr into") message(0, " the document library.") message(0, " Defaults to Plkr") message(0, " --filter=:") message(0, " Pass fetched documents through filter prior to parsing.") message(0, " --bookmarks=:") message(0, " Set bookmarks added to database to .") message(0, " pages: add a bookmark for each text/* page collected.") message(0, " links: add a bookmark for each bookmark type link tag.") message(0, " both: add both kinds of bookmarks.") message(0, " Defaults to None.") message(0, "") message(0, "Note that you must specify either -f or specify HOMEURL as an argument,") message(0, " or specify -c to update a cache.") message(0, "") try: home_url = None verbosity = None bpp = None max_depth=None seamless_fragments=None link_fragments=None use_cache = None use_file = None doc_name = None exclusion_lists = [] extra_sections = [] zlib_compression = None no_url_info = None stayondomain = None stayonhost = None staybelow = None category = None maxwidth = None maxheight = None alt_maxwidth = None alt_maxheight = None launchable = None backup = None copy_protect = None iconfile = None default_charset = None owner_id = None url_pattern = None referrer = None user_agent = None title = None author = None statusfile = None depthfirst = None tables = None filter = None bookmarks = None bookmark_pages = None http_proxy = None http_proxy_user = None http_proxy_pass = None creator_id = None (opts, args) = getopt.getopt(argv[1:], "f:chqvV:p:P:H:E:M:N:s:", \ [ "db-file=", "doc-file=", "help", "quiet", "pluckerdir=", "pluckerhome=", "bpp=", "noimages", "exclusion-list=", "maxdepth=", "db-name=", "doc-name=", "extra-section=", "verbosity=", "zlib-compression", "doc-compression", "no-urlinfo", "stayondomain", "stayonhost", "staybelow=", "category=", "maxheight=", "maxwidth=", "alt-maxheight=", "alt-maxwidth=", "compression=", "home-url=", "update-cache", "launchable", "not-launchable", "backup", "no-backup", "beamable", "not-beamable", "icon=", "charset=", "owner-id=", "url-pattern=", "referrer=", "user-agent=", "title=", "author=", "status-file=", "version", "tables", "depth-first", "http-proxy=", "http-proxy-user=", "http-proxy-pass=", "fragments=", "creator-id=", "bookmarks="]) if args: # usage ("Only options are allowed as arguments.") if len(args) > 1: if args[1][0] == '-': usage("All options (such as '" + string.join(args[1:]) + "') must be specified before the argument (" + str(args) + ").") else: usage("Only one 'root' document should be specified as an argument.") root = args[0] if (len(root) > 5) and ((string.lower(root[:5]) == 'http:') or (string.lower(root[:5]) == 'file:')): home_url = root elif os.path.exists(root): home_url = 'file:'+root else: usage("Can't locate " + root) for (opt, arg) in opts: if opt == "--version": display_version() sys.exit(0) if opt == "-h" or opt == "--help": display_help() sys.exit(0) elif opt == "--fragments" and arg == "both": seamless_fragments = 'true' link_fragments = 'true' elif opt == "--fragments" and arg == "link": seamless_fragments = 'false' link_fragments = 'true' elif opt == "--fragments" and arg == "seamless": seamless_fragments = 'true' link_fragments = 'false' elif opt == "-c" or opt == "--update-cache": use_cache = 1 elif opt == "-f" or opt == "--doc-file": use_file = arg elif opt == "--db-file": deprecated( "db-file", "doc-file" ) use_file = arg elif opt == "--bpp": bpp = int (arg) if bpp != 0 and bpp != 1 and bpp != 2 and bpp != 4 and bpp != 8 and bpp != 16: usage ("Only 0, 1, 2, 4, 8 or 16 allowed for -bpp") elif opt == "--noimages": bpp = 0 elif opt == "-H" or opt == "--home-url": if home_url and (home_url <> arg): usage("Two different root URLs specified: " + home_url + " and " + arg) home_url = arg elif opt == "-E" or opt == "--exclusion-list": exclusion_lists.append (arg) elif opt == "-q" or opt == "--quiet": verbosity = 0 elif opt == "-v": verbosity = 1 elif opt == "-V" or opt == "--verbosity": verbosity = int (arg) elif opt == "-M" or opt == "--maxdepth": max_depth = int (arg) elif opt == "-N" or opt == "--doc-name": doc_name = arg if len(doc_name) > 26: message(0, "Only the first 26 characters \"%s\" of the document name will be used.", arg[:26]) doc_name = arg[:26] elif opt == "--db-name": deprecated( "db-name", "doc-name" ) doc_name = arg elif opt == "--pluckerdir" or opt == "-p": pluckerdir = arg elif opt == "--pluckerhome" or opt == "-P": pluckerhome = arg elif opt == "-s" or opt == "--extra-section": extra_sections.append (arg) elif opt == "--zlib-compression": zlib_compression = 'true' elif opt == "--doc-compression": zlib_compression = 'false' elif opt == "--compression" and arg == "doc": zlib_compression = 'false' elif opt == "--compression" and arg == "zlib": zlib_compression = 'true' elif opt == "--no-urlinfo": no_url_info = 'true' elif opt == "--stayondomain": stayondomain = 'true' elif opt == "--stayonhost": stayonhost = 'true' elif opt == "--staybelow": staybelow = arg elif opt == "--url-pattern": url_pattern = arg elif opt == "--category": category = arg elif opt == "--maxheight": maxheight = arg elif opt == "--maxwidth": maxwidth = arg elif opt == "--alt-maxheight": alt_maxheight = arg elif opt == "--alt-maxwidth": alt_maxwidth = arg elif opt == "--launchable": launchable = 1 elif opt == "--not-launchable": launchable = 0 elif opt == "--backup": backup = 1 elif opt == "--no-backup": backup = 0 elif opt == "--beamable": copy_protect = 0 elif opt == "--not-beamable": copy_protect = 1 elif opt == "--icon": iconfile = arg elif opt == "--charset": default_charset = arg elif opt == "--owner-id": owner_id = arg elif opt == "--referrer": referrer = arg elif opt == "--user-agent": user_agent = arg elif opt == "--title": title = arg elif opt == "--author": author = arg elif opt == "--status-file": statusfile = arg elif opt == "--depth-first": depthfirst = 1 elif opt == "--tables": tables = 1 elif opt == "--http-proxy": http_proxy = arg elif opt == "--http-proxy-user": http_proxy_user = arg elif opt == "--http-proxy-pass": http_proxy_pass = arg elif opt == "--creator-id": creator_id = arg if len(creator_id) > 4: message(0, "Only the first 4 characters \"%s\" of the creator ID will be used.", arg[:4]) creator_id = arg[:4] elif opt == "--filter": filter = arg elif opt == "--bookmarks": if arg == "both": bookmarks = 'true' bookmark_pages = 'true' elif arg == "links": bookmarks = 'true' bookmark_pages = 'false' elif arg == "pages": bookmarks = 'false' bookmark_pages = 'true' else: bookmarks = 'false' bookmark_pages = 'false' else: usage ("Error: Unknown option '%s'" % opt) except getopt.error, text: usage (str(text)) config = ConfigFiles.Configuration (pluckerhome, pluckerdir, extra_sections=extra_sections, error_logger=error) if '_jython' in sys.builtin_module_names: config.set('jython', 1) if http_proxy is None: http_proxy = config.get_string ( 'http_proxy' ) if http_proxy is not None: os.environ['HTTP_PROXY'] = http_proxy if http_proxy_user is None: http_proxy_user = config.get_string ( 'http_proxy_user' ) if http_proxy_user is not None: os.environ['HTTP_PROXY_USER'] = http_proxy_user if http_proxy_pass is None: http_proxy_pass = config.get_string ( 'http_proxy_pass' ) if http_proxy_pass is not None: os.environ['HTTP_PROXY_PASS'] = http_proxy_pass if pluckerdir is None: pluckerdir = config.get_string ('pluckerdir') if pluckerdir is None: # also not in the config, so we default to plucker home pluckerdir = pluckerhome config.set ('pluckerdir', pluckerdir) if use_file is None and use_cache is None: if config.get_string ('db_file') is not None and config.get_bool ('use_cache'): usage ("Config files specify both a 'db_file' and a 'use_cache=1'.\nYou must decide by specifiying an argument!") if config.get_string ('doc_file') is not None and config.get_bool ('use_cache'): usage ("Config files specify both a 'doc_file' and a 'use_cache=1'.\nYou must decide by specifiying an argument!") if config.get_string ('db_file') is None and config.get_string ('doc_file') is None: if config.get_string ('use_cache') is None: if config.get_int('jython', 0): usage("Output to stdout not allowed in Java version.") elif sys.stdout.isatty(): usage("No output filename specified, and stdout is a terminal!") else: use_file = '' verbosity = 0 else: config.set ('use_cache', 0) if use_file and use_cache: usage ("You must not specify both -f and -c!") if doc_name and use_cache: usage ("Specify -N/--doc-name only with -f!") if use_file and outputstream: usage ("Don't specify a --doc_file when passing an outputstream!") if outputstream: usage ("Output streams not yet supported") if verbosity is None: verbosity = config.get_int('verbosity', 1) PyPlucker.UtilFns.set_verbosity(verbosity) config.set ('verbosity', verbosity) if zlib_compression is None: if config.get_string ('compression') == "doc": zlib_compression = 'false' elif config.get_string ('compression') == "zlib": zlib_compression = 'true' if owner_id: if zlib_compression == 'false': message('Specification of an owner-id forces use of zlib compression...') zlib_compression = 'true' mibenum = None # if not specified on command line, look in .pluckerrc if default_charset is None: default_charset = config.get_string("default_charset") # if we have one, validate it if default_charset is not None: from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum, charset_known_names import string, re mibenum = charset_name_to_mibenum(default_charset) if mibenum: config.set('default_charset', mibenum) else: usage ("Error: Unsupported charset '" + default_charset + "' specified as default charset.\n" " Charset must be either a decimal MIBenum value, or one of " + str(charset_known_names())) # update the config with the user options if use_file is not None: config.set ('doc_file', use_file) config.set ('use_cache', 0) if doc_name is not None: config.set ('doc_name', doc_name) if use_cache is not None: config.set ('use_cache', 1) if use_file: config.set ('use_cache', 0) if category is not None: import string category_count = string.count (category, ";") if category_count < 16: config.set ('category', category) else: usage ("Max number of categories is 16!") if bpp is not None: config.set ('bpp', bpp) if max_depth is not None: config.set ('home_maxdepth', max_depth) if home_url is not None: config.set ('home_url', home_url) if zlib_compression: config.set ('zlib_compression', zlib_compression) if no_url_info: config.set ('no_urlinfo', no_url_info) if seamless_fragments: config.set ('seamless_fragments', seamless_fragments) if link_fragments: config.set ('link_fragments', link_fragments) if stayonhost: config.set ('home_stayonhost', stayonhost) if stayondomain: if stayonhost: message("Warning: --stayonhost and --stayondomain both set. Ignoring --stayondomain.") else: config.set ('home_stayondomain', stayondomain) if staybelow: config.set ('home_staybelow', staybelow) if url_pattern: config.set ('home_url_pattern', url_pattern) if maxheight is not None: config.set ('maxheight', maxheight) if maxwidth is not None: config.set ('maxwidth', maxwidth) if alt_maxheight is not None: config.set ('alt_maxheight', alt_maxheight) if alt_maxwidth is not None: config.set ('alt_maxwidth', alt_maxwidth) if launchable == 1: config.set ('launchable_bit', 1) config.set ('icon', 1) elif launchable == 0: config.set ('launchable_bit', 0) if backup == 1: config.set ('backup_bit', 1) elif backup == 0: config.set ('backup_bit', 0) if copy_protect == 1: config.set ('copyprevention_bit', 1) elif copy_protect == 0: config.set ('copyprevention_bit', 0) if iconfile is not None: config.set ('icon', 1) config.set ('big_icon', iconfile) if owner_id is not None: config.set ('owner_id_build', owner_id) if referrer is not None: config.set ('referrer', referrer) if user_agent is not None: config.set ('user_agent', user_agent) if author is not None: config.set ('author_md', author) if title is not None: config.set ('title_md', title) if mibenum is not None: config.set ('default_charset', mibenum) if statusfile is not None: config.set ('status_file', statusfile) if depthfirst is not None: config.set ('depth_first', 1) if tables is not None: config.set ('tables', 1) if creator_id is None: creator_id = config.get_string ( 'creator_id', 'Plkr' ) if creator_id is not None: config.set ('creator_id', creator_id) if filter is not None: config.set ('filter', filter) if bookmarks is not None: config.set ('bookmarks', bookmarks) if bookmark_pages is not None: config.set ('bookmark_pages', bookmark_pages) for i in range (len (exclusion_lists)): exclusion_lists[i] = os.path.join (pluckerdir, exclusion_lists[i]) execute_commands ("before_command", config) retval = main (config, exclusion_lists) execute_commands ("after_command", config) if tempfile.tempdir is not None and tempfile.template is not None: """ Clean up any temp files """ base = re.compile (tempfile.template) for file in os.listdir (tempfile.tempdir): if base.match (file): os.remove(os.path.join(tempfile.tempdir, file)) return retval if __name__ == '__main__': sys.exit(realmain(None))