#!/usr/bin/env python # -*- mode: python; indent-tabs-mode: nil; -*- """ Writer.py $Id: Writer.py,v 1.39 2004/04/08 00:57:34 chrish Exp $ Write a collection of documents into various formats. Currently implemented: the traditional Plucker cache format. Copyright 1999,2000 by Holger Duerer Distributable under the GNU General Public License Version 2 or newer. """ import os, struct, string, time, helper.PQAAppInfo, sys, urllib import PyPlucker from PyPlucker import Url, PluckerDocs from PyPlucker.helper import prc, dict from PyPlucker.helper.CharsetMapping import charset_mibenum_to_name from PyPlucker.UtilFns import message, error from PyPlucker.Url import CompareURL class Mapper: """This class handles all the mappings from URLs and PluckerDocs.PluckerDocument instances to record-IDs. It contains a method "get_docs" which returns all the PluckerDocument instances it knows about; it contains a method "build_links" which returns a sequence of all the URLs in record-ID order, with zero-length URLs for unused record-IDs. It contains a method "print_mapping" which sends a display of the mapping to stderr. Finally, it contains a method "get_or_add", which takes either a URL or a PluckerDocument instance, and returns its record-ID.""" def __init__ (self, collection, alias_list): # maintains a mapping of URLs to PluckerDocs.PluckerDocument instances. # Keys are either a string URL, in which case the value is just a single instance, # or a (url, fragment-id) pair, in which case the value is a (doc-instance, paragraph-number) pair. self._url_to_doc_mapping = {} temp_list = [] for (key, doc) in collection.items(): url = string.split(key, '\0')[0] self._url_to_doc_mapping[url] = doc # record internal URL name, as well if isinstance(doc, PluckerDocs.PluckerDocument): self._url_to_doc_mapping[doc.get_url()] = doc # has sub-docs? If so, get them and record them subdocs = doc.get_documents() if len(subdocs) > 1: # first subdoc is always the main doc, so we skip that for subdoc in subdocs[1:]: self._url_to_doc_mapping[subdoc.get_url()] = subdoc # check for internal fragment names in the page name_mapping = isinstance(doc, PluckerDocs.PluckerTextDocument) and doc.get_name_map() if name_mapping: for (name, (internalurl, paragraph_number)) in name_mapping.items(): internalurl = alias_list.get(internalurl, internalurl) temp_list.append((key, url, name, internalurl, paragraph_number,)) for (key, url, name, internalurl, paragraph_number) in temp_list: doc = self._url_to_doc_mapping.get(internalurl) if not doc: sys.stderr.write("***** Can't find doc for URL " + str((key, url, internalurl, paragraph_number,)) + '\n') else: self._url_to_doc_mapping[(url, name)] = (doc, paragraph_number) # a list of URL->URL mappings self._alias_list = alias_list or {} # Maps a PluckerDocs.PluckerDocument instance to a record ID. # Keys are either a doc, or a tuple containing (doc, para#). self._doc_to_id_mapping = {} # Maps straight URLs without documents to a record ID. # Used mainly for external links. self._url_to_id_mapping = {} # first record ID issued. Records 1-10 are reserved. self._current_id = 11 # make sure record number 2 goes to the 'home' document (why?) url = self._alias_list.get('plucker:/home.html') if url: while self._alias_list.has_key(url): url = self._alias_list.get(url) doc = self._url_to_doc_mapping.get(url) if doc: self._doc_to_id_mapping[doc] = 2 else: self._url_to_id_mapping[url] = 2 else: doc = self._url_to_doc_mapping.get('plucker:/home.html') if doc: self._doc_to_id_mapping[doc] = 2 else: self._url_to_id_mapping[url] = 2 if doc: # note that the first part is already done parts = doc.get_documents()[1:] for subdoc in parts: self._get_id_for_doc(doc) # finally, make sure each doc has an ID assigned sorted_list=collection.items() sorted_list.sort(lambda x, y: CompareURL(x[0],y[0])) for (url, doc) in sorted_list: parts = doc.get_documents() for subdoc in parts: self._get_id_for_doc(subdoc) del sorted_list def _get_id_for_doc(self, idoc, add=1): if type(idoc) == type(()): doc = idoc[0] else: doc = idoc id = self._doc_to_id_mapping.get(doc) if not id: id = self._url_to_id_mapping.get(doc.get_url()) if id: self._doc_to_id_mapping[doc] = id if not id: if not add: return None if isinstance(doc, PluckerDocs.PluckerIndexDocument): # there's only one, and it always has record # 1 id = 1 elif isinstance(doc, PluckerDocs.PluckerBookmarkDocument): id = 6 elif isinstance(doc, PluckerDocs.PluckerLinkIndexDocument): id = 3 elif isinstance(doc, PluckerDocs.PluckerCategoryDocument): id = 4 elif isinstance(doc, PluckerDocs.PluckerMetadataDocument): id = 5 else: id = self._current_id self._current_id = self._current_id + 1 self._doc_to_id_mapping[doc] = id url_mapping = self._url_to_doc_mapping.get(doc.get_url()) if (url_mapping != doc): if (url_mapping != None): message("URL %s for doc %s points to doc %s\n" % (doc.get_url(), str(doc), str(url_mapping))) self._url_to_doc_mapping[doc.get_url()] = doc # message("new document " + str(doc) + " => " + str(id) + "\n") if type(idoc) == type(()): return (id, idoc[1]) else: return id def _get_id_for_url (self, url, add=1): doc = self._url_to_doc_mapping.get(url) id = doc and self._get_id_for_doc(doc, add) id = id or self._url_to_id_mapping.get(url) if not id: # possibly valid main part, but invalid tag. Return ID for main part in that case. if type(url) == type(()): id = self._get_id_for_url(url[0], 0) if not id and add: # OK, no ID, but we should assign one id = self._current_id self._current_id = self._current_id + 1 self._url_to_id_mapping[url] = id # message("** Gave ID %s to url %s\n" % (id, url)) return id def get_or_add (self, url_or_doc): # For a standard URL, returns the numeric record ID. # For a URL which has a fragment-id: # If the fragment is a paragraph of a text page, a pair # (record-id, paragraph-id) is returned. # Otherwise, just the record id is returned. # If arg is PluckerDocument, returns the id assigned for that document. # If arg is integer, treats it as a registered-document id. Get-only. if type(url_or_doc) == type(''): import urllib url, tag = urllib.splittag(url_or_doc) finalurl = self._alias_list.get(url, url) if tag: id = self._get_id_for_url((finalurl, tag)) else: id = self._get_id_for_url(finalurl) return id elif isinstance(url_or_doc, PluckerDocs.PluckerDocument): url = url_or_doc.get_url() if not self._url_to_doc_mapping.has_key(url): self._url_to_doc_mapping[url] = url_or_doc if not self._doc_to_id_mapping.has_key(url_or_doc) and self._url_to_id_mapping.has_key(url): self._doc_to_id_mapping[url_or_doc] = self._url_to_id_mapping[url] if not self._doc_to_id_mapping.has_key(url_or_doc): message(2, "New document %s added", url_or_doc) return self._get_id_for_doc(url_or_doc) else: raise ValueError("not a URL or an instance of " + str(PluckerDocs.PluckerDocument)) def build_links (self): # build and return a list of the URL strings for all IDs used key_dict = self._url_to_doc_mapping.copy() key_dict.update(self._url_to_id_mapping) # build a list of all URLs and associated IDs for key in key_dict.keys(): if type(key) == type('') and len(key) > 7 and key[:7] == 'mailto:': del key_dict[key] continue if type(key) == type(()): # either resolved tag, in which case value is tuple, # or unresolved tag, in which case value is integer value = key_dict[key] del key_dict[key] if type(value) == type(()): # truncate key to just plain record key = key[0] value = value[0] else: key = key[0] + '#' + key[1] key_dict[key] = value if isinstance(key_dict[key], PluckerDocs.PluckerDocument): key_dict[key] = self._get_id_for_doc(key_dict[key]) # invert the dictionary for item in key_dict.items(): del key_dict[item[0]] key_dict[item[1]] = item[0] # build up the list of URLs urls = [] for i in range(self._current_id): urls.append(key_dict.get(i) or '') urls[1] = '' # no URL needed for index record return urls def get_docs(self): # return a list of all the PluckerDocuments known to the mapper return self._doc_to_id_mapping.keys() def print_mapping(self): # print a list of all the URL's and associated IDs message(0, '*********\n') message(0, 'PluckerDoc record ids:') for (doc, id) in self._doc_to_id_mapping.items(): #sys.stderr.write(str(doc) + ' ' + str(id) + '\n') if type(doc) == type(()): url = doc[0].get_url() message(0, '%70s => %3d (%s)\n' % (url, id, str(doc[1]))) else: url = doc.get_url() message(0, '%70s => %3d\n' % (url, id)) if len(self._url_to_id_mapping) > 0: message(0, 'Non-included URL record ids:') for (url, id) in self._url_to_id_mapping.items(): message(0, '%70s => %3d\n' % (url, id)) message(0, '*********\n') class Writer: """Abstract base class from which to derive the various writers for documents""" def __init__ (self, collection, config, urlmapper=None): self._collection = collection self._config = config self._mapper = urlmapper def save_data (self, data, url, id, verbose): """This needs to be implemented in the derived class to actually output the 'data' (human readably denoted as 'url') as something with id 'id'.""" raise NotImplementedError("PyPlucker.Writer.Writer.save_doc()") def _write_doc (self, out_dict, pluckerdoc, url, id, verbose): def _print_convert_msg (url, id, verbose, config): if verbose > 1: line_length = config.get_int('status_line_length', 60) urltext = str (url) if len (urltext) > line_length: urltext = urltext[:line_length - 20] + "....." + urltext[-15:] message("Converted %4d: %s" % (id, urltext)) if id != self._mapper.get_or_add(pluckerdoc): raise ValueError("bad id %d instead of %d" % (id, self._mapper.get_or_add(pluckerdoc))) if pluckerdoc.is_text_document (): dumps = pluckerdoc.dump_record_with_splits (self._mapper) # sys.stderr.write("dumps is %s\n" % str(map(lambda p: (p[0], p[1]), dumps))) if dumps[0][1] != id: message("****** bad id %d instead of %d" % (dumps[0][1], id,)) for dump in dumps: (the_url, the_id, dump) = dump if the_id == 0: the_id = id # original out_dict [the_id] = (dump, the_url, the_id, verbose) _print_convert_msg(the_url, the_id, verbose, self._config) return else: dump = pluckerdoc.dump_record (id) out_dict [id] = (dump, url, id, verbose) _print_convert_msg(url, id, verbose, self._config) def write (self, verbose, alias_list=None): """Write out the collection. Returns the mapping that was used to generate the ids.""" def _print_convert_msg (url, verbose, config): if verbose > 0: line_length = config.get_int('status_line_length', 60) urltext = str (url) if len (urltext) > line_length: urltext = urltext[:line_length - 20] + "....." + urltext[-15:] message("Converting %s..." % urltext) self._mapper = Mapper(self._collection, alias_list.as_dict()) # figure default charset mibenum = self._config.get_int('default_charset', 0) or None charsets = {} if verbose > 2: self._mapper.print_mapping() out_dict = {} bookmarks = {} for pluckerdoc in self._mapper.get_docs(): id = self._mapper.get_or_add(pluckerdoc) _print_convert_msg(pluckerdoc.get_url(), verbose, self._config) if pluckerdoc.is_multiimage_document (): pluckerdoc.resolve_ids (self._mapper) if pluckerdoc.is_table_document (): pluckerdoc.resolve_ids (self._mapper) if pluckerdoc.is_text_document (): pluckerdoc.resolve_ids (self._mapper) doc_mibenum = pluckerdoc.get_charset() if verbose > 2: charset_name = charset_mibenum_to_name(doc_mibenum) message(2, pluckerdoc.get_url() + ' has charset ' + str(doc_mibenum) + ((charset_name and " (" + charset_name + ")") or "") + "\n") if charsets.has_key(doc_mibenum): charsets[doc_mibenum].append(id) else: charsets[doc_mibenum] = [id] # Add doc.bookmarks to bookmark list if self._config and self._config.get_bool('bookmark_pages', 0): key = pluckerdoc.get_url() pid = self._mapper.get_or_add(key) key = string.split(key, ":") key = key[-1] key = string.split(key, "/") key = key[-1] key = string.split(key, "?") key = key[0] if not len(key): key = 'Home Page' if not bookmarks.has_key(key): bookmarks[key] = (pid, 0) if self._config and self._config.get_bool('bookmarks', 0): tmp_book = pluckerdoc.get_bookmark_ids() for key in tmp_book.keys(): if not bookmarks.has_key(key): bookmarks[key] = tmp_book[key] self._write_doc (out_dict, pluckerdoc, pluckerdoc.get_url(), id, verbose) ## Do some error checking if not out_dict.has_key (2): raise RuntimeError("The collection process failed to generate a 'home' document") ## set up the metadata mapping, if any metadata = {} # set the default to the charset which has the 'most' pages items = charsets.items() if len(items) > 0: # have to allow for image-only document items.sort(lambda x, y: ((len(x[1]) < len(y[1]) and 1) or ((len(x[1]) > len(y[1])) and -1) or 0)) mibenum = items[0][0] odd_charsets = [] if len(items) > 1: for item in items[1:]: for id in item[1]: odd_charsets.append((id, item[0] or 0,)) else: mibenum = None odd_charsets = [] if mibenum != None: metadata['CharSet'] = mibenum if verbose > 1: charset_name = charset_mibenum_to_name(mibenum) message('Default charset is MIBenum ' + str(mibenum) + ((charset_name and " (" + charset_name + ")") or "")) else: message('No default charset') if len(odd_charsets) > 0: metadata['ExceptionalCharSets'] = odd_charsets message("ExceptionalCharSets is " + str(odd_charsets) + "\n") intended_owner = self._config.get_string('owner_id_build') if intended_owner: metadata['OwnerID'] = intended_owner message(2, "OwnerID is '%s'", intended_owner) author = self._config.get_string('author_md') if author: metadata['Author'] = author message(2, "Author is '%s'", author) title = self._config.get_string('title_md') if title: metadata['Title'] = title message(2, "Title is '%s'", title) ## write the index record tmp_url = "plucker:/~special~/index" type = PluckerDocs.PluckerIndexDocument (tmp_url, self._config, metadata, bookmarks) self._write_doc (out_dict, type, tmp_url, 1, verbose) ## write the bookmark record (if any) if len(bookmarks): tmp_url = "plucker:/~special~/bookmarks" bookdoc = PluckerDocs.PluckerBookmarkDocument(tmp_url, bookmarks) self._write_doc (out_dict, bookdoc, tmp_url, 6, verbose) ## write the URL information, if desired if not self._config.get_bool ('no_urlinfo', 0): links = self._mapper.build_links() # for i in range(len(links)): # message(0, "%3d: '%s'", i, links[i]) linksdocs = [] for i in range(1, len(links), 200): tmp_url = "plucker:/~special~/links" + str(i) linksdoc = PluckerDocs.PluckerLinksDocument(tmp_url, links, i) self._mapper.get_or_add(linksdoc) linksdocs.append(linksdoc) # now make links index tmp_url = "plucker:/~special~/pluckerlinks" indexdoc = PluckerDocs.PluckerLinkIndexDocument(tmp_url, linksdocs, self._mapper) self._mapper.get_or_add(indexdoc) # OK, write the links index document self._write_doc (out_dict, indexdoc, tmp_url, 3, verbose) # and write the various links documents for doc in linksdocs: self._write_doc (out_dict, doc, doc.get_url(), self._mapper.get_or_add(doc), verbose) ## write the category information, if present if self._config.get_string ('category') is not None: tmp_url = "plucker:/~special~/category" type = PluckerDocs.PluckerCategoryDocument (tmp_url, self._config) self._write_doc (out_dict, type, tmp_url, 4, verbose) ## write the metadata record, if any if metadata: tmp_url = "plucker:/~special~/metadata" type = PluckerDocs.PluckerMetadataDocument (tmp_url, metadata) self._write_doc (out_dict, type, tmp_url, 5, verbose) ## now write everything else the_ids = out_dict.keys () the_ids.sort () # they are numeric, so sort does the right thing for id in the_ids: dump, the_url, the_id, verbose = out_dict[id] self.save_data (dump, the_url, the_id, verbose) if verbose: line_length = self._config.get_int('status_line_length', 60) urltext = str (the_url) if len (urltext) > line_length: urltext = urltext[:line_length - 20] + "....." + urltext[-15:] message("Wrote %d <= %s" % (the_id, urltext)) return self._mapper class CacheWriter (Writer): """A Writer that writes the traditional format of a separate files in a cache directory""" def __init__ (self, collection, config, cachedir): Writer.__init__ (self, collection, config) self._cachedir = cachedir def write (self, verbose, alias_list): cachedir = os.path.expandvars (self._cachedir) cachedir = os.path.expanduser (cachedir) if not os.path.exists (cachedir): error("%s does not exists!" % cachedir) return if not os.path.isdir (cachedir): error("%s is not a directory" % cachedir) return # clear the cache directory for name in os.listdir (cachedir): fname = os.path.join (cachedir, name) if os.path.isfile (fname): os.unlink (fname) # Now call the super class to do the actual work return Writer.write (self, verbose, alias_list=alias_list) def save_data (self, data, url, id, verbose): filename = os.path.join (self._cachedir, "%d" % id) file = open (filename, "wb") file.write (data) file.close () class PDBWriter (Writer): """A Writer that writes the items into a ready-to-synch PDB file.""" def __init__ (self, collection, config, name, version, filename): Writer.__init__ (self, collection, config) self._filename = filename self._dbname = name self._dbversion = version self._pdb_file = None self._flag_copy_prevention = config.get_bool ('copyprevention_bit') self._flag_launchableData = config.get_bool ('launchable_bit') self._flag_backup = config.get_bool ('backup_bit') self._icon = config.get_bool ('icon') or config.get_bool('launchable_bit') self._big_icon = config.get_string ('big_icon','') self._small_icon = config.get_string ('small_icon','') self._config = config self._creator_id = config.get_string('creator_id', 'Plkr') def write (self, verbose, alias_list, mapping=None): if os.path.exists (self._filename): os.unlink (self._filename) if self._filename == '': if sys.platform == "win32": import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) self._pdb_file = prc.File (sys.stdout, read=0, write=1) else: self._pdb_file = prc.File (self._filename, read=0, write=1) info = self._pdb_file.getDBInfo () info['name'] = self._dbname info['version'] = self._dbversion info['creator'] = self._creator_id #info['creator'] = 'Plkr' info['type'] = 'Data' info['createDate'] = int (time.time()) info['modifyDate'] = info['createDate'] info['backupDate'] = -2082844800L info['flagCopyPrevention'] = self._flag_copy_prevention info['flagLaunchableData'] = self._flag_launchableData info['flagBackup'] = self._flag_backup if self._icon: self._pdb_file.setAppBlock( \ helper.PQAAppInfo.pqa_app_info_block(self._config, \ self._dbname, \ self._dbversion, \ self._big_icon, \ self._small_icon)) self._pdb_file.setDBInfo (info) # Now call the super class to do the actual work result = Writer.write (self, verbose, alias_list=alias_list) self._pdb_file.close () return result def save_data (self, data, url, id, verbose): assert self._pdb_file is not None, "write_doc called with unintialized pdb file" self._pdb_file.setRecord (attr=0, id=id, cat=0, data=data) class DictWriter (Writer): """A Writer that writes each record into a passed dictionary with the record number as the key""" def __init__ (self, collection, config, dict): Writer.__init__ (self, collection, config) self._dict = dict def save_data (self, data, url, id, verbose): self._dict[id] = data