#
#  This file is part of Documancer (http://documancer.sf.net)
#
#  Copyright (C) 2002-2005 Vaclav Slavik
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#  $Id: indexer.py,v 1.35 2005/02/05 10:36:19 vaclavslavik Exp $
#
#  Fulltext search index, using PyLucene
#

import os, os.path, tempfile, urllib, urlparse, sys, re, string, shutil, sets
from HTMLParser import HTMLParser, HTMLParseError
import utils, providers, book, cache
import indexers

# Lazily created indexer shared by the whole module (see getFulltextIndexer).
_fulltextIndexer = None


def getFulltextIndexer():
    """Returns FulltextIndexer object to use.

    The object is created with indexers.createBest() on first use and then
    reused until shutdown() is called."""
    global _fulltextIndexer
    if _fulltextIndexer is None:
        _fulltextIndexer = indexers.createBest()
    return _fulltextIndexer


def shutdown():
    """Drops the shared indexer; the next getFulltextIndexer() call creates
    a new one."""
    global _fulltextIndexer
    _fulltextIndexer = None


def getCacheObject(book):
    """Returns the 'index.lucene' cache object of given @a book."""
    return cache.get(book).objects['index.lucene']


class FulltextCacheObject(cache.DirCacheObject):
    """Cache object for indexing of given @a book.
       @a deps is list of cache objects that the index depends on
       (typically, list of all indexed files)."""

    def __init__(self, book, deps):
        cache.DirCacheObject.__init__(self, book, 'index.lucene', deps)

    def doUpdate(self, filename, ctrl):
        """Runs the base class update and then rebuilds the index in
        @a filename. Returns the result of updateIndex()."""
        cache.DirCacheObject.doUpdate(self, filename, ctrl)
        return updateIndex(self.book, filename, ctrl)

    def isActive(self):
        """Returns true if the book has its 'indexed' attribute set to '1'."""
        return self.book.getAttr(book.ATTR_INDEXED) == '1'


def disableIndex(book):
    """Clears the fulltext index cache object of @a book."""
    getCacheObject(book).clear()


def updateIndex(bk, idir, ctrl):
    """Builds fulltext index of book @a bk in directory @a idir.

    Starts at the URL the book's provider returns from getURLForIndexing()
    and follows links discovered by fetchPages() until no unvisited page is
    left. Progress is reported with ctrl.message(); the crawl stops early
    when ctrl.cancel becomes true.

    Returns True if all pages were processed, False if cancelled."""
    # The return value is not needed here; the call itself is kept in case
    # it has side effects -- TODO confirm and drop if it has none.
    utils.getServerPort()

    prov = bk.getProviderObj()
    startURL = prov.getURLForIndexing(bk)

    indexer = getFulltextIndexer()
    indexer.startIndexing(idir)

    retval = True
    try:
        # determine which files to scan:
        urlPrefix = urlparse.urljoin(startURL, './')  # dir of startURL

        # fetch the page plus all pages we discover:
        knownURLs = sets.Set()
        todo = sets.Set()
        todo.add(startURL)
        knownURLs.add(startURL)

        while todo:
            if ctrl.cancel:
                retval = False
                break
            page = todo.pop()
            # knownURLs always contains at least startURL, so the division
            # is safe:
            percent = 100 * (1 - float(len(todo)) / float(len(knownURLs)))
            ctrl.message('[%02i%%] %s' % (percent, page))
            fetchPages(page, bk, indexer, todo, knownURLs, urlPrefix)
    finally:
        # always close the index, even if crawling raised or was cancelled:
        indexer.stopIndexing()

    return retval


# types of text (sorted by decreasing importance):
TXT_TITLE = 0
TXT_HEADING = 1
TXT_SUBHEADING = 2
TXT_EMPHASIZED = 3
TXT_NORMAL = 4


class HtmlProcessor(HTMLParser):
    """
    Processes HTML code and does three things with it:
      * extracts URLs
      * extracts content type if present
      * categorizes text by its importance (based on what tags surround it)

    Results are available in attributes 'urls' (list of href values),
    'content_type' (string) and 'text' (list of strings indexed by the
    TXT_* constants).
    """

    # Tags that change importance of the text they enclose:
    _TAG_LEVELS = {
        'title': TXT_TITLE,
        'h1': TXT_HEADING,
        'h2': TXT_HEADING,
        'h3': TXT_SUBHEADING,
        'h4': TXT_SUBHEADING,
        'u': TXT_EMPHASIZED,
        'b': TXT_EMPHASIZED,
        'i': TXT_EMPHASIZED,
    }

    def __init__(self):
        HTMLParser.__init__(self)
        self.content_type = 'text/html'
        # stack of (tag, importance) pairs; the bottom entry is a sentinel
        # that is never popped:
        self.curType = [('', TXT_NORMAL)]
        # accumulated text, one string per TXT_* importance level:
        self.text = ['', '', '', '', '']
        self.urls = []

    def handle_starttag(self, tag, attrs):
        cur_level = self.curType[-1][1]

        if tag == 'meta':
            at = dict(attrs)
            # attributes without a value are reported as None, so don't
            # call string methods on them blindly:
            if (at.get('http-equiv') or '').lower() == 'content-type':
                content = at.get('content')
                if content:
                    self.content_type = content
            return

        if tag == 'a':
            for attr, value in attrs:
                if attr == 'href' and value is not None:
                    self.urls.append(value)
            return

        try:
            new_level = self._TAG_LEVELS[tag]
        except KeyError:
            return  # the tag doesn't influence text importance

        # <h1><b>foo</b></h1> should be heading, not emphasized text, so
        # we have to keep track of the highest current rating:
        self.curType.append((tag, min(new_level, cur_level)))

    def handle_endtag(self, tag):
        if tag in self._TAG_LEVELS:
            cur_tag, cur_level = self.curType[-1]
            if cur_tag == tag:
                self.curType.pop()
            else:
                # Incorrect HTML code, the tags don't match. We have no
                # idea what's going on, so let's bail out and reset the
                # memory to initial state:
                self.curType = [('', TXT_NORMAL)]

    def handle_data(self, data):
        # append the text to the bucket of the current importance level:
        level = self.curType[-1][1]
        self.text[level] = ' '.join((self.text[level], data))


def _recode(txt, charset):
    """Converts @a txt to unicode using @a charset; falls back to
    iso-8859-1 if the charset is unknown. Text that already is unicode is
    returned unchanged."""
    try:
        try:
            return unicode(txt, charset, errors='replace')
        except LookupError:
            return unicode(txt, 'iso-8859-1', errors='replace')
    except TypeError:
        # already in unicode
        return txt


def indexHTMLFile(processed, contenttype, indexer, url):
    """Adds a parsed page to the index.

    @a processed is HtmlProcessor that was already fed with the page,
    @a contenttype is the content type the page was served with (if it
    carries a charset, it overrides the one found in the page itself),
    @a indexer is the fulltext indexer and @a url identifies the page."""
    prefix = 'text/html; charset='

    if contenttype.startswith(prefix):
        processed.content_type = contenttype

    if processed.content_type.startswith(prefix):
        charset = processed.content_type[len(prefix):].strip()
    else:
        charset = 'iso-8859-1'

    # if the page doesn't have title, use heading instead:
    if not processed.text[TXT_TITLE]:
        processed.text[TXT_TITLE] = processed.text[TXT_HEADING]

    data = {
        'title': _recode(processed.text[TXT_TITLE], charset),
        'headings': _recode(processed.text[TXT_HEADING], charset),
        'subheadings': _recode(processed.text[TXT_SUBHEADING], charset),
        'emphasized': _recode(processed.text[TXT_EMPHASIZED], charset),
        'contents': _recode(processed.text[TXT_NORMAL], charset),
    }
    indexer.indexDocument(url, data)


def fetchPages(page, b, indexer, todo, knownURLs, urlPrefix):
    """Indexes single @a page of book @a b and collects links from it.

    Links that are book-local and whose path starts with @a urlPrefix are
    added to both @a knownURLs and @a todo (unless already known). Pages
    the provider doesn't serve and pages that are neither text/html nor
    text/plain are silently skipped."""
    #print page, '(todo:%i, found:%i)' % (len(todo), len(knownURLs))

    provider = b.getProviderObj()
    result = provider.serve(b, urllib.unquote(page))
    if result is None:
        # invalid page, not served by the provider
        return

    dlfile, contenttype = result

    if not (contenttype.startswith('text/html') or
            contenttype == 'text/plain'):
        # we don't know how to index this file
        return

    proc = HtmlProcessor()
    try:
        proc.feed(dlfile)
        proc.close()
    except HTMLParseError:
        pass  # what else can we do? (index whatever was parsed so far)

    try:
        # index the file:
        indexHTMLFile(proc, contenttype, indexer, page)
    except Exception:
        # FIXME: what?!
        # One broken page must not abort indexing of the whole book, so
        # only report the problem and carry on with the links.
        import traceback
        traceback.print_exc()

    for url in proc.urls:
        # normalize the URL to be absolute:
        try:
            url = urlparse.urljoin(page, url)
            scheme, host, path, query, fragment = urlparse.urlsplit(url)
            # We're only interested in files from Documancer's server.
            # They'll have empty protocol and host because we pass them as
            # book-local URLs here (e.g. "/usr/share/doc/boost/index.html"):
            if scheme == '' and host == '' and path.startswith(urlPrefix):
                # drop the fragment, it's still the same page:
                url = urlparse.urlunsplit((scheme, host, path, query, ''))
                if url not in knownURLs:
                    knownURLs.add(url)
                    todo.add(url)
        except UnicodeDecodeError:
            # links with non-ascii chars can cause this exception; ignore
            # these invalid links, we can't handle them gracefully
            pass  # FIXME: report warning to the user?


def search(b, query):
    """Searches fulltext index of book @a b for @a query.

    Returns list of (title, url, score) tuples, where url has the query
    appended as '?query=...' and score is a string with the match score
    multiplied by 100. If the book has no usable index, an error is
    reported through utils.uiCallback and empty list is returned."""
    obj = getCacheObject(b)
    if not obj.isActive() or not obj.exists() or \
       not os.path.isfile(os.path.join(obj.getFilename(), 'segments')):
        utils.uiCallback.error(
                "Book '%s' doesn't have fulltext index!" % b.title)
        return []

    idir = obj.getFilename()
    result = []

    querystr = '?query=%s' % urllib.quote(query, "")

    # Match in more important parts of the page is boosted. The clauses
    # have to be separated by whitespace, otherwise a query beginning with
    # a digit would be glued to the preceding boost factor.
    myquery = ' '.join(["title:%s^10" % query,
                        "headings:%s^5" % query,
                        "subheadings:%s^4" % query,
                        "emphasized:%s^2" % query,
                        query])
    #FIXME: escaping of query string!
    #FIXME: allow or and and expressions

    # Retrieve matches from the index:
    obj.lock()
    try:
        for r in getFulltextIndexer().search(idir, myquery):
            score = '%.1f' % (r.score * 100)
            url = b.makeFullURL(r.url) + querystr
            result.append((r.title, url, score))
    finally:
        obj.unlock()

    return result