#
# This file is part of Documancer (http://documancer.sf.net)
#
# Copyright (C) 2002-2005 Vaclav Slavik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id: indexer.py,v 1.35 2005/02/05 10:36:19 vaclavslavik Exp $
#
# Fulltext search index, using PyLucene
#
import os, os.path, tempfile, urllib, urlparse, sys, re, string, shutil, sets
from HTMLParser import HTMLParser, HTMLParseError
import utils, providers, book, cache
import indexers
# Module-wide fulltext indexer; None until getFulltextIndexer() creates it
# (and None again after shutdown()):
_fulltextIndexer = None

def getFulltextIndexer():
    """Returns FulltextIndexer object to use.

    The indexer is created with indexers.createBest() on the first call;
    later calls return the very same object.
    """
    global _fulltextIndexer
    # identity test: we only care whether the indexer was created yet
    if _fulltextIndexer is None:
        _fulltextIndexer = indexers.createBest()
    return _fulltextIndexer
def shutdown():
    """Forgets the module-wide fulltext indexer.

    The module only drops its own reference here; the next call to
    getFulltextIndexer() will create a fresh indexer.
    """
    global _fulltextIndexer
    _fulltextIndexer = None
def getCacheObject(book):
    """Returns the 'index.lucene' cache object of given @a book."""
    bookCache = cache.get(book)
    return bookCache.objects['index.lucene']
class FulltextCacheObject(cache.DirCacheObject):
    """Cache object holding the fulltext index of given @a book.

    @a deps is list of cache objects that the index depends on
    (typically, list of all indexed files).
    """

    def __init__(self, book, deps):
        # the index is always registered under the name 'index.lucene'
        cache.DirCacheObject.__init__(self, book, 'index.lucene', deps)

    def doUpdate(self, filename, ctrl):
        """Runs the base class update and then rebuilds the index in
        @a filename. Returns whatever updateIndex() returns."""
        cache.DirCacheObject.doUpdate(self, filename, ctrl)
        indexed = updateIndex(self.book, filename, ctrl)
        return indexed

    def isActive(self):
        """Returns true if the book's ATTR_INDEXED attribute is '1'."""
        flag = self.book.getAttr(book.ATTR_INDEXED)
        return flag == '1'
def disableIndex(book):
    """Clears the fulltext index cache object of given @a book."""
    indexCache = getCacheObject(book)
    indexCache.clear()
def updateIndex(bk, idir, ctrl):
    """Crawls book @a bk and indexes all discovered pages into @a idir.

    Crawling starts at the URL the book's provider gives for indexing and
    follows links that fetchPages() adds to the work set. Progress is
    reported with ctrl.message(); setting ctrl.cancel stops the crawl.

    Returns True if the crawl ran to completion, False if it was cancelled.
    The indexer's stopIndexing() is called in either case.
    """
    # NOTE(review): the port is not used below; the call is kept in case
    # it has side effects -- confirm and remove if it has none.
    port = utils.getServerPort()
    startURL = bk.getProviderObj().getURLForIndexing(bk)

    indexer = getFulltextIndexer()
    indexer.startIndexing(idir)
    finished = True
    try:
        # only pages under the directory of startURL get scanned:
        urlPrefix = urlparse.urljoin(startURL, './')

        # every URL seen so far / URLs still waiting to be fetched:
        knownURLs = sets.Set([startURL])
        todo = sets.Set([startURL])

        while todo:
            if ctrl.cancel:
                finished = False
                break
            page = todo.pop()
            # share of known pages that are no longer waiting in todo:
            waiting = float(len(todo))
            percent = 100 * (1 - waiting / float(len(knownURLs)))
            ctrl.message('[%02i%%] %s' % (percent, page))
            fetchPages(page, bk, indexer, todo, knownURLs, urlPrefix)
    finally:
        indexer.stopIndexing()
    return finished
# Types of text (sorted by decreasing importance). The values are also used
# as indices into HtmlProcessor.text, so they have to stay 0..4:
TXT_TITLE = 0
TXT_HEADING = 1
TXT_SUBHEADING = 2
TXT_EMPHASIZED = 3
TXT_NORMAL = 4

class HtmlProcessor(HTMLParser):
    """
    Processes HTML code and does three things with it:
      * extracts URLs (hrefs of <a> tags, collected in self.urls)
      * extracts content type if present (<meta http-equiv="content-type">,
        stored in self.content_type, 'text/html' by default)
      * categorizes text by its importance (based on what tags surround it);
        self.text is a list of strings indexed by the TXT_* constants
    """

    # tags that change importance of the text they enclose:
    _TAG_LEVELS = {
        'title': TXT_TITLE,
        'h1': TXT_HEADING,
        'h2': TXT_HEADING,
        'h3': TXT_SUBHEADING,
        'h4': TXT_SUBHEADING,
        'u': TXT_EMPHASIZED,
        'b': TXT_EMPHASIZED,
        'i': TXT_EMPHASIZED,
    }

    def __init__(self):
        HTMLParser.__init__(self)
        self.content_type = 'text/html'
        # stack of (tag, importance) pairs for currently open tags; the
        # first entry is a sentinel that is never removed:
        self.curType = [('', TXT_NORMAL)]
        self.text = ['', '', '', '', '']
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Records content type, links and importance changes."""
        if tag == 'meta':
            at = dict(attrs)
            # attributes written without a value are reported as None,
            # so don't call string methods on them blindly:
            http_equiv = at.get('http-equiv') or ''
            content = at.get('content')
            if http_equiv.lower() == 'content-type' and content:
                self.content_type = content
            return

        if tag == 'a':
            for attr, value in attrs:
                # skip valueless or empty hrefs, they point nowhere new
                if attr == 'href' and value:
                    self.urls.append(value)
            return

        new_level = self._TAG_LEVELS.get(tag)
        if new_level is None:
            return # the tag doesn't affect importance of the text

        # <h1><b>foo</b></h1> should be heading, not emphasized text, so
        # we have to keep track of the highest current rating:
        cur_level = self.curType[-1][1]
        self.curType.append((tag, min(new_level, cur_level)))

    def handle_endtag(self, tag):
        """Leaves the importance level entered by matching start tag."""
        if tag not in self._TAG_LEVELS:
            return
        if self.curType[-1][0] == tag:
            self.curType.pop()
        else:
            # Incorrect HTML code, the tags don't match. We have no
            # idea what's going on, so let's bail out and reset the
            # memory to initial state:
            self.curType = [('', TXT_NORMAL)]

    def handle_data(self, data):
        """Appends @a data to the text of current importance level."""
        level = self.curType[-1][1]
        self.text[level] = ' '.join((self.text[level], data))
def _charsetFromContentType(value):
    """Returns value of the charset parameter of Content-Type string
    @a value (e.g. 'text/html; charset=utf-8') or None if it has none."""
    if not value:
        return None
    for param in value.split(';')[1:]:
        if '=' not in param:
            continue
        name, val = param.split('=', 1)
        if name.strip().lower() == 'charset':
            # tolerate whitespace and quotes around the value
            val = val.strip().strip('"\'')
            if val:
                return val
    return None

def indexHTMLFile(processed, contenttype, indexer, url):
    """Passes text collected from one page to the indexer.

    @a processed is the HtmlProcessor that parsed the page, @a contenttype
    is the content type the page was served with, @a indexer receives the
    data via indexDocument() and @a url identifies the document.

    Charset given by @a contenttype takes precedence over the one found in
    the page itself; iso-8859-1 is used if neither specifies any. As a side
    effect, processed.content_type is overwritten in the former case and
    a blank title in processed.text is replaced with the heading text.
    """
    charset = _charsetFromContentType(contenttype)
    if charset is not None:
        processed.content_type = contenttype
    else:
        charset = _charsetFromContentType(processed.content_type)
        if charset is None:
            charset = 'iso-8859-1'

    # if the page doesn't have title (or only a blank one), use heading
    # instead:
    if not processed.text[TXT_TITLE].strip():
        processed.text[TXT_TITLE] = processed.text[TXT_HEADING]

    def recode(txt):
        if isinstance(txt, unicode):
            # already in unicode
            return txt
        try:
            return unicode(txt, charset, errors='replace')
        except (LookupError, TypeError, ValueError):
            # unknown or malformed charset name, use the default one
            return unicode(txt, 'iso-8859-1', errors='replace')

    data = {
        'title'       : recode(processed.text[TXT_TITLE]),
        'headings'    : recode(processed.text[TXT_HEADING]),
        'subheadings' : recode(processed.text[TXT_SUBHEADING]),
        'emphasized'  : recode(processed.text[TXT_EMPHASIZED]),
        'contents'    : recode(processed.text[TXT_NORMAL])
    }
    indexer.indexDocument(url, data)
def fetchPages(page, b, indexer, todo, knownURLs, urlPrefix):
    """Indexes single @a page of book @a b and queues pages it links to.

    The page is obtained from the book's provider, parsed and handed over
    to @a indexer. Links that are book-local and whose path starts with
    @a urlPrefix are added (with fragment removed) to both @a knownURLs and
    @a todo, unless they are in @a knownURLs already.

    Nothing is returned; discovered pages are reported only by modifying
    @a todo and @a knownURLs. Pages the provider doesn't serve and pages
    that are neither text/html nor text/plain are silently skipped.
    """
    #print page, '(todo:%i, found:%i)' % (len(todo), len(knownURLs))
    provider = b.getProviderObj()
    result = provider.serve(b, urllib.unquote(page))
    if result is None:
        # invalid page, not served by the provider
        return

    dlfile, contenttype = result
    if not (contenttype.startswith('text/html') or
            contenttype == 'text/plain'):
        # we don't know how to index this file
        return

    proc = HtmlProcessor()
    try:
        proc.feed(dlfile)
        proc.close()
    except HTMLParseError:
        pass # what else can we do? (we index what was parsed so far)

    try:
        # index the file:
        indexHTMLFile(proc, contenttype, indexer, page)
    except Exception:
        # FIXME: what?!
        # Indexing is best-effort: report the problem and carry on with
        # the links, so that one bad page doesn't stop the whole crawl.
        import traceback
        traceback.print_exc()

    for url in proc.urls:
        try:
            # normalize the URL to be absolute:
            url = urlparse.urljoin(page, url)
            scheme, host, path, query = urlparse.urlsplit(url)[:4]
            # We're only interested in files from Documancer's server.
            # They'll have empty protocol and host because we pass them as
            # book-local URLs here (e.g. "/usr/share/doc/boost/index.html"):
            if scheme != '' or host != '' or not path.startswith(urlPrefix):
                continue
            # the fragment is dropped, it's still the same page:
            url = urlparse.urlunsplit((scheme, host, path, query, ''))
            if url not in knownURLs:
                knownURLs.add(url)
                todo.add(url)
        except UnicodeDecodeError:
            # links with non-ascii chars can cause this exception; ignore
            # these invalid links, we can't handle them gracefully
            pass # FIXME: report warning to the user?
def search(b, query):
    """Searches fulltext index of book @a b for @a query.

    Returns list of (title, url, score) tuples, where url is the full URL
    of the page with '?query=...' appended and score is a string with the
    match score multiplied by 100. If the book has no usable index, an
    error is reported through utils.uiCallback and empty list is returned.
    """
    obj = getCacheObject(b)
    if not obj.isActive() or not obj.exists() or \
       not os.path.isfile(os.path.join(obj.getFilename(), 'segments')):
        utils.uiCallback.error(
                "Book '%s' doesn't have fulltext index!" % b.title)
        return []

    idir = obj.getFilename()
    querystr = '?query=%s' % urllib.quote(query, "")

    # Matches in more important parts of the page get bigger boost. The
    # clauses have to be separated by whitespace, otherwise a boost value
    # runs directly into the following field name:
    myquery = ' '.join([
        "title:%s^10" % query,
        "headings:%s^5" % query,
        "subheadings:%s^4" % query,
        "emphasized:%s^2" % query,
        query])
    #FIXME: escaping of query string!
    #FIXME: allow or and and expressions

    # Retrieve matches from the index (it is kept locked meanwhile):
    result = []
    obj.lock()
    try:
        for r in getFulltextIndexer().search(idir, myquery):
            score = '%.1f' % (r.score * 100)
            url = b.makeFullURL(r.url) + querystr
            result.append((r.title, url, score))
    finally:
        obj.unlock()
    return result