#
#  This file is part of Documancer (http://documancer.sf.net)
#
#  Copyright (C) 2002,2003,2004 Vaclav Slavik
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#  $Id: server.py,v 1.11 2004/12/19 22:25:16 vaclavslavik Exp $
#
#  HTTP server serving all the documents
#

import sys, BaseHTTPServer, string, re, urllib
from HTMLParser import HTMLParser, HTMLParseError
from threading import *

import utils, book, providers

# Lock shared by the code in this module that must not run concurrently.
serverLock = Lock()


def runServer(inBackground):
    """Create the document server and start serving requests.

    The server listens on all interfaces on the port reported by
    utils.getServerPort().  If inBackground is true, the serving loop runs
    in a daemon thread and this function returns right after starting it;
    otherwise the serving loop runs in the calling thread.
    """
    httpd = DocumancerHTTPServer(('', utils.getServerPort()))

    if not inBackground:
        httpd.serve_forever()
        return

    worker = Thread(target=httpd.serve_forever)
    # Daemon thread: it must not keep the process alive on its own.
    worker.setDaemon(True)
    worker.start()


class DocumancerHTTPServer(BaseHTTPServer.HTTPServer):
    """HTTPServer whose requests are handled by RequestHandler."""

    def __init__(self, server_addr):
        BaseHTTPServer.HTTPServer.__init__(self, server_addr, RequestHandler)

    def handle_error(self, request, client_address):
        """Ignore errors raised while handling a request."""
        pass


# NB: this is an ugly hack -- we need information about the number of
#     highlighted hits so that we can navigate among them, but there's no
#     way to pass this information from HTTP request (FIXME: maybe by some
#     JavaScript hack?) through Mozilla to Documancer, so we keep dictionary
#     of fulfilled highlighting requests and look into it from Documancer
#     code.
# Maps the full request URL ('http://localhost:<port><path>') of every served
# highlighting request to the number of hits highlighted in that page.
# Filled in by RequestHandler; entries are never removed.
queryHitsCounts = {}


class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serve book documents over HTTP to clients on 127.0.0.1.

    URLs have the form /<mangled book name>/<document url>[?query=<words>].
    When a query is given, its words are highlighted in HTML documents.
    """

    def send_document(self, contents, mime):
        """Send `contents` as a 200 response with Content-Type `mime`.

        Errors raised while sending are deliberately swallowed (best effort).
        """
        try:
            self.send_response(200)
            self.send_header('Content-Type', mime)
            self.send_header('Content-Length', str(len(contents)))
            self.end_headers()
            # we can't send the data in one chunk because TCP buffer may be
            # too small for them and e.g. Windows doesn't like it:
            CHUNKSIZE = 1024
            for pos in range(0, len(contents), CHUNKSIZE):
                self.wfile.write(contents[pos:pos + CHUNKSIZE])
        except Exception:
            # strangely, "except IOError" causes a problem: Documancer's
            # server thread hangs and stops responding; but IOError exception
            # is *not* caught.
            # NOTE(review): presumably the hang came from an escaping
            # exception leaving serverLock held in do_GET -- confirm.
            pass

    def log_message(self, *args):
        """Suppress the default per-request logging."""
        pass

    def do_GET(self):
        """Handle a GET request; only requests from 127.0.0.1 are served."""
        if self.client_address[0] != '127.0.0.1':
            self.send_error(403, 'Request forbidden')
            return

        serverLock.acquire()
        try:
            self._serve_locked()
        finally:
            # Always release the lock, even if serving raised; otherwise
            # every later request would block forever on acquire().
            serverLock.release()

    def _serve_locked(self):
        """Resolve self.path to a book document and send it (lock held)."""
        urlAndParams = self.path.split("?")
        if len(urlAndParams) == 2 and urlAndParams[1][:6] == "query=":
            query = urllib.unquote(urlAndParams[1][6:].replace("+", " "))
        else:
            query = None

        # parts[0] is the empty string before the leading slash, parts[1]
        # the mangled book name, the rest is the document URL in the book.
        parts = urlAndParams[0].split("/")
        if len(parts) < 3:
            self.send_error(404, "Invalid URL")
            return

        bookId = utils.demangleBookName(parts[1])
        url = urllib.unquote("/".join(parts[2:]))

        if bookId not in book.books:
            self.send_error(404, "Unknown book")
            return

        bookObj = book.books[bookId]
        prov = providers.providers[bookObj.provider]
        reply = prov.serve(bookObj, url)
        if reply is None:
            self.send_error(404, "File not found (%s)" % url)
            return

        data, mime = reply
        if query is not None:
            data, hitsCount = highlightQuery(data, query, mime)
            key = 'http://localhost:%i%s' % (utils.getServerPort(), self.path)
            queryHitsCounts[key] = hitsCount
        self.send_document(data, mime)


class HighlightingParser(HTMLParser):
    """Re-emit an HTML document with occurrences of given words highlighted.

    `words` is a non-empty list of regular expression fragments (already
    escaped by the caller).  Feed the document with feed()/close(), then
    read the rewritten markup with get_result() and the number of
    highlighted matches with get_hits_count().
    """

    # Markup wrapped around every hit; arguments are (hit number, text).
    # NOTE(review): the wrapper markup was missing from this file (the format
    # string had no markup and only one placeholder, which raises TypeError).
    # The id scheme below is a reconstruction -- confirm it against the code
    # that navigates among the hits.
    HIT_MARKUP = ('<span id="documancer-hit-%i" '
                  'style="background-color: yellow">%s</span>')

    # Elements whose text content must be passed through untouched.
    NO_HIGHLIGHT_TAGS = ('title', 'script', 'style')

    def __init__(self, words):
        HTMLParser.__init__(self)
        self.words = words
        self.result = []
        # Construct regex pattern that matches these words:
        self.regex = re.compile(r"\b(%s)\b" % "|".join(words), re.IGNORECASE)
        self.match_number = -1
        self.do_highlight = 1

    def _escape_attr(self, value):
        """Escape an attribute value for output inside double quotes."""
        # '&' must be replaced first so the other entities are not mangled.
        return (value.replace('&', '&amp;').replace('<', '&lt;')
                     .replace('>', '&gt;').replace('"', '&quot;'))

    def _format_tag(self, tag, attrs, closer):
        """Return '<tag attr="value" ...>' markup; `closer` is '' or '/'."""
        pieces = [tag]
        for name, value in attrs:
            if value is None:
                # Attribute without a value, e.g. <input disabled>.
                pieces.append(name)
            else:
                pieces.append('%s="%s"' % (name, self._escape_attr(value)))
        return '<%s%s>' % (' '.join(pieces), closer)

    def handle_starttag(self, tag, attrs):
        self.result.append(self._format_tag(tag, attrs, ''))
        if tag in self.NO_HIGHLIGHT_TAGS:
            self.do_highlight = 0

    def handle_startendtag(self, tag, attrs):
        self.result.append(self._format_tag(tag, attrs, '/'))

    def handle_endtag(self, tag):
        self.result.append('</%s>' % tag)
        if tag in self.NO_HIGHLIGHT_TAGS:
            self.do_highlight = 1

    def handle_data(self, data):
        if self.do_highlight:
            self.result.append(self.regex.sub(self._replacement, data))
        else:
            self.result.append(data)

    def _replacement(self, match):
        # Adds some markup around a regex match
        self.match_number += 1
        return self.HIT_MARKUP % (self.match_number, match.group(0))

    def handle_charref(self, name):
        self.result.append('&#%s;' % name)

    def handle_entityref(self, name):
        self.result.append('&%s;' % name)

    # Pass comments, declarations (e.g. doctype) and processing instructions
    # through unchanged instead of dropping them from the document.
    def handle_comment(self, data):
        self.result.append('<!--%s-->' % data)

    def handle_decl(self, decl):
        self.result.append('<!%s>' % decl)

    def handle_pi(self, data):
        self.result.append('<?%s>' % data)

    def get_result(self):
        """Return the rewritten document as a single string."""
        return ''.join(self.result)

    def get_hits_count(self):
        """Return the number of matches highlighted so far."""
        return self.match_number + 1


def highlightQuery(html_text, query, mime):
    """Highlight matches of the query words in HTML markup.

    Returns a tuple (text, hits count).  The text is returned unchanged with
    a count of 0 when `mime` is not 'text/html', when the query contains no
    real words, or when the document cannot be parsed.
    """
    if mime != 'text/html':
        return (html_text, 0)

    # Only highlight real words, not operators:
    words = [re.escape(word) for word in query.split(' ')
             if word and word not in ("and", "or", "not")]
    if not words:
        return (html_text, 0)

    try:
        hp = HighlightingParser(words)
        hp.feed(html_text)
        hp.close()
    except HTMLParseError:
        return (html_text, 0)
    return (hp.get_result(), hp.get_hits_count())