#
#  This file is part of Documancer (http://documancer.sf.net)
#
#  Copyright (C) 2002-2005 Vaclav Slavik
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License version 2 as
#  published by the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#  $Id: indexer.py,v 1.35 2005/02/05 10:36:19 vaclavslavik Exp $
#
#  Fulltext search index, using PyLucene
#


import os, os.path, tempfile, urllib, urlparse, sys, re, string, shutil, sets
from HTMLParser import HTMLParser, HTMLParseError
import utils, providers, book, cache

import indexers


# Indexer instance shared by the whole module; None until the first call of
# getFulltextIndexer() and again after shutdown().
_fulltextIndexer = None

def getFulltextIndexer():
    """Returns FulltextIndexer object to use.

    The object is created by indexers.createBest() on the first call and
    the very same instance is returned by all subsequent calls.
    """
    global _fulltextIndexer
    # identity test, so that the answer doesn't depend on how (or whether)
    # the indexer object implements comparison:
    if _fulltextIndexer is None:
        _fulltextIndexer = indexers.createBest()
    return _fulltextIndexer

def shutdown():
    """Forgets the shared fulltext indexer object (the module-level
       reference to it is set to None)."""
    global _fulltextIndexer
    _fulltextIndexer = None

def getCacheObject(book):
    """Returns the 'index.lucene' object from the cache of @a book."""
    bookCache = cache.get(book)
    return bookCache.objects['index.lucene']


class FulltextCacheObject(cache.DirCacheObject):
    """Cache object holding the fulltext index of given @a book.
       @a deps is list of cache objects that the index depends on
       (typically, list of all indexed files)."""

    def __init__(self, book, deps):
        # 'index.lucene' is the name getCacheObject() looks this object
        # up by:
        cache.DirCacheObject.__init__(self, book, 'index.lucene', deps)

    def doUpdate(self, filename, ctrl):
        """Runs the base class update and then builds the index in
           @a filename; returns the value returned by updateIndex()."""
        cache.DirCacheObject.doUpdate(self, filename, ctrl)
        indexed = updateIndex(self.book, filename, ctrl)
        return indexed

    def isActive(self):
        """Returns true if the book's ATTR_INDEXED attribute is '1'."""
        flag = self.book.getAttr(book.ATTR_INDEXED)
        return flag == '1'


def disableIndex(book):
    """Clears the fulltext index cache object of @a book."""
    indexObj = getCacheObject(book)
    indexObj.clear()

def updateIndex(bk, idir, ctrl):
    """Builds fulltext index of book @a bk in directory @a idir.

    Indexing starts at the URL the book's provider returns from
    getURLForIndexing() and continues with pages discovered while
    processing already fetched ones. @a ctrl is checked for cancellation
    (ctrl.cancel) before every page and receives progress messages
    (ctrl.message).

    Returns True if all pages were processed, False if the operation was
    cancelled. The indexer's stopIndexing() is called in both cases, as
    well as when an exception is raised.
    """
    # NOTE(review): the returned port is not used here; the call is kept
    # in case utils.getServerPort() has side effects -- confirm
    utils.getServerPort()

    startURL = bk.getProviderObj().getURLForIndexing(bk)

    indexer = getFulltextIndexer()
    indexer.startIndexing(idir)

    completed = True

    try:
        # only pages from the directory of startURL are scanned:
        urlPrefix = urlparse.urljoin(startURL, './')

        # all pages seen so far and the pages that still wait for
        # processing; fetchPages() adds to both:
        knownURLs = sets.Set([startURL])
        todo = sets.Set([startURL])

        while todo:
            if ctrl.cancel:
                completed = False
                break

            page = todo.pop()
            remaining = float(len(todo)) / float(len(knownURLs))
            percent = 100 * (1 - remaining)
            ctrl.message('[%02i%%] %s' % (percent, page))

            fetchPages(page, bk, indexer, todo, knownURLs, urlPrefix)
    finally:
        indexer.stopIndexing()

    return completed


# Types of text, sorted by decreasing importance (the lower the value, the
# more important the text is). The values are also used as indexes into
# HtmlProcessor.text:
(TXT_TITLE,
 TXT_HEADING,
 TXT_SUBHEADING,
 TXT_EMPHASIZED,
 TXT_NORMAL) = range(5)

class HtmlProcessor(HTMLParser):
    """
    Processes HTML code and does three things with it:
      * extracts URLs (values of href attributes of <a> tags) into
        self.urls
      * extracts content type if present (<meta http-equiv="Content-Type">)
        into self.content_type; it is 'text/html' otherwise
      * categorizes text by its importance (based on what tags surround it)
        into self.text, a list of strings indexed by TXT_* constants
    """

    # tags that affect importance of the text inside them:
    _RATED_TAGS = ('title', 'h1', 'h2', 'h3', 'h4', 'u', 'b', 'i')

    def __init__(self):
        HTMLParser.__init__(self)
        self.content_type = 'text/html'
        # stack of (tag, importance) pairs for currently open rated tags;
        # the bottom entry stands for text outside of any of them and is
        # never removed:
        self.curType = [('', TXT_NORMAL)]
        # text found so far, one string for every TXT_* value:
        self.text = ['', '', '', '', '']
        self.urls = []


    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            at = dict(attrs)
            # The parser reports None as the value of attributes written
            # without one (e.g. <meta http-equiv>), so the values can't be
            # blindly treated as strings:
            equiv = at.get('http-equiv') or ''
            content = at.get('content')
            if equiv.lower() == 'content-type' and content:
                self.content_type = content
            return

        if tag == 'a':
            for attr, value in attrs:
                # skip <a href> without value, there's nothing to follow
                if attr == 'href' and value is not None:
                    self.urls.append(value)
            return

        if tag in ('u', 'b', 'i'):
            new_level = TXT_EMPHASIZED
        elif tag in ('h3', 'h4'):
            new_level = TXT_SUBHEADING
        elif tag in ('h1', 'h2'):
            new_level = TXT_HEADING
        elif tag == 'title':
            new_level = TXT_TITLE
        else:
            return

        # <h1><b>foo</b></h1> should be heading, not emphasized text, so
        # we have to keep track of the highest current rating:
        cur_level = self.curType[-1][1]
        self.curType.append((tag, min(new_level, cur_level)))


    def handle_endtag(self, tag):
        if tag not in self._RATED_TAGS:
            return

        if self.curType[-1][0] == tag:
            self.curType.pop()
        else:
            # Incorrect HTML code, the tags don't match. We have no
            # idea what's going on, so let's bail out and reset the
            # memory to initial state:
            self.curType = [('', TXT_NORMAL)]


    def handle_data(self, data):
        # append the text to the string for the current importance level
        level = self.curType[-1][1]
        self.text[level] = ' '.join((self.text[level], data))


def indexHTMLFile(processed, contenttype, indexer, url):
    """Adds a parsed page to the fulltext index.

    @a processed is the HtmlProcessor that was fed the page, @a contenttype
    is the content type the page was served with, @a indexer is the indexer
    to add the document to (its indexDocument() is called) and @a url is
    passed to the indexer together with the page's texts.

    Charset given in @a contenttype takes precedence over the one declared
    by the page itself; @a contenttype is stored into
    processed.content_type in that case. If neither of them specifies a
    charset or if the charset is not known to Python, iso-8859-1 is used.
    If the page has no title, its headings are used as the title (this
    modifies processed.text).
    """
    def findCharset(ctype):
        # Returns value of the charset parameter of a content type or None
        # if there's none. Unlike plain prefix comparison, this copes with
        # "text/html;charset=x" (no space), "Charset=X", quoted values and
        # parameters following the charset.
        match = re.search(r'''charset\s*=\s*["']?([^\s;"']+)''',
                          ctype or '', re.IGNORECASE)
        if match is None:
            return None
        return match.group(1)

    if findCharset(contenttype) is not None:
        processed.content_type = contenttype
    charset = findCharset(processed.content_type)
    if charset is None:
        charset = 'iso-8859-1'

    # if the page doesn't have title, use heading instead:
    if not processed.text[TXT_TITLE]:
        processed.text[TXT_TITLE] = processed.text[TXT_HEADING]

    def recode(txt):
        # Converts txt to unicode using the page's charset.
        if isinstance(txt, unicode):
            # already in unicode
            return txt
        try:
            return unicode(txt, charset, errors='replace')
        except LookupError:
            # unknown charset name, fall back to the default one
            return unicode(txt, 'iso-8859-1', errors='replace')

    data = {
        'title'       : recode(processed.text[TXT_TITLE]),
        'headings'    : recode(processed.text[TXT_HEADING]),
        'subheadings' : recode(processed.text[TXT_SUBHEADING]),
        'emphasized'  : recode(processed.text[TXT_EMPHASIZED]),
        'contents'    : recode(processed.text[TXT_NORMAL])
    }

    indexer.indexDocument(url, data)


def fetchPages(page, b, indexer, todo, knownURLs, urlPrefix):
    """Indexes @a page of book @a b and queues the pages it links to.

    The page is obtained from the book's provider, parsed with
    HtmlProcessor and passed to indexHTMLFile() together with @a indexer.
    Pages that the provider doesn't serve or that are neither HTML nor
    plain text are skipped. Every link found in the page that is
    book-local, has path starting with @a urlPrefix and is not in
    @a knownURLs yet is added (without fragment) to both @a knownURLs and
    @a todo.

    Nothing is returned; discovered pages are reported only by modifying
    @a todo and @a knownURLs.
    """
    #print page, '(todo:%i, found:%i)' % (len(todo), len(knownURLs))

    provider = b.getProviderObj()

    result = provider.serve(b, urllib.unquote(page))
    if result is None:
        # invalid page, not served by the provider
        return

    dlfile, contenttype = result

    if not (contenttype.startswith('text/html') or contenttype == 'text/plain'):
        # we don't know how to index this file
        return

    proc = HtmlProcessor()
    try:
        proc.feed(dlfile)
        proc.close()
    except HTMLParseError:
        pass # what else can we do? (whatever was parsed so far is indexed)

    try:
        # index the file:
        indexHTMLFile(proc, contenttype, indexer, page)
    except (KeyboardInterrupt, SystemExit):
        # never swallow requests to terminate
        raise
    except Exception:
        # FIXME: what?!
        # One page that can't be indexed shouldn't stop indexing of the
        # rest, so the error is only reported. (print_exc() does the
        # printing itself and returns None, there's nothing to print here.)
        import traceback
        traceback.print_exc()

    for link in proc.urls:
        try:
            # normalize the URL to be absolute:
            absURL = urlparse.urljoin(page, link)
            scheme, host, path, query, fragment = urlparse.urlsplit(absURL)

            # We're only interested in files from Documancer's server.
            # They'll have empty protocol and host because we pass them as
            # book-local URLs here (e.g. "/usr/share/doc/boost/index.html"):
            if scheme == '' and host == '' and path.startswith(urlPrefix):
                # fragments point into the same page, drop them:
                absURL = urlparse.urlunsplit((scheme, host, path, query, ''))
                if absURL not in knownURLs:
                    knownURLs.add(absURL)
                    todo.add(absURL)

        except (UnicodeDecodeError, ValueError):
            # links with non-ascii chars can cause UnicodeDecodeError and
            # malformed ones may be rejected by urlsplit() with ValueError;
            # ignore these invalid links, we can't handle them gracefully
            pass # FIXME: report warning to the user?


def search(b, query):
    """Searches fulltext index of book @a b for @a query.

    Returns list of (title, url, score) tuples, one for every match
    reported by the indexer: url is the full URL of the page with the
    query appended as '?query=...' and score is the match's score
    multiplied by 100, formatted as string with one decimal place.

    If the book has no usable index, error is reported through
    utils.uiCallback and empty list is returned.
    """
    obj = getCacheObject(b)

    if not obj.isActive() or not obj.exists() or \
       not os.path.isfile(os.path.join(obj.getFilename(), 'segments')):
        utils.uiCallback.error(
                "Book '%s' doesn't have fulltext index!" % b.title)
        return []

    idir = obj.getFilename()

    querystr = '?query=%s' % urllib.quote(query, "")

    # Look for the query in all fields, giving more weight to matches in
    # more important ones. The clauses must be separated by whitespace:
    # when they're simply concatenated, every boost factor is immediately
    # followed by the next clause (e.g. "^2" + "3d" reads as "^23d").
    clauses = [
        "title:%s^10" % query,
        "headings:%s^5" % query,
        "subheadings:%s^4" % query,
        "emphasized:%s^2" % query,
        query
    ]
    myquery = ' '.join(clauses)
    #FIXME: escaping of query string!
    #FIXME: allow or and and expressions

    # Retrieve matches from the index:
    result = []
    obj.lock()
    try:
        for r in getFulltextIndexer().search(idir, myquery):
            score = '%.1f' % (r.score * 100)
            url = b.makeFullURL(r.url) + querystr
            result.append((r.title, url, score))
    finally:
        obj.unlock()

    return result