# ports//net/straw/work/straw-0.27/src/lib/SummaryParser.py

""" SummaryParser.py

Wrapper module around feedparser, responsible for assigning the parsed data
to Feed and SummaryItem objects.
"""
__copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
__license__ = """
Straw is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

Straw is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA. """


import sys
import types
import copy
import htmlentitydefs
import HTMLParser
import string
import time

import feedparser
import error

import utils
import SummaryItem
import ParsedSummary

class TitleImgParser(HTMLParser.HTMLParser):
    """Collect the plain text and the image URLs of an HTML fragment.

    Text chunks are accumulated as the fragment is fed to the parser and can
    be read back with get_text(); the ``src`` of every ``<img>`` tag is
    collected and can be read back with get_image_urls().  Both collections
    are emptied by flush() and by close(), so read them before closing.
    """

    # Numeric references for  " & ' < >  are kept escaped so that the
    # collected text stays safe to embed in markup.
    _RESERVED_CHARREFS = ('34', '38', '39', '60', '62',
                          'x22', 'x26', 'x27', 'x3c', 'x3e')
    # Named equivalents of the references above.
    _RESERVED_ENTITIES = ('lt', 'gt', 'quot', 'amp', 'apos')

    def __init__(self, feed=None):
        """Create a parser; *feed* (optional) supplies the ``location``
        against which image URLs are completed."""
        HTMLParser.HTMLParser.__init__(self)
        self._chars = list()
        self._image_urls = list()
        self._feed = feed

    def set_feed(self, feed):
        """Set the feed whose ``location`` is used to complete image URLs."""
        self._feed = feed

    def get_image_urls(self):
        """Return the live list of image URLs collected so far.

        The list is emptied in place by flush()/close(); copy it if it must
        outlive the parser.
        """
        return self._image_urls

    def get_text(self, nchars=None):
        """Return the collected text, stripped, cut to *nchars* if given."""
        chunks = self._chars
        # Resolved references are stored as UTF-8 byte strings while the fed
        # data may be unicode.  Joining such a mix would implicitly decode
        # the byte strings as ASCII and fail on the first non-ASCII
        # character, so decode them explicitly first.
        if [c for c in chunks if isinstance(c, unicode)]:
            decoded = []
            for chunk in chunks:
                if not isinstance(chunk, unicode):
                    chunk = chunk.decode('utf-8', 'replace')
                decoded.append(chunk)
            chunks = decoded
        text = ''.join(chunks).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        """Finish parsing and discard everything collected.

        The base class processes any buffered input first (which may still
        invoke the handlers, or raise on truncated markup); the collections
        are emptied afterwards in every case so no stale chunk survives.
        """
        try:
            HTMLParser.HTMLParser.close(self)
        finally:
            self.flush()

    def flush(self):
        """Empty the collected text and image URLs in place."""
        del self._chars[:]
        del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` of ``<img>`` tags; other tags are ignored."""
        if tag != 'img':
            return
        for name, value in attrs:
            # value is None for a bare attribute and '' for an empty one;
            # neither names an image.
            if name == 'src' and value:
                if self._feed is not None:
                    value = utils.complete_url(value, self._feed.location)
                # Without a feed there is nothing to complete the URL
                # against, so it is recorded as written.
                self._image_urls.append(value)

    def handle_data(self, data):
        """Collect a chunk of character data."""
        self._chars.append(data)

    def handle_charref(self, ref):
        """Collect a numeric character reference.

        Called for each character reference, e.g. for '&#160;', ref will be
        '160'.  References seen before any text has been collected are
        ignored.  Reserved markup characters stay escaped; everything else
        is stored as the UTF-8 encoding of the character.
        """
        if not self._chars:
            return
        ref = ref.lower()
        literal = '&#%s;' % ref
        if ref in self._RESERVED_CHARREFS:
            text = literal
        else:
            try:
                if ref[:1] == 'x':
                    c = int(ref[1:], 16)
                else:
                    c = int(ref)
                text = unichr(c).encode('utf-8')
            except (ValueError, OverflowError):
                # Malformed or out-of-range code point: keep the reference
                # verbatim rather than abort the whole parse.
                text = literal
        self._chars.append(text)

    def handle_entityref(self, ref):
        """Collect a named entity reference.

        Called for each entity reference, e.g. for '&copy;', ref will be
        'copy'.  References seen before any text has been collected are
        ignored.  Reserved markup entities and unknown names are kept
        verbatim; known names are stored as the UTF-8 encoding of the
        character.
        """
        if not self._chars:
            return
        literal = '&%s;' % ref
        if ref in self._RESERVED_ENTITIES:
            text = literal
        else:
            try:
                codepoint = self._name2cp(ref)
            except KeyError:
                text = literal
            else:
                text = unichr(codepoint).encode('utf-8')
        self._chars.append(text)

    def _name2cp(self, name):
        """Return the code point of entity *name*; raise KeyError if unknown.

        Entity resolution graciously donated by Aaron Swartz.
        """
        if hasattr(htmlentitydefs, 'name2codepoint'):  # requires Python 2.3
            return htmlentitydefs.name2codepoint[name]
        value = htmlentitydefs.entitydefs[name]
        if value.startswith('&#') and value.endswith(';'):
            return int(value[2:-1])  # not in latin-1
        return ord(value)



def _remove_ids_if_duplicates(items):
    ids = {}
    duplicates = False
    for i in items:
        if i.guid is not None and i.guid != "":
            if ids.has_key(i.guid):
                duplicates = True
                break
            ids[i.guid] = True
    if duplicates:
        for i in items:
            i.guid = None
            i.guidislink = False
    return

def _to_unicode(text, encoding):
    if text and not isinstance(text, types.UnicodeType):
        text = unicode(text, encoding)
    return text

def feedparser_parse(data):
    """Parse *data* with feedparser and pick the encoding to decode it with.

    Returns a ``(parsed_content, encoding)`` tuple.  The encoding is the one
    reported by feedparser; when that is missing or empty the locale's
    encoding is tried, and the interpreter's default encoding is the last
    resort.
    """
    pc = feedparser.parse(data)
    # Fall through on *empty* values too: an 'encoding' key holding '' must
    # not bypass the locale encoding in favour of the interpreter default.
    enc = (pc.get('encoding')
           or utils.get_locale_encoding()
           or sys.getdefaultencoding())
    return (pc, enc)

def parse_channel_info(parsed, parsed_content, encoding):
    """Copy the channel-level fields of *parsed_content* onto *parsed*.

    Title, description, link, copyright and creator are read from
    ``parsed_content.feed`` (defaulting to '') and passed through
    _to_unicode with *encoding*; ``last_build_date`` receives the feed's
    'modified' value as is.  The same *parsed* object is returned.
    """
    info = parsed_content.feed

    def text_field(key):
        return _to_unicode(info.get(key, ''), encoding)

    parsed.title = text_field('title')
    parsed.description = text_field('description')
    parsed.link = text_field('link')
    parsed.copyright = text_field('copyright')
    parsed.last_build_date = info.get('modified')
    parsed.creator = text_field('creator')
    return parsed

def parse(content, feed):
    """Parse raw feed *content* into a ParsedSummary for *feed*.

    The channel information is filled in first, then one item is added per
    feedparser entry; finally the item guids are dropped if they turn out
    not to be unique.
    """
    blank = ParsedSummary.ParsedSummary()
    parsed_content, encoding = feedparser_parse(content)
    summary = parse_channel_info(blank, parsed_content, encoding)
    for entry in parsed_content.entries:
        summary.addItem(_parse_entry(entry, feed))
    _remove_ids_if_duplicates(summary.items)
    return summary

def sanitize_content(data, feed, limit=60):
    images = None
    title = ""
    try:
        tp = TitleImgParser(feed)
        try:
            tp.feed(data)
            images = [image for image in tp.get_image_urls()]
            title = tp.get_text(limit)
        except Exception, ex:
            error.log(ex)
    finally:
        tp.close()
    return (title, images)

def _parse_entry(entry, feed):
    item = SummaryItem.SummaryItem()
    item.feed = feed
    content = []
    description = ""
    title = _("No title")

    if entry.has_key('content'):
        # it can have multiple content, so we just aggregate them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)
                pass

    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary',''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        alttitle, images = sanitize_content(description, feed)
        [item.add_image(image) for image in images]
        if not title:
            # get the first MAXSPLIT words of the description and make that as our
            # title
            dwords = string.splitfields(alttitle, maxsplit=6)
            title = ' '.join(dwords[:]) + ' ...'
    title = title.replace('\n','')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    item.pub_date = entry.get('modified_parsed', time.localtime())
    item.license_urls.append(entry.get('license', None))
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section',None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in the
            # first place. This is a violation of the RSS 0.92 spec
            # http://backend.userland.com/rss092.
            item.source = None

    return item
# syntax highlighted by Code2HTML, v. 0.9.1