""" Summaryparser.py
Wrapper module around feedparser; it is responsible for assigning the parsed
data to Feed and SummaryItems.
"""
__copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
__license__ = """
Straw is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
Straw is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA. """
import sys
import types
import copy
import htmlentitydefs
import HTMLParser
import string
import time
import feedparser
import error
import utils
import SummaryItem
import ParsedSummary
class TitleImgParser(HTMLParser.HTMLParser):
def __init__(self, feed=None):
HTMLParser.HTMLParser.__init__(self)
self._chars = list()
self._image_urls = list()
self._feed = feed
def set_feed(self, feed):
self._feed = feed
def get_image_urls(self):
return self._image_urls
def get_text(self, nchars=None):
text = ''.join(self._chars).strip()
if nchars:
text = text[:nchars]
return text
def close(self):
self.flush()
HTMLParser.HTMLParser.close(self)
def flush(self):
del self._chars[:]
del self._image_urls[:]
def handle_starttag(self, tag, attrs):
if tag == 'img':
for name, value in attrs:
if name == 'src':
url = utils.complete_url(value, self._feed.location)
self._image_urls.append(url)
return
def handle_data(self, data):
self._chars.append(data)
def handle_charref(self, ref):
# called for each character reference, e.g. for ' ', ref will be '160'
if not self._chars: return
ref = ref.lower()
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
text = '&#%s;' % ref
else:
if ref[0] == 'x':
c = int(ref[1:], 16)
else:
c = int(ref)
text = unichr(c).encode('utf-8')
self._chars.append(text)
def handle_entityref(self, ref):
# called for each entity reference, e.g. for '©', ref will be 'copy'
if not self._chars: return
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
text = '&%s;' % ref
else:
# entity resolution graciously donated by Aaron Swartz
def name2cp(k):
import htmlentitydefs
if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
k = htmlentitydefs.entitydefs[k]
if k.startswith('&#') and k.endswith(';'):
return int(k[2:-1]) # not in latin-1
return ord(k)
try: name2cp(ref)
except KeyError: text = '&%s;' % ref
else: text = unichr(name2cp(ref)).encode('utf-8')
self._chars.append(text)
def _remove_ids_if_duplicates(items):
ids = {}
duplicates = False
for i in items:
if i.guid is not None and i.guid != "":
if ids.has_key(i.guid):
duplicates = True
break
ids[i.guid] = True
if duplicates:
for i in items:
i.guid = None
i.guidislink = False
return
def _to_unicode(text, encoding):
if text and not isinstance(text, types.UnicodeType):
text = unicode(text, encoding)
return text
def feedparser_parse(data):
    """Run feedparser over `data` and pick the encoding to decode it with.

    Returns a (result, encoding) tuple.  The encoding is the one reported by
    feedparser, else the locale encoding from utils, and finally Python's
    default encoding when neither yields a usable value.
    """
    result = feedparser.parse(data)
    locale_encoding = utils.get_locale_encoding()
    encoding = result.get('encoding', locale_encoding) or sys.getdefaultencoding()
    return (result, encoding)
def parse_channel_info(parsed, parsed_content, encoding):
    """Copy the channel-level fields of a feedparser result onto `parsed`.

    title, description, link, copyright and creator default to '' and are
    passed through _to_unicode() with `encoding`; last_build_date is taken
    from the 'modified' field as is.  Returns `parsed`.
    """
    channel = parsed_content.feed
    for field in ('title', 'description', 'link', 'copyright'):
        setattr(parsed, field, _to_unicode(channel.get(field, ''), encoding))
    parsed.last_build_date = channel.get('modified')
    parsed.creator = _to_unicode(channel.get('creator', ''), encoding)
    return parsed
def parse(content, feed):
    """Parse raw feed `content` into a ParsedSummary for `feed`.

    The channel information is filled in first, then one item is added per
    feedparser entry.  Finally the item guids are dropped if any of them is
    duplicated.
    """
    summary = ParsedSummary.ParsedSummary()
    parsed_content, encoding = feedparser_parse(content)
    summary = parse_channel_info(summary, parsed_content, encoding)
    for raw_entry in parsed_content.entries:
        summary.addItem(_parse_entry(raw_entry, feed))
    _remove_ids_if_duplicates(summary.items)
    return summary
def sanitize_content(data, feed, limit=60):
    """Extract a plain-text title and the image URLs from HTML `data`.

    Returns a (title, images) tuple: `title` holds at most `limit`
    characters of the text found in `data`, and `images` is a list of the
    image URLs found in it, completed against `feed`.  Extraction is
    best-effort: parsing errors are passed to error.log() and whatever was
    extracted before the failure is returned ("" and an empty list at
    worst), so `images` can always be iterated.
    """
    images = []
    title = ""
    # Built outside the try block so that a failing constructor is reported
    # as itself rather than as a NameError from the cleanup below.
    tp = TitleImgParser(feed)
    try:
        try:
            tp.feed(data)
            # Copy the list: close() empties the parser's own list in place.
            images = list(tp.get_image_urls())
            title = tp.get_text(limit)
        except Exception:
            # sys.exc_info() keeps this handler valid on every Python version.
            error.log(sys.exc_info()[1])
    finally:
        try:
            # Closing makes the base parser process its buffered input,
            # which can itself fail on truncated markup; that must not
            # throw away a result that was already extracted.
            tp.close()
        except Exception:
            error.log(sys.exc_info()[1])
    return (title, images)
def _collect_description(entry):
    """Return the entry body: its distinct content parts joined by <br/>,
    or its summary when it has no content."""
    parts = []
    # NOTE(review): has_key() is kept on purpose; `entry` is a feedparser
    # object whose has_key() may differ from the `in` operator -- confirm
    # before modernizing.
    if entry.has_key('content'):
        # it can have multiple content, so we just aggregate them for now.
        for part in entry.content:
            try:
                if part.value not in parts:
                    parts.append(part.value)
            except TypeError:
                error.log(sys.exc_info()[1])
    if not parts and entry.has_key('summary'):
        parts.append(entry.get('summary', ''))
    return "<br/>".join(parts)


def _title_from_text(text, max_words=6):
    """Return the first `max_words` words of `text` followed by ' ...',
    or '' when `text` holds no words."""
    words = text.split()[:max_words]
    if not words:
        return ''
    return ' '.join(words) + ' ...'


def _parse_entry(entry, feed):
    """Build a SummaryItem for `feed` from one feedparser `entry`.

    The description is the entry's content (or summary).  The title is the
    entry's title, else the first words of the description text, else the
    translated "No title".  Images found in the description are attached to
    the item, and the remaining fields are copied from the entry with None
    as the default for missing values.
    """
    item = SummaryItem.SummaryItem()
    item.feed = feed
    description = _collect_description(entry)
    title = entry.get('title', '')
    if description:
        alttitle, images = sanitize_content(description, feed)
        # `images` may be None when the description could not be parsed.
        for image in images or []:
            item.add_image(image)
        if not title:
            title = _title_from_text(alttitle)
    if not title:
        title = _("No title")
    title = title.replace('\n', '')
    item.title = title
    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    item.pub_date = entry.get('modified_parsed', time.localtime())
    # NOTE(review): this appends None when the entry has no license; left
    # unchanged because code outside this function may rely on the list
    # having an element -- confirm.
    item.license_urls.append(entry.get('license', None))
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)
    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in the
            # first place. This is a violation of the RSS 0.92 spec
            # http://backend.userland.com/rss092.
            item.source = None
    return item
# syntax highlighted by Code2HTML, v. 0.9.1