""" Summaryparser.py Wrapper module to feedparser and responsible for assigning data to Feed and SummaryItems. """ __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc." __license__ = """ Straw is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. Straw is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """ import sys import types import copy import htmlentitydefs import HTMLParser import string import time import feedparser import error import utils import SummaryItem import ParsedSummary class TitleImgParser(HTMLParser.HTMLParser): def __init__(self, feed=None): HTMLParser.HTMLParser.__init__(self) self._chars = list() self._image_urls = list() self._feed = feed def set_feed(self, feed): self._feed = feed def get_image_urls(self): return self._image_urls def get_text(self, nchars=None): text = ''.join(self._chars).strip() if nchars: text = text[:nchars] return text def close(self): self.flush() HTMLParser.HTMLParser.close(self) def flush(self): del self._chars[:] del self._image_urls[:] def handle_starttag(self, tag, attrs): if tag == 'img': for name, value in attrs: if name == 'src': url = utils.complete_url(value, self._feed.location) self._image_urls.append(url) return def handle_data(self, data): self._chars.append(data) def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' if not self._chars: return ref = ref.lower() if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): text = '&#%s;' % ref else: if ref[0] == 'x': c = int(ref[1:], 16) else: c = int(ref) text = unichr(c).encode('utf-8') self._chars.append(text) def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self._chars: return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref else: # entity resolution graciously donated by Aaron Swartz def name2cp(k): import htmlentitydefs if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 return htmlentitydefs.name2codepoint[k] k = htmlentitydefs.entitydefs[k] if k.startswith('&#') and k.endswith(';'): return int(k[2:-1]) # not in latin-1 return ord(k) try: name2cp(ref) except KeyError: text = '&%s;' % ref else: text = unichr(name2cp(ref)).encode('utf-8') self._chars.append(text) def _remove_ids_if_duplicates(items): ids = {} duplicates = False for i in items: if i.guid is not None and i.guid != "": if ids.has_key(i.guid): duplicates = True break ids[i.guid] = True if duplicates: for i in items: i.guid = None i.guidislink = False return def _to_unicode(text, encoding): if text and not isinstance(text, types.UnicodeType): text = unicode(text, encoding) return text def feedparser_parse(data): pc = feedparser.parse(data) enc = pc.get('encoding', utils.get_locale_encoding()) if not enc: enc = sys.getdefaultencoding() return (pc, enc) def parse_channel_info(parsed, parsed_content, encoding): parsed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding) parsed.description = _to_unicode(parsed_content.feed.get('description',''), encoding) parsed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding) parsed.copyright = _to_unicode(parsed_content.feed.get('copyright',''), encoding) parsed.last_build_date = parsed_content.feed.get('modified') parsed.creator = _to_unicode(parsed_content.feed.get('creator', ''), encoding) return parsed def parse(content, feed): parsed = ParsedSummary.ParsedSummary() parsed_content, encoding = feedparser_parse(content) parsed = parse_channel_info(parsed, parsed_content, encoding) for entry in parsed_content.entries: item = _parse_entry(entry, feed) parsed.addItem(item) _remove_ids_if_duplicates(parsed.items) return parsed def sanitize_content(data, feed, limit=60): images = None title = "" try: tp = TitleImgParser(feed) try: tp.feed(data) images = [image for image in tp.get_image_urls()] title = tp.get_text(limit) except Exception, ex: error.log(ex) finally: tp.close() return (title, images) def _parse_entry(entry, feed): item = SummaryItem.SummaryItem() item.feed = feed content = [] description = "" title = _("No title") if entry.has_key('content'): # it can have multiple content, so we just aggregate them for now. for c in entry.content: try: if c.value not in content: content.append(c.value) except TypeError, te: error.log(te) pass if not len(content) and entry.has_key('summary'): content.append(entry.get('summary','')) description = "
".join(content) title = entry.get('title', '') if description: alttitle, images = sanitize_content(description, feed) [item.add_image(image) for image in images] if not title: # get the first MAXSPLIT words of the description and make that as our # title dwords = string.splitfields(alttitle, maxsplit=6) title = ' '.join(dwords[:]) + ' ...' title = title.replace('\n','') item.title = title item.description = description item.guidislink = entry.get('guidislink', False) item.link = entry.get('link', None) item.guid = entry.get('guid', None) item.creator = entry.get('author', None) item.contributors = entry.get('contributors', None) item.pub_date = entry.get('modified_parsed', time.localtime()) item.license_urls.append(entry.get('license', None)) item.fm_license = entry.get('fm_license', None) item.fm_changes = entry.get('fm_changes', None) item.publication_name = entry.get('prism_publicationname', None) item.publication_volume = entry.get('prism_volume', None) item.publication_number = entry.get('prism_number', None) item.publication_section = entry.get('prism_section',None) item.publication_starting_page = entry.get('prism_startingpage', None) item.enclosures = entry.get('enclosures', None) if entry.has_key('source'): url = entry.source.get('url', None) text = entry.source.get('value', None) if url and text: item.source = {'url': url, 'text': text} else: # There's no point displaying the source if there's no url in the # first place. This is a violation of the RSS 0.92 spec # http://backend.userland.com/rss092. item.source = None return item