#!/usr/bin/env python """ ExclusionList.py $Id: ExclusionList.py,v 1.7 2002/01/17 02:16:52 janssen Exp $ A class to handle information about what URLs to exclude from the plucking process. Copyright 2000 by Holger Duerer Distributable under the GNU General Public License Version 2 or newer. """ import re, string, sys from PyPlucker.UtilFns import error, message class ExclusionList: """A class to maintain information about what URLs to exclude from the plucking process. This implements an ordered, prioritizes list of regexps that will state that an URLs should either be included or excluded. Items are loaded into the list from a file. Each line in a file is processed separately: - all leading and trailing white space is removed - empty lines are ignored - lines starting with a '#' are ignored (comment) - everthing else should be of the format: prio:action:regexp where: - prio is an integer - action is either a plus or a minus ('+' or '-') - regexp is a valid regular expression """ def __init__ (self, include_by_default=1): self._items = {} self._default_action = include_by_default def load_file (self, filename): """Load new items from the file 'filename'.""" f = open (filename, "r") for orig_line in f.readlines(): if orig_line[-1] == '\n': orig_line = orig_line[:-1] line = string.strip (orig_line) if not line: # empty line continue if line[0] == '#': # a comment continue m = re.match (r"([-+]?\d+):([-+]):(.*)", line) if not m: error ("ExclusionList: Cannot parse line: %s\n" % orig_line) else: prio = int (m.group (1)) action = m.group (2) == '+' regexp = m.group (3) new_item = (action, regexp) if self._items.has_key (prio): self._items[prio].append (new_item) else: self._items[prio] = [new_item] def check (self, url): """Check if 'url' is to be included (result=1) or excluded (result=0).""" prios = self._items.keys () prios.sort () prios.reverse () for prio in prios: for action, regexp in self._items[prio]: if re.match (regexp, url): return action return self._default_action