# Copyright (c) 2001 Chris Withers # # This Software is released under the MIT License: # http://www.opensource.org/licenses/mit-license.html # See license.txt for more details. # # $Id: html2text.py,v 1.1.1.1 2005/01/28 06:51:02 ats_shib Exp $ import sgmllib from string import lower, replace, split, join class HTML2Text(sgmllib.SGMLParser): from htmlentitydefs import entitydefs # replace entitydefs from sgmllib def __init__(self, ignore_tags=(), indent_width=4, page_width=80): sgmllib.SGMLParser.__init__(self) self.result = "" self.indent = 0 self.ol_number = 0 self.page_width=page_width self.inde_width=indent_width self.lines=[] self.line=[] self.ignore_tags = ignore_tags def add_text(self,text): # convert text into words words = split(replace(text,'\n',' ')) self.line.extend(words) def add_break(self): self.lines.append((self.indent,self.line)) self.line=[] def generate(self): # join lines with indents indent_width = self.inde_width page_width = self.page_width out_paras=[] for indent,line in self.lines+[(self.indent,self.line)]: i=indent*indent_width indent_string = i*' ' line_width = page_width-i out_para='' out_line=[] len_out_line=0 for word in line: len_word = len(word) if len_out_line+len_word