#

=head1 NAME

Text::EtText::HTML2EtText - convert from HTML to the EtText editable-text format

=head1 SYNOPSIS

  my $t = Text::EtText::HTML2EtText->new;
  print $t->html2text ($html);

or

  my $t = Text::EtText::HTML2EtText->new;
  print $t->html2text ();     # from STDIN

=head1 DESCRIPTION

ethtml2text will convert an HTML file into the EtText editable-text format,
for use with webmake or ettext2html.

For more information on the EtText format, check the WebMake documentation
on the web at http://webmake.taint.org/ .

=head1 METHODS

=over 4

=cut

package Text::EtText::HTML2EtText;

use Carp;
use strict;
use locale;

use HTML::Entities;

use vars qw{ @ISA };
@ISA = qw();

###########################################################################

=item $f = Text::EtText::HTML2EtText->new

Constructs a new C<Text::EtText::HTML2EtText> object and returns it.  The
older C<new Text::EtText::HTML2EtText> spelling keeps working.  When called
on an existing object, a fresh object of that object's class is built; the
settings of the existing object are not copied.

The object is a hash holding these default settings:

  text_line_width        72    line width
  text_wrap_lines        1     wrap to fit in line width
  text_strip_para_fonts  1     strip font tags surrounding paras
  text_link_indent       ' '   default indent for links
  html_link_open         '[['  characters used to wrap links
  html_link_close        ']]'  characters used to wrap links

=cut

sub new {
  my $class = shift;

  # Allow both Class->new and $object->new.
  $class = ref($class) || $class;

  my $self = {
    # in parameters:
    'text_line_width'       => 72,    # line width
    'text_wrap_lines'       => 1,     # wrap to fit in line width
    'text_strip_para_fonts' => 1,     # strip font tags surrounding paras
    'text_link_indent'      => ' ',   # default indent for links
    'html_link_open'        => '[[',  # characters used to wrap links
    'html_link_close'       => ']]',  # characters used to wrap links
  };

  return bless ($self, $class);
}

# NOTE(review): the next line is the start of html2text(), kept verbatim.  In
# this copy it appears to have lost its line breaks and the HTML tag literals
# inside its patterns, and the definition continues beyond the visible source,
# so it is left to be restored from a pristine copy rather than guessed at.
########################################################################### =item $text = $f->html2text( [$html] ) Convert HTML, either from the argument or from STDIN, into EtText. =cut sub html2text { my ($self, @txt) = @_; local ($_); my $txt = ''; if ($#txt >= 0) { $txt = join ('', @txt); } else { while () { $txt .= $_; } } my $line1 = ('-' x $self->{text_line_width}); $txt =~ s{(.*?)}{ "".&protect_html ($1).""; }gies; $txt =~ s{(.*?)<\/listing>}{ "".&protect_html ($1).""; }gies; $txt =~ s{(.*?)<\/xmp>}{ "<xmp>".&protect_html ($1).""; }gies; $txt =~ s{
(.*?)<\/pre>}{
    "
".&protect_html ($1)."
"; }gies; $txt =~ s/\s+/ /gs; $txt =~ s/^ //gs; $txt =~ s/ $//gs; $txt =~ s/ *

*/\n\n/gis; $txt =~ s/\s*<\/p>\s*/\n\n/gis; $txt =~ s/]+)>\s*/\n/gis; $txt =~ s/(]+)>\s*)/\n\n$1/gis; $txt =~ s/<(td|tr)(?:| [^>]+)>\s*/\n<$1>\n\n/gis; $txt =~ s/<\/(td|tr)>\s*/\n\n<\/$1>\n/gis; $txt =~ s/<\/(table)>\s*/\n<\/$1>\n\n/gis; $txt =~ s/]+)>\s*/\n${line1}\n/gis; $txt =~ s/

\s*(.*?)\s*<\/h1>\s*/"\n\n$1\n".('=' x length($1))."\n\n";/geis; $txt =~ s/

\s*(.*?)\s*<\/h2>\s*/"\n\n$1\n".('-' x length($1))."\n\n";/geis; $txt =~ s/

\s*(.*?)\s*<\/h3>\s*/"\n\n$1\n".('~' x length($1))."\n\n";/geis; $txt =~ s/\n[ \t]+/\n/gs; $txt =~ s/^\s+//gs; $txt =~ s/\s+$//gs; $txt =~ s{\s*