#
=head1 NAME
Text::EtText::EtText2HTML - convert from the simple EtText editable-text format
into HTML
=head1 SYNOPSIS
my $t = new Text::EtText::EtText2HTML;
print $t->text2html ($text);
or
my $t = new Text::EtText::EtText2HTML;
print $t->text2html (); # from STDIN
=head1 DESCRIPTION
ettext2html will convert a text file in the EtText editable-text format into
HTML.
For more information on the EtText format, check the WebMake documentation on
the web at http://webmake.taint.org/ .
=head1 METHODS
=over 4
=cut
package Text::EtText::EtText2HTML;
use Carp;
use strict;
use locale;
use HTML::Entities;
use Text::EtText;
use Text::EtText::LinkGlossary;
use Text::EtText::DefaultGlossary;
use Text::EtText::Lists;
use vars qw{
@ISA $ATTRS_WITH_URLS $BALANCED_TAG_GEN_TAGS
$URL_PROTECTOR $prot
};
@ISA = qw();
# attributes that can take URL arguments: cf. HTML::LinkExtor.
$ATTRS_WITH_URLS =
qr{href|src|lowsrc|usemap|action|background|codebase|code}i;
$BALANCED_TAG_GEN_TAGS =
qr{(?:b|i|em|q|strong|h\d|code|abbr|acronym|address|big|cite|del|ins|s|small|strike|sub|sup|u|samp|kbd|var|span)}i;
$URL_PROTECTOR = '[[URL]]';
###########################################################################
=item $f = new Text::EtText::EtText2HTML
Constructs a new C(.*?)<\/pre>}{
$_ = $1; $self->protect_html(\$_);
"$_
";
}geis;
}
sub do_text_markup {
my ($self, $html) = @_;
local ($_);
# first, find all-underscore lines -- avoid ....
# avoids an issue with Suelette Davis & Julian Assange's _Underground_
$$html =~ s/\n\s*_{10,}\s*\n/\n
\n\n/gs;
# convert **foo** to foo
$$html =~ s,\*\*(.*?)\*\*,$1,gs;
$$html =~ s,\_\_(.*?)\_\_,$1,gs;
$$html =~ s,\#\#(.*?)\#\#,$1,gs;
# Caolan's patch to do one-char markup: (off by default
# currently, set "EtTextOneCharMarkup" = 1 to turn it on)
if ($self->{options}->{"EtTextOneCharMarkup"}) {
$$html =~ s,(\s|[\>\<\_\']*)\*
([\>\<\_\']*[\>\<\_\']*.*?[\>\<\_\']*[\>\<\_\']*)
\*([\>\<\_\']*|\s),$1$2$3,gsx; #'
$$html =~ s,(\s|[\>\<\*\']*)\_
([\>\<\*\']*[\>\<\*\']*.*?[\>\<\*\']*[\>\<\*\']*)
\_([\>\<\*\']*|\s),$1$2$3,gsx; #'
$$html =~ s,(\s|[\>\<\*\']*)\'
([\>\<\*\_]*[\>\<\*\_]*.*?[\>\<\*\_]*[\>\<\*\_]*)
\'([\>\<\*\_]*|\s),$1$2$3,gsx; #'
}
# convert b{text} to text #{
1 while $$html =~ s{(${BALANCED_TAG_GEN_TAGS})\{([^\{\}]+)\}}
{<$1>$2$1>}gisx;
# convert span.foo{text} to text etc. #{
1 while $$html =~ s{(${BALANCED_TAG_GEN_TAGS})\.(\S+)\{([^\{\}]+)\}}
{<$1 class="$2">$3$1>}gisx;
}
sub do_ettext_link_definitions {
my ($self, $html) = @_;
local ($_);
$$html =~ s{^\s+\[([^\]]+)\]\:\s+(\S+)\s*$}{
$self->{links}->{$1} = $2; "\n";
}giem;
$$html =~ s{^\s+Auto:\s+\[([^\]]+)\]\:\s+(\S+)\s*$}{
$self->{auto_links}->{$1} = $2; "\n";
}giem;
}
sub do_old_ettext_links {
my ($self, $html) = @_;
local ($_);
$$html =~ s{\"([^\"]+?)\"\s*\[([^\]\s]+)\]}{ #"
$self->link_write (1, $2, $1);
}ges;
$$html =~ s{\b([^>\s]+)\s*\[([^\]\s]+)\]}{
$self->link_write (0, $2, $1);
}ges;
$$html =~ s{(<[^>]+>)\s*\[([^\]\s]+)\]}{
$self->link_write (0, $2, $1);
}ges;
# glossary links.
if (defined $self->{glossary}) {
$self->update_glossary();
$$html =~ s{((?!=).\s)\"([^\"]+?)\"}{ #"
$1.$self->link_write (1, $2, $2);
}geis;
}
}
sub do_ettext_links {
my ($self, $html) = @_;
local ($_);
# [[this is a link [label]]
$$html =~ s{\[\[(.+?)\s+\[(.+?)\]\]}{ #"
$self->link_write (1, $2, $1);
}ges;
# glossary links.
if (defined $self->{glossary}) {
$self->update_glossary();
$$html =~ s{((?!=).\s)\[\[(.+?)\]\]}{ #"
$1.$self->link_write (1, $2, $2);
}ges;
}
}
sub _handle_link_href {
my ($base, $url) = @_;
if ($url =~ /${URL_PROTECTOR}/) { return $url; }
$url =~ s/^\"(.*)\"$/$1/g;
$url =~ s/^\'(.*)\'$/$1/g;
# first protect existing hrefs and src tags. This only operates
# on hrefs with protocol: tags at the start.
if ($url =~ s/^([A-Za-z0-9_-]+):/$1${URL_PROTECTOR}:/) {
return $url;
}
if ($url =~ /^\$\(/) { return $url; }
if ($url =~ /^\.{0,2}\//) { return $URL_PROTECTOR.$url; }
if (defined $base && $base ne '') { return $URL_PROTECTOR.$base.$url; }
$URL_PROTECTOR.$url;
}
sub update_glossary {
my ($self) = @_;
# if we have a glossary, add our new links to it.
# trim out ones that are numeric only, or 1 char long.
if (defined $self->{glossary}) {
my ($k, $v);
while (($k, $v) = (each %{$self->{links}})) {
next unless is_valid_glossary_key ($k);
$self->{glossary}->put_link ($k, $v);
}
my @newkeys = ();
while (($k, $v) = (each %{$self->{auto_links}})) {
$self->{glossary}->put_auto_link ($k, $v);
push (@newkeys, $k);
}
if ($#newkeys >= 0) {
$self->{glossary}->add_auto_link_keys (@newkeys);
}
$self->{glossary}->close();
}
}
# ---------------------------------------------------------------------------
sub do_segmented_traverse {
my ($self, $html) = @_;
local ($_);
my $url = undef;
my $done = '';
$prot = $URL_PROTECTOR;
# De-relativise relative links.
my $base = $self->{options}->{"EtTextBaseHref"};
if (defined $base && $base ne '') {
if ($base !~ /\/$/) { $base .= '/'; }
}
if ($self->{options}->{"EtTextHrefsRelativeToTop"} && $base eq '') {
$base = '$(TOP/)';
}
$self->{base} = $base;
$self->{auto_links_re} = undef;
if (defined $self->{glossary}) {
$_ = join ('|', $self->{glossary}->get_auto_link_keys());
if ($_ ne '') { $self->{auto_links_re} = qr{$_}; }
}
# hmmm... tricky. Since it's very easy to accidentally put an auto link key
# inside HTML tags, for example if the key is "ntk" and the HTML tag is e.g.
# "ntk", we need to parse the document in a
# more lex-ish style, and skip any text inside links or in HTML tags.
LOOP:
{
if ($$html =~ /\G([^<]+)/gsc) {
$done .= $self->markup_ettext_segment($1);
redo LOOP;
}
if ($$html =~ /\G(]*href[^>]*>)(.*?)<\/a>/gisc) {
$done .= $self->markup_a_href($1, $2);
redo LOOP;
}
if ($$html =~ /\G(<[^>]*>)/gsc) {
$done .= $self->markup_html_segment($1);
redo LOOP;
}
}
$done =~ s{\Q${URL_PROTECTOR}\E}{}gs;
$$html = $done;
}
sub markup_a_href {
my ($self, $ahref, $linktext) = @_;
$_ = $self->markup_html_segment($ahref);
$_.$linktext."";
}
sub markup_html_segment {
my ($self) = shift;
local ($_) = shift;
# $2
\n\n";
}ges;
$$html =~ s{(^\n+|\n\n)([^\n]+)[ \t]*\n={3,}\n}{
my ($pre, $text, $name) = ($1, $2, make_a_name($2));
"$1$2
\n\n";
}ges;
$$html =~ s{(^\n+|\n\n)([^\n]+)[ \t]*\n\~{3,}\n}{
my ($pre, $text, $name) = ($1, $2, make_a_name($2));
"$1$2
\n\n";
}ges;
$$html =~ s{(^\n+|\n\n)([0-9A-Z][^a-z]+)[ \t]*\n\n}{
my ($pre, $text, $name) = ($1, $2, make_a_name($2));
"$1$2
\n\n";
}ges;
# now create HRs. Currently we don't bother looking at the
# character used, and so all hrs look the same; perhaps this
# would be a TODO. Not yet though.
$$html =~ s/\n-{10,} *\n/\n
\n\n/gs;
$$html =~ s/\n={10,} *\n/\n
\n\n/gs;
$$html =~ s/\n\~{10,} *\n/\n
\n\n/gs;
# break into paragraphs.
# $$html =~ s,\n\s*\n,\n
,gs; # but HR tags or headings don't need paras. # $$html =~ s{
\s* # (
]+|)>.*?<\/pre>)
# \s*<\/p>}{$1}gisx;
# $$html =~ s{\s*
# ((?:<[^>]+>\s*)
# *?)\s*<\/p>}{$1}gisx;
#
# $$html .= "
";
}
sub do_sidebars {
my ($self, $html) = @_;
# handle and blocks, used to do sidebars
# or images on paragraphs
$$html =~ s{\s*\s*(.*?)\s*\s*etleft\s*>\s*(.*?)\s*
}
{
$1
$2
}gis;
$$html =~ s{\s*\s*(.*?)\s*\s*etright\s*>\s*(.*?)\s*
}
{
$2
$1
}gis;
}
sub do_final_cleanup {
my ($self, $html) = @_;
local ($_);
# trim the spare para markers at start and end.
$$html =~ s,^\s*,,s;
$$html =~ s,\s*$,,s;
# $$html =~ s,^,
,s unless ($$html =~ m,^\s*
,);
# $$html =~ s,$,
,s unless ($$html =~ m,<\/p>\s*$,);
# Remove tags around blocks that do not contain any real text,
# and are instead just blocks of HTML tags.
$$html =~ s{
(.*?)<\/p>}{
$_ = $1;
if (/>[^<]*\S/) {
"
" . $_ . "
";
} elsif (!/>/) {
"" . $_ . "
";
} else {
$_;
}
}geis;
# trim para markers before the tag, in case one was
# present in the doc to start with. Ditto for after the .
$$html =~ s,^\s*<(?:/p|p)>\s*<(doctype|html),<$1,is;
$$html =~ s,(<\/html>)\s*<(?:/p|p)>\s*$,$1,is;
}
sub link_write {
my ($self, $was_glossary_link, $linklabel, $text) = @_;
my $url;
# see if the link label was a proper link specification instead
# of a symbolic one. Don't do this if it was wrapped by quotes,
# though.
if (defined $linklabel && !$was_glossary_link &&
$linklabel =~ /^(?:\$|http:|file:|ftp:)/i)
{
$url = $linklabel; goto gotone;
}
# check to see if there was a link label at all -- if
# there wasn't, we could have been a glossary link.
$linklabel ||= $text;
if (defined (($url = $self->{links}->{$linklabel}))) {
goto gotone;
}
elsif (defined (($url = $self->{auto_links}->{$linklabel}))) {
goto gotone;
}
elsif (defined ($self->{glossary}) &&
defined ($url = $self->{glossary}->get_auto_link($linklabel)))
{
goto gotone;
}
elsif (length $linklabel > 3 &&
defined ($self->{glossary}) &&
defined ($url = $self->{glossary}->get_link($linklabel)))
{
goto gotone;
}
elsif ($was_glossary_link)
{
warn "Link not found (use ''quotes'' to avoid warning): \"$linklabel\"\n";
return $text;
}
else
{
return $text;
}
gotone:
return "".$text."";
}
###########################################################################
sub protect_html {
my ($self, $html) = @_;
$$html =~ s/ /\016\001/gs;
$$html =~ s/\t/\016\002/gs;
$$html =~ s/\016\003/gs;
$$html =~ s/>/\016\004/gs;
$$html =~ s/\&/\016\005/gs;
$$html =~ s/\"/\016\006/gs;
$$html =~ s/\'/\016\007/gs;
$$html =~ s/\//\016\010/gs;
# skip 010-015, they're commonly used
$$html =~ s/:/\016\017/gs;
$$html =~ s/\n/\016\020/gs;
$$html =~ s/\[/\016\021/gs;
}
###########################################################################
sub unprotect_html {
my ($self, $html) = @_;
$$html =~ s/\016\001/ /gs;
$$html =~ s/\016\002/\t/gs;
$$html =~ s/\016\003//gs;
$$html =~ s/\016\005/\&/gs;
$$html =~ s/\016\006/\"/gs;
$$html =~ s/\016\007/\'/gs;
$$html =~ s/\016\010/\//gs;
# skip 010-015, they're commonly used
$$html =~ s/\016\017/:/gs;
$$html =~ s/\016\020/\n/gs;
$$html =~ s/\016\021/[/gs;
}
###########################################################################
sub markup_lists {
my ($self, $html) = @_;
my $lister = new Text::EtText::Lists();
$$html = $lister->run (split (/^/, $$html));
}
###########################################################################
1;
__END__
=back
=head1 MORE DOCUMENTATION
See also http://webmake.taint.org/ for more information.
=head1 SEE ALSO
C
C
C
C
C
C
C
C
=head1 AUTHOR
Justin Mason Ejm /at/ jmason.orgE
=head1 COPYRIGHT
WebMake is distributed under the terms of the GNU Public License.
=head1 AVAILABILITY
The latest version of this library is likely to be available from CPAN
as well as:
http://webmake.taint.org/
=cut