package HTML::Copy;
use 5.008;
use strict;
use warnings;
use File::Spec;
use File::Basename;
use File::Path;
use Cwd;
use IO::File;
use utf8;
use Encode;
use Encode::Guess;
use Carp;
use Data::Dumper;
use HTML::Parser 3.40;
use HTML::HeadParser;
use URI::file;
use base qw(HTML::Parser Class::Accessor);
__PACKAGE__->mk_accessors(qw(source_path
destination_path
link_attributes
has_base
source_uri
destination_uri));
#use Data::Dumper;
=head1 NAME
HTML::Copy - copy a HTML file without breaking links.
=head1 VERSION
Version 1.22
=cut
our $VERSION = '1.22';
=head1 SYMPOSIS
use HTML::Copy;
HTML::Copy->htmlcopy($source_path, $destination_path);
# or
$p = HTML::Copy->new($source_path);
$p->copy_to($destination_path);
=head1 DESCRIPTION
This module is to copy a HTML file without beaking links in the file. This module is a sub class of HTML::Parser.
=head1 REQUIRED MODULES
=over 2
=item L
=back
=head1 CLASS METHODS
=head2 htmlcopy
HTML::Copy->htmlcopy($source_path, $destination_path);
Parse contents of $source_path, change links and write into $destination_path.
=cut
sub htmlcopy($$$) {
my ($class, $source_path, $destination_path) = @_;
my $p = $class->new($source_path);
return $p->copy_to($destination_path);
}
=head2 parse_file
$html_text = HTML::Copy->parse_file($source_path, $destination_path);
Parse contents of $source_path and change links to copy into $destination_path. But don't make $destination_path. Just return modified HTML. The encoding of strings is converted into utf8.
=cut
sub parse_file($$$) {
my ($class, $source_path, $destination_path) = @_;
my $p = $class->new($source_path);
return $p->parse_to($destination_path);
}
=head1 CONSTRUCTOR METHODS
=head2 new
$p = HTML::Copy->new($source_path);
Make an instance of this module.
=cut
sub new {
my $class = shift @_;
my $self = $class->SUPER::new();
if (@_ > 1) {
my %args = @_;
my @keys = keys %args;
@$self{@keys} = @args{@keys};
} else {
$self->source_path(shift @_);
}
if ($self->source_path) {
(-e $self->source_path) or croak $self->source_path." is not found.\n";
$self->source_path($self->source_path);
}
$self->link_attributes(['src', 'href', 'background', 'csref', 'livesrc']);
# 'livesrc' and 'csref' are uesed in Adobe GoLive
$self->has_base(0);
return $self;
}
=head1 INSTANCE METHODS
=head2 copy_to
$p->copy_to($destination_path)
Parse contents of $source_path given in new method, change links and write into $destination_path.
=cut
sub copy_to {
my ($self, $destination_path) = @_;
$destination_path = $self->set_destination($destination_path);
my $io_layer = $self->io_layer();
my $fh = IO::File->new($destination_path, ">$io_layer");
if (defined $fh) {
$self->{'outputHTML'} = $fh;
$self->SUPER::parse($self->{'source_html'});
$self->eof;
$fh->close;
} else {
die "can't open $destination_path.";
}
return $self->destination_path;
}
=head2 parse_to
$p->parse_to($destination_path)
Parse contents of $source_path given in new method, change links and return HTML contents to wirte $destination_path. Unlike copy_to, $destination_path will not created.
=cut
sub parse_to {
my ($self, $destination_path) = @_;
$destination_path = $self->set_destination($destination_path);
$self->io_layer;
my $output = '';
my $fh = IO::File->new(\$output, ">:utf8");
$self->{'outputHTML'} = $fh;
$self->SUPER::parse($self->{'source_html'});
$self->eof;
$fh->close;
return decode_utf8($output);
}
=head1 ACCESSOR METHODS
=head2 io_layer
$p->io_layer;
$p->io_layer(':utf8');
Get and set PerlIO layer to read $source_path and to write $destination_path. Usualy it was automatically determined by $source_path's charset tag. If charset is not specified, Encode::Guess module will be used.
=cut
sub io_layer {
my $self = shift @_;
if (@_) {
$self->{'io_layer'} = shift @_;
}
else {
unless ($self->{'io_layer'}) {
$self->{'io_layer'} = $self->check_io_layer();
}
}
return $self->{'io_layer'};
}
=head2 encode_suspects
@suspects = $p->encode_sustects;
$p->encode_suspects(qw/shiftjis euc-jp/);
Add suspects of text encoding to guess the text encoding of the source HTML. If the source HTML have charset tag, it is not requred to add suspects.
=cut
sub encode_suspects {
my $self = shift @_;
if (@_) {
my @suspects = @_;
$self->{'EncodeSuspects'} = \@suspects;
}
if (my $suspects_ref = $self->{'EncodeSuspects'}) {
return @$suspects_ref;
}
else {
return ();
}
}
=head2 source_html
$p->source_html;
Obtain source HTML's contents
=cut
sub source_html {
my ($self) = @_;
$self->io_layer;
return $self->{'source_html'};
}
=head1 AUTHOR
Tetsuro KURITA
=cut
##== overriding methods of HTML::Parser
sub declaration { $_[0]->output("") }
sub process { $_[0]->output($_[2]) }
sub comment { $_[0]->output("") }
sub end { $_[0]->output($_[2]) }
sub text { $_[0]->output($_[1]) }
sub start {
my ($self, $tag, $attr_dict, $attr_names, $tag_text) = @_;
unless ($self->has_base) {
if ($tag eq 'base') {
$self->has_base(1);
}
my $is_changed = 0;
foreach my $an_attr (@{$self->link_attributes}) {
if (exists($attr_dict->{$an_attr})){
my $link_path = $attr_dict->{$an_attr};
next if ($link_path =~ /^\$/);
my $uri = URI->new($link_path);
next if ($uri->scheme);
$is_changed = 1;
$attr_dict->{$an_attr} = $self->change_link($uri);
}
}
if ($is_changed) {
my $attrs_text = $self->build_attributes($attr_dict, $attr_names);
$tag_text = "<$tag $attrs_text>";
}
}
$self->output($tag_text);
}
##== private functions
sub set_destination {
my ($self, $destination_path) = @_;
if (-d $destination_path) {
my $file_name = basename($self->source_path);
$destination_path = File::Spec->catfile($destination_path, $file_name);
} else {
mkpath(dirname($destination_path));
}
return $self->destination_path($destination_path);
}
sub check_encoding {
my ($self) = @_;
my $data;
open my $in, "<", $self->source_path;
{local $/; $data = <$in>;}
close $in;
my $p = HTML::HeadParser->new;
$p->utf8_mode(1);
$p->parse($data);
my $content_type = $p->header('content-type');
my $encoding = '';
if ($content_type) {
if ($content_type =~ /charset\s*=(.+)/) {
$encoding = $1;
}
}
unless ($encoding) {
my $decoder;
if (my @suspects = $self->encode_suspects) {
$decoder = Encode::Guess->guess($data, @suspects);
}
else {
$decoder = Encode::Guess->guess($data);
}
ref($decoder) or
die("Can't guess encoding of ".$self->source_path);
$encoding = $decoder->name;
}
$self->{'source_html'} = Encode::decode($encoding, $data);
return $encoding;
}
sub check_io_layer {
my ($self) = @_;
my $encoding = $self->check_encoding;
return '' unless ($encoding);
my $io_layer = '';
if (grep {/$encoding/} ('utf8', 'utf-8', 'UTF-8') ) {
$io_layer = ":utf8";
}
else {
$io_layer = ":encoding($encoding)";
}
return $io_layer;
}
sub build_attributes {
my ($self, $attr_dict, $attr_names) = @_;
my @attrs = ();
foreach my $attr_name (@{$attr_names}) {
if ($attr_name eq '/') {
push @attrs, '/';
} else {
my $attr_value = $attr_dict->{$attr_name};
push @attrs, "$attr_name=\"$attr_value\"";
}
}
return join(' ', @attrs);
}
sub change_link {
my ($self, $uri) = @_;
my $result_uri;
my $abs_uri = $uri->abs( $self->source_uri );
my $abs_path = $abs_uri->file;
if (-e $abs_path) {
$result_uri = $abs_uri->rel($self->destination_uri);
} else {
warn("$abs_path is not found.\nThe link to this path is not changed.\n");
$result_uri = $uri;
}
return $result_uri->as_string;
}
sub output {
my ($self, $out_text) = @_;
$self->{'outputHTML'}->print($out_text);
}
sub source_path {
my $self = shift @_;
if (@_) {
my $path = Cwd::abs_path(shift @_);
$self->{'source_path'} = $path;
$self->source_uri(URI::file->new($path));
}
return $self->{'source_path'};
}
sub destination_path {
my $self = shift @_;
if (@_) {
my $path = Cwd::abs_path(shift @_);
$self->{'destination_path'} = $path;
$self->destination_uri(URI::file->new($path));
}
return $self->{'destination_path'};
}
1;