#!/usr/bin/perl -wT # # W3C Link Checker # by Hugo Haas # (c) 1999-2006 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # # $Id: checklink,v 4.42 2006/10/22 16:48:13 ville Exp $ # # This program is licensed under the W3C(r) Software License: # http://www.w3.org/Consortium/Legal/copyright-software # # The documentation is at: # http://validator.w3.org/docs/checklink.html # # See the CVSweb interface at: # http://dev.w3.org/cvsweb/perl/modules/W3C/LinkChecker/ # # An online version is available at: # http://validator.w3.org/checklink # # Comments and suggestions should be sent to the www-validator mailing list: # www-validator@w3.org (with 'checklink' in the subject) # http://lists.w3.org/Archives/Public/www-validator/ (archives) use strict; # Get rid of potentially unsafe and unneeded environment variables. delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)}); $ENV{PATH} = ''; # undef would output warnings with Perl 5.6.1's Cwd.pm. # ----------------------------------------------------------------------------- package W3C::UserAgent; use LWP::RobotUA 1.19 qw(); # @@@ Needs also W3C::LinkChecker but can't use() it here. @W3C::UserAgent::ISA = qw(LWP::RobotUA); sub new { my $proto = shift; my $class = ref($proto) || $proto; my ($name, $from, $rules) = @_; # For security/privacy reasons, if $from was not given, do not send it. # Cheat by defining something for the constructor, and resetting it later. my $from_ok = $from; $from ||= 'www-validator@w3.org'; # WWW::RobotRules <= 5.78 have bugs which cause suboptimal results with # User-Agent substring matching against robots.txt files; "User-Agent: *" # should work ok with all though, and "User-Agent: W3C-checklink" for >= 5.77 my $self = $class->SUPER::new($name, $from, $rules); $self->from(undef) unless $from_ok; $self->env_proxy(); return $self; } sub simple_request { my $self = shift; my $response = do { local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902 warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); }; $self->W3C::UserAgent::SUPER::simple_request(@_); }; if (! defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message() || '(no message)'; } return $response; } sub redirect_ok { my ($self, $request, $response) = @_; if ($self->{Checklink_verbose_progress}) { # @@@ TODO: when an LWP internal robots.txt request gets redirected, # this will a bit confusingly print out info about it. Would need a # robust way of determining whether something is a LWP "internal" request. &W3C::LinkChecker::hprintf("\n%s %s ", $request->method(),$request->uri()); } return 0 unless $self->SUPER::redirect_ok($request, $response); if (my $res = &W3C::LinkChecker::ip_allowed($request->uri())) { $response->previous($response->clone()); $response->request($request); $response->code($res->code()); $response->message($res->message()); return 0; } return 1; } # ----------------------------------------------------------------------------- package W3C::LinkChecker; use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION $DocType $Head $Accept $ContentTypes %Cfg); use HTML::Entities qw(); use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier use HTTP::Request qw(); use HTTP::Response qw(); use Time::HiRes qw(); use URI qw(); use URI::Escape qw(); use URI::file qw(); # @@@ Needs also W3C::UserAgent but can't use() it here. use constant RC_ROBOTS_TXT => -1; use constant RC_DNS_ERROR => -2; use constant LINE_UNKNOWN => -1; use constant MP2 => (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); @W3C::LinkChecker::ISA = qw(HTML::Parser); BEGIN { # Version info $PACKAGE = 'W3C Link Checker'; $PROGRAM = 'W3C-checklink'; $VERSION = '4.3'; $REVISION = sprintf('version %s (c) 1999-2006 W3C', $VERSION); my ($cvsver) = q$Revision: 4.42 $ =~ /(\d+[\d\.]*\.\d+)/; $AGENT = sprintf('%s/%s [%s] %s', $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent()); # Pull in mod_perl modules if applicable. eval { local $SIG{__DIE__}; require Apache2::RequestUtil; } if MP2(); my @content_types = qw(application/xhtml+xml text/html); $Accept = join(', ', @content_types) . ', */*;q=0.5'; my $re = join('|', map { s/\+/\\+/g; $_ } @content_types); $ContentTypes = qr{\b(?:$re)\b}io; # # Read configuration. If the W3C_CHECKLINK_CFG environment variable has # been set or the default contains a non-empty file, read it. Otherwise, # skip silently. # my $defaultconfig = '/etc/w3c/checklink.conf'; if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) { require Config::General; Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig; eval { my %config_opts = ( -ConfigFile => $conffile, -SplitPolicy => 'equalsign', -AllowMultiOptions => 'no', ); %Cfg = Config::General->new(%config_opts)->getall(); }; if ($@) { die <<".EOF."; Failed to read configuration from '$conffile': $@ .EOF. } } $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s'; $Cfg{CSS_Validator_URI} ||= 'http://jigsaw.w3.org/css-validator/validator?uri=%s'; $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html'; $Cfg{Style_URI} ||= 'http://validator.w3.org/docs/linkchecker.css'; $DocType = ''; $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $Cfg{Style_URI}); EOF # Trusted environment variables that need laundering in taint mode. foreach (qw(NNTPSERVER NEWSHOST)) { ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_}; } # Use passive FTP by default, see Net::FTP(3). $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); } # Autoflush $| = 1; # Different options specified by the user my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); my %Opts = ( Command_Line => $cmdline, Quiet => 0, Summary_Only => 0, Verbose => 0, Progress => 0, HTML => 0, Timeout => 60, Redirects => 1, Dir_Redirects => 1, Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, HTTP_Proxy => undef, Hide_Same_Realm => 0, Depth => 0, # < 0 means unlimited recursion. Sleep_Time => 1, Max_Documents => 150, # For the online version. User => undef, Password => undef, Base_Locations => [], Exclude_Docs => undef, Masquerade => 0, Masquerade_From => '', Masquerade_To => '', Trusted => $Cfg{Trusted}, Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ? $Cfg{Allow_Private_IPs} : $cmdline, ); undef $cmdline; unless ($Opts{Allow_Private_IPs}) { eval { require Net::IP; require Socket; Socket->import('inet_ntoa'); require Net::hostent; }; if ($@) { die <<".EOF."; Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and Net::hostent modules: $@ .EOF. } } # Global variables # What is our query? my $query; # What URI's did we process? (used for recursive mode) my %processed; # Result of the HTTP query my %results; # List of redirects my %redirects; # Count of the number of documents checked my $doc_count = 0; # Time stamp my $timestamp = &get_timestamp(); &parse_arguments() if $Opts{Command_Line}; # Precompile/error-check regular expressions. if (defined($Opts{Exclude_Docs})) { eval { $Opts{Exclude_Docs} = qr/$Opts{Exclude_Docs}/o; }; &usage(1, "Error in exclude-docs regexp: $@") if $@; } if (defined($Opts{Trusted})) { eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; &usage(1, "Error in trusted domains regexp: $@") if $@; } my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address # @@@ make number of keep-alive connections customizable $ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection $ua->delay($Opts{Sleep_Time}/60); $ua->timeout($Opts{Timeout}); $ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy}; if ($Opts{Command_Line}) { require Text::Wrap; Text::Wrap->import('wrap'); &usage(1) unless @ARGV; $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output &ask_password() if ($Opts{User} && !$Opts{Password}); my $is_first = 1; my @bases = @{$Opts{Base_Locations}}; foreach my $uri (@ARGV) { if (!$Opts{Summary_Only}) { printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML}; } else { $Opts{Verbose} = 0; $Opts{Progress} = 0; } # Reset base locations so that previous URI's given on the command line # won't affect the recursion scope for this URI @{$Opts{Base_Locations}} = @bases; # Transform the parameter into a URI $uri = &urize($uri); &check_uri($uri, $is_first, $Opts{Depth}, undef, 1); $is_first &&= 0; } undef $is_first; if ($Opts{HTML}) { &html_footer(); } elsif (($doc_count > 0) && !$Opts{Summary_Only}) { printf("\n%s\n", &global_stats()); } } else { require CGI; require CGI::Carp; CGI::Carp->import(qw(fatalsToBrowser)); $query = new CGI; # Set a few parameters in CGI mode $Opts{Verbose} = 0; $Opts{Progress} = 0; $Opts{HTML} = 1; $Opts{_Self_URI} = $query->url(-relative => 1); # Backwards compatibility my $uri = undef; if ($uri = $query->param('url')) { $query->param('uri', $uri) unless $query->param('uri'); $query->delete('url'); } $uri = $query->param('uri'); if (! $uri) { &html_header('', 1); # Set cookie only from results page. &print_form($query); &html_footer(); exit; } # Backwards compatibility if ($query->param('hide_dir_redirects')) { $query->param('hide_redirects', 'on'); $query->param('hide_type', 'dir'); $query->delete('hide_dir_redirects'); } $Opts{Summary_Only} = 1 if $query->param('summary'); if ($query->param('hide_redirects')) { $Opts{Dir_Redirects} = 0; if (my $type = $query->param('hide_type')) { $Opts{Redirects} = 0 if ($type ne 'dir'); } else { $Opts{Redirects} = 0; } } $Opts{Accept_Language} = undef if $query->param('no_accept_language'); $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); if (my $depth = $query->param('depth')) { # @@@ Ignore invalid depth silently for now. $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/); } # Save, clear or leave cookie as is. my $cookie = ''; if (my $action = $query->param('cookie')) { my %cookie = (-name => $PROGRAM); if ($action eq 'clear') { # Clear the cookie. $cookie{-value} = ''; $cookie{-expires} = '-1M'; } else { # Always refresh the expiration time. $cookie{-expires} = '+1M'; if ($action eq 'set') { # Set the options. my %options = $query->Vars(); delete($options{$_}) for qw(url uri check cookie); # Non-persistent. $cookie{-value} = \%options; } else { # Use the old values. $cookie{-value} = { $query->cookie($PROGRAM) }; } } $cookie = $query->cookie(%cookie); } undef $query; # Not needed any more. # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. # If we're under mod_perl, there is a way around it... eval { local $SIG{__DIE__}; my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization}; $ENV{HTTP_AUTHORIZATION} = $auth if $auth; } if (MP2() && !$ENV{HTTP_AUTHORIZATION}); $uri =~ s/^\s+//g; if ($uri =~ m/^file:/) { # Only the http scheme is allowed # TODO: bug 29 &file_uri($uri); } elsif ($uri !~ m/:/) { if ($uri =~ m|^//|) { $uri = 'http:'.$uri; } else { $uri = 'http://'.$uri; } } &check_uri($uri, 1, $Opts{Depth}, $cookie); &html_footer(); } ############################################################################### ################################ # Command line and usage stuff # ################################ sub parse_arguments () { require Getopt::Long; Getopt::Long->require_version(2.17); Getopt::Long->import('GetOptions'); Getopt::Long::Configure('bundling', 'no_ignore_case'); my $masq = ''; my @locs = (); GetOptions('help|h|?' => sub { usage(0) }, 'q|quiet' => sub { $Opts{Quiet} = 1; $Opts{Summary_Only} = 1; }, 's|summary' => \$Opts{Summary_Only}, 'b|broken' => sub { $Opts{Redirects} = 0; $Opts{Dir_Redirects} = 0; }, 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; }, 'v|verbose' => \$Opts{Verbose}, 'i|indicator' => \$Opts{Progress}, 'H|html' => \$Opts{HTML}, 'r|recursive' => sub { $Opts{Depth} = -1 if $Opts{Depth} == 0; }, 'l|location=s' => \@locs, 'exclude-docs=s', => \$Opts{Exclude_Docs}, 'u|user=s' => \$Opts{User}, 'p|password=s' => \$Opts{Password}, 't|timeout=i' => \$Opts{Timeout}, 'S|sleep=i' => \$Opts{Sleep_Time}, 'L|languages=s' => \$Opts{Accept_Language}, 'D|depth=i' => sub { $Opts{Depth} = $_[1] unless $_[1] == 0; }, 'd|domain=s' => \$Opts{Trusted}, 'masquerade=s' => \$masq, 'hide-same-realm' => \$Opts{Hide_Same_Realm}, 'V|version' => \&version, # Deprecated options: 'n|noacclanguage' => sub { warn("*** Warning: The " . "-n/--noacclanguage option is " . "deprecated and has no effect.\n"); }, 'y|proxy=s' => sub { warn("*** Warning: The -y/--proxy " . "option is deprecated, use the " . "http_proxy\n " . "environment variable instead.\n"); $Opts{HTTP_Proxy} = $_[1]; }, ) || usage(1); if ($masq) { $Opts{Masquerade} = 1; my @masq = split(/\s+/, $masq); if (scalar(@masq) != 2 || !defined($masq[0]) || $masq[0] !~ /\S/ || !defined($masq[1]) || $masq[1] !~ /\S/) { usage(1, "Error: --masquerade takes two whitespace separated URIs."); } else { $Opts{Masquerade_From} = $masq[0]; $Opts{Masquerade_To} = $masq[1]; } } if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') { $Opts{Accept_Language} = &guess_language(); } if (($Opts{Sleep_Time} || 0) < 1) { warn("*** Warning: minimum allowed sleep time is 1 second, resetting.\n"); $Opts{Sleep_Time} = 1; } push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs); $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); } sub version () { print "$PACKAGE $REVISION\n"; exit 0; } sub usage () { my ($exitval, $msg) = @_; $exitval = 0 unless defined($exitval); $msg ||= ''; $msg =~ s/[\r\n]*$/\n\n/ if $msg; die($msg) unless $Opts{Command_Line}; my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only'; select(STDERR) if $exitval; print "$msg$PACKAGE $REVISION Usage: checklink Options: -s, --summary Result summary only. -b, --broken Show only the broken links, not the redirects. -e, --directory Hide directory redirects, for example http://www.w3.org/TR -> http://www.w3.org/TR/ -r, --recursive Check the documents linked from the first one. -D, --depth N Check the documents linked from the first one to depth N (implies --recursive). -l, --location URI Scope of the documents checked in recursive mode (implies --recursive). Can be specified multiple times. If not specified, the default eg. for http://www.w3.org/TR/html4/Overview.html would be http://www.w3.org/TR/html4/ --exclude-docs REGEXP In recursive mode, do not check links in documents whose URIs match REGEXP. -L, --languages LANGS Accept-Language header to send. The special value 'auto' causes autodetection from the environment. -q, --quiet No output if no errors are found (implies -s). -v, --verbose Verbose mode. -i, --indicator Show progress while parsing. -u, --user USERNAME Specify a username for authentication. -p, --password PASSWORD Specify a password. --hide-same-realm Hide 401's that are in the same realm as the document checked. -S, --sleep SECS Sleep SECS seconds between requests to each server (default and minimum: 1 second). -t, --timeout SECS Timeout for requests (in seconds). -d, --domain DOMAIN Regular expression describing the domain to which authentication information will be sent (default: $trust). --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2. See the manual page for more information. -H, --html HTML output. -?, -h, --help Show this message and exit. -V, --version Output version information and exit. See \"perldoc LWP\" for information about proxy server support, \"perldoc Net::FTP\" for information about various environment variables affecting FTP connections and \"perldoc Net::NNTP\" for setting a default NNTP server for news: URIs. The W3C_CHECKLINK_CFG environment variable can be used to set the configuration file to use. See details in the full manual page, it can be displayed with: perldoc checklink More documentation at: $Cfg{Doc_URI} Please send bug reports and comments to the www-validator mailing list: www-validator\@w3.org (with 'checklink' in the subject) Archives are at: http://lists.w3.org/Archives/Public/www-validator/ "; exit $exitval; } sub ask_password () { eval { local $SIG{__DIE__}; require Term::ReadKey; Term::ReadKey->require_version(2.00); Term::ReadKey->import(qw(ReadMode)); }; if ($@) { warn('Warning: Term::ReadKey 2.00 or newer not available, ' . "password input disabled.\n"); return; } printf(STDERR 'Enter the password for user %s: ', $Opts{User}); ReadMode('noecho', *STDIN); chomp($Opts{Password} = ); ReadMode('restore', *STDIN); print(STDERR "ok.\n"); } ############################################################################### ########################################################################### # Guess an Accept-Language header based on the $LANG environment variable # ########################################################################### sub guess_language () { my $lang = $ENV{LANG} or return undef; $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro... return 'en' if ($lang eq 'C' || $lang eq 'POSIX'); my $res = undef; eval { require Locale::Language; if (my $tmp = Locale::Language::language2code($lang)) { $lang = $tmp; } if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) { if (Locale::Language::code2language($l)) { $res = $l; if ($c) { require Locale::Country; $res .= "-$c" if Locale::Country::code2country($c); } } } }; return $res; } ########################################### # Transform foo into file://localhost/foo # ########################################### sub urize ($) { my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd()); return $u->as_string(); } ######################################## # Check for broken links in a resource # ######################################## sub check_uri ($$$;$$) { my ($uri, $is_first, $depth, $cookie, $is_start) = @_; $is_start ||= $is_first; my $start = &get_timestamp() unless $Opts{Quiet}; # Get and parse the document my $response = &get_document('GET', $uri, $doc_count, \%redirects); # Can we check the resource? If not, we exit here... return -1 if defined($response->{Stop}); if ($is_start) { # Starting point of a new check, eg. from the command line # Use the first URI as the recursion base unless specified otherwise. push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical()) unless @{$Opts{Base_Locations}}; } else { # Before fetching the document, we don't know if we'll be within the # recursion scope or not (think redirects). return -1 unless &in_recursion_scope($response->{absolute_uri}); print $Opts{HTML} ? '
' : '-' x 40, "\n"; } # We are checking a new document $doc_count++; if ($Opts{HTML}) { &html_header($uri, 0, $cookie) if $is_first; print('

'); } my $absolute_uri = $response->{absolute_uri}->as_string(); my $result_anchor = 'results'.$doc_count; if ($is_first && !$Opts{HTML} && !$Opts{Summary_Only}) { my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; my $acclang = $Opts{Accept_Language} || '(not sent)'; printf(<<'EOF', $Accept, $acclang, $Opts{Sleep_Time}, $s); Settings used: - Accept: %s - Accept-Language: %s - Sleeping %d second%s between requests to each server EOF } printf("\nProcessing\t%s\n\n", $Opts{HTML} ? &show_url($absolute_uri) : $absolute_uri) unless $Opts{Quiet}; if ($Opts{HTML}) { print("

\n"); if (! $Opts{Summary_Only}) { my $accept = &encode($Accept); my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; printf(<<'EOF', $accept, $acclang, $Opts{Sleep_Time}, $s);
Settings used:
EOF printf("

Go to the results.

\n", $result_anchor); my $esc_uri = URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9."); printf("

For reliable link checking results, check HTML validity first. See also CSS validity.

Back to the link checker.

\n", &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)), &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), &encode($Opts{_Self_URI})); print("
\n");
    }
  }

  if ($Opts{Summary_Only} && !$Opts{Quiet}) {
    print '

' if $Opts{HTML}; print 'This may take some time'; print "... (why?)

" if $Opts{HTML}; print " if the document has many links to check.\n" unless $Opts{HTML}; } # Record that we have processed this resource $processed{$absolute_uri} = 1; # Parse the document my $p = &parse_document($uri, $absolute_uri, $response->content(), 1, $depth != 0); my $base = URI->new($p->{base}); # Check anchors ############### print "Checking anchors...\n" unless $Opts{Summary_Only}; my %errors; foreach my $anchor (keys %{$p->{Anchors}}) { my $times = 0; foreach my $l (keys %{$p->{Anchors}{$anchor}}) { $times += $p->{Anchors}{$anchor}{$l}; } # They should appear only once $errors{$anchor} = 1 if ($times > 1); # Empty IDREF's are not allowed $errors{$anchor} = 1 if ($anchor eq ''); } print " done.\n" unless $Opts{Summary_Only}; # Check links ############# my %links; # Record all the links found foreach my $link (keys %{$p->{Links}}) { my $link_uri = URI->new($link); my $abs_link_uri = URI->new_abs($link_uri, $base); # Work around a bug in URI::sip(s) (URI 1.22 - 1.30). $abs_link_uri = $link_uri if (!defined($abs_link_uri) && $link_uri->scheme() =~ /^sips?$/); if ($Opts{Masquerade}) { if ($abs_link_uri =~ m|^$Opts{Masquerade_From}|) { printf("processing %s in base %s\n", $abs_link_uri, $Opts{Masquerade_To}); my $nlink = $abs_link_uri; $nlink =~ s|^$Opts{Masquerade_From}|$Opts{Masquerade_To}|; $abs_link_uri = URI->new($nlink); }; } foreach my $lines (keys %{$p->{Links}{$link}}) { my $canonical = URI->new($abs_link_uri->canonical()); my $url = $canonical->scheme().':'.$canonical->opaque(); my $fragment = $canonical->fragment(); if (! $fragment) { # Document without fragment $links{$url}{location}{$lines} = 1; } else { # Resource with a fragment $links{$url}{fragments}{$fragment}{$lines} = 1; } } } # Build the list of broken URI's my %broken; foreach my $u (keys %links) { # Don't check mailto: URI's # TODO: bug 29 next if ($u =~ m/^mailto:/); if ($Opts{Summary_Only}) { # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896 print ' ' if ($Opts{HTML} && !$Opts{Command_Line}); } else { &hprintf("Checking link %s\n", $u); } # Check that a link is valid &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)), \%links, \%redirects); &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) if ($Opts{Verbose}); if ($results{$u}{location}{success}) { # Even though it was not broken, we might want to display it # on the results page (e.g. because it required authentication) $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400); # List the broken fragments foreach my $fragment (keys %{$links{$u}{fragments}}) { if ($Opts{Verbose}) { my @frags = sort {$a<=>$b} keys %{$links{$u}{fragments}{$fragment}}; &hprintf("\t\t%s %s - Line%s: %s\n", $fragment, ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found', (scalar(@frags) > 1) ? 's' : '', join(', ', @frags) ); } # A broken fragment? if ($results{$u}{fragments}{$fragment} == 0) { $broken{$u}{fragments}{$fragment} += 2; } } } else { # Couldn't find the document $broken{$u}{location} = 1; # All the fragments associated are hence broken foreach my $fragment (keys %{$links{$u}{fragments}}) { $broken{$u}{fragments}{$fragment}++; } } } &hprintf("Processed in %ss.\n", &time_diff($start, &get_timestamp())) unless $Opts{Summary_Only}; # Display results if ($Opts{HTML} && !$Opts{Summary_Only}) { print("
\n"); printf("

Results

\n", $result_anchor); } print "\n" unless $Opts{Quiet}; &anchors_summary($p->{Anchors}, \%errors); &links_summary(\%links, \%results, \%broken, \%redirects); # Do we want to process other documents? if ($depth != 0) { foreach my $u (keys %links) { next unless $results{$u}{location}{success}; # Broken link? next unless &in_recursion_scope($u); # Do we understand its content type? next unless ($results{$u}{location}{type} =~ $ContentTypes); # Have we already processed this URI? next if &already_processed($u); # Do the job print "\n"; if ($Opts{HTML}) { if (!$Opts{Command_Line}) { if ($doc_count == $Opts{Max_Documents}) { print("
\n

Maximum number of documents reached!

\n"); } if ($doc_count >= $Opts{Max_Documents}) { $doc_count++; print("

Not checking $u

\n"); $processed{$u} = 1; next; } } } if ($depth < 0) { &check_uri($u, 0, -1); } else { &check_uri($u, 0, $depth-1); } } } } ####################################### # Get and parse a resource to process # ####################################### sub get_document ($$$;\%) { my ($method, $uri, $in_recursion, $redirects) = @_; # $method contains the HTTP method the use (GET or HEAD) # $uri contains the identifier of the resource # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least # the second resource checked) # $redirects is a pointer to the hash containing the map of the redirects # Get the resource my $response; if (defined($results{$uri}{response}) && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $response = $results{$uri}{response}; } else { $response = &get_uri($method, $uri); &record_results($uri, $method, $response); &record_redirects($redirects, $response); } if (! $response->is_success()) { if (! $in_recursion) { # Is it too late to request authentication? if ($response->code() == 401) { &authentication($response); } else { if ($Opts{HTML}) { &html_header($uri); print "

"; } &hprintf("\nError: %d %s\n", $response->code(), $response->message() || '(no message)'); print "

\n" if $Opts{HTML}; } } $response->{Stop} = 1; return($response); } # What is the URI of the resource that we are processing by the way? my $base_uri = URI->new($response->base()); my $request_uri = URI->new($response->request->url); $response->{absolute_uri} = $request_uri->abs($base_uri); # Can we parse the document? my $failed_reason; my $ct = $response->header('Content-Type'); my $ce = $response->header('Content-Encoding'); if (!$ct || $ct !~ $ContentTypes) { $failed_reason = "Content-Type for <$request_uri> is " . (defined($ct) ? "'$ct'" : 'undefined'); } elsif (defined($ce) && $ce ne 'identity') { # @@@ We could maybe handle gzip... $failed_reason = "Content-Encoding for <$request_uri> is '$ce'"; } if ($failed_reason) { # No, there is a problem... if (! $in_recursion) { if ($Opts{HTML}) { &html_header($uri); print "

\n"; } &hprintf("Can't check links: %s.\n", $failed_reason); print "

\n" if $Opts{HTML}; } $response->{Stop} = 1; } # Ok, return the information return($response); } ######################################################### # Check whether a URI is within the scope of recursion. # ######################################################### sub in_recursion_scope ($) { my ($uri) = @_; return undef unless $uri; my $candidate = URI->new($uri)->canonical(); return undef if (defined($Opts{Exclude_Docs}) && $candidate =~ $Opts{Exclude_Docs}); foreach my $base (@{$Opts{Base_Locations}}) { my $rel = $candidate->rel($base); next if ($candidate eq $rel); # Relative path not possible? next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards? return 1; } return undef; # We always have at least one base location. } ################################################## # Check whether a URI has already been processed # ################################################## sub already_processed ($) { my ($uri) = @_; # Don't be verbose for that part... my $summary_value = $Opts{Summary_Only}; $Opts{Summary_Only} = 1; # Do a GET: if it fails, we stop, if not, the results are cached my $response = &get_document('GET', $uri, 1); # ... but just for that part $Opts{Summary_Only} = $summary_value; # Can we process the resource? return -1 if defined($response->{Stop}); # Have we already processed it? return 1 if defined($processed{$response->{absolute_uri}->as_string()}); # It's not processed yet and it is processable: return 0 return 0; } ############################ # Get the content of a URI # ############################ sub get_uri ($$;$\%$$$$) { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_; # $method contains the method used # $uri contains the target of the request # $start is a timestamp (not defined the first time the function is # called) # $redirects is a map of redirects # $code is the first HTTP return code # $realm is the realm of the request # $message is the HTTP message received # $auth equals 1 if we want to send out authentication information # For timing purposes $start = &get_timestamp() unless defined($start); # Prepare the query # Do we want printouts of progress? my $verbose_progress = ! ($Opts{Summary_Only} || (!$doc_count && $Opts{HTML})); &hprintf("%s %s ", $method, $uri) if $verbose_progress; my $request = new HTTP::Request($method, $uri); $request->header('Accept-Language' => $Opts{Accept_Language}) if $Opts{Accept_Language}; $request->header('Accept', $Accept); # Are we providing authentication info? if ($auth && $request->url()->host() =~ $Opts{Trusted}) { if (defined($ENV{HTTP_AUTHORIZATION})) { $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } elsif (defined($Opts{User}) && defined($Opts{Password})) { $request->authorization_basic($Opts{User}, $Opts{Password}); } } # Tell the user agent if we want progress reports (in redirects) or not. $ua->{Checklink_verbose_progress} = $verbose_progress; # Check if the IP address is allowed. my $response = &ip_allowed($request->uri()); return $response if $response; # Do the query $response = $ua->request($request); # Get the results # Record the very first response if (! defined($code)) { ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); } # Authentication requested? if ($response->code() == 401 && !defined($auth) && (defined($ENV{HTTP_AUTHORIZATION}) || (defined($Opts{User}) && defined($Opts{Password})))) { # Set host as trusted domain unless we already have one. if (!$Opts{Trusted}) { my $re = sprintf('^%s$', quotemeta($response->base()->host())); $Opts{Trusted} = qr/$re/io; } # Deal with authentication and avoid loops if (! defined($realm)) { $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; $realm = $1; } print "\n" if $verbose_progress; return &get_uri($method, $response->request()->url(), $start, $redirects, $code, $realm, $message, 1); } # @@@ subtract robot delay from the "fetched in" time? &hprintf(" fetched in %ss\n", &time_diff($start, &get_timestamp())) if $verbose_progress; $response->{Realm} = $realm if defined($realm); return $response; } ######################################### # Record the results of an HTTP request # ######################################### sub record_results ($$$) { my ($uri, $method, $response) = @_; $results{$uri}{response} = $response; $results{$uri}{method} = $method; $results{$uri}{location}{code} = $response->code(); $results{$uri}{location}{code} = RC_ROBOTS_TXT() if ($results{$uri}{location}{code} == 403 && $response->message() =~ /Forbidden by robots\.txt/); $results{$uri}{location}{code} = RC_DNS_ERROR() if ($results{$uri}{location}{code} == 500 && $response->message() =~ /Bad hostname '[^\']*'/); $results{$uri}{location}{type} = $response->header('Content-type'); $results{$uri}{location}{display} = $results{$uri}{location}{code}; # Rewind, check for the original code and message. for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) { $results{$uri}{location}{orig} = $tmp->code(); $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)'; } $results{$uri}{location}{success} = $response->is_success(); # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm}; } # What type of broken link is it? (stored in {record} - the {display} # information is just for visual use only) if (($results{$uri}{location}{display} == 401) && ($results{$uri}{location}{code} == 404)) { $results{$uri}{location}{record} = 404; } else { $results{$uri}{location}{record} = $results{$uri}{location}{display}; } # Did it fail? $results{$uri}{location}{message} = $response->message() || '(no message)'; if (! $results{$uri}{location}{success}) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}) if ($Opts{Verbose}); return; } } #################### # Parse a document # #################### sub parse_document ($$$$$) { my ($uri, $location, $document, $links, $rec_needs_links) = @_; my $p; if (defined($results{$uri}{parsing})) { # We have already done the job. Woohoo! $p->{base} = $results{$uri}{parsing}{base}; $p->{Anchors} = $results{$uri}{parsing}{Anchors}; $p->{Links} = $results{$uri}{parsing}{Links}; return $p; } my $start; $p = W3C::LinkChecker->new(); $p->{base} = $location; if (! $Opts{Summary_Only}) { $start = &get_timestamp(); print("Parsing...\n"); } if (!$Opts{Summary_Only} || $Opts{Progress}) { $p->{Total} = ($document =~ tr/\n//); } # We only look for anchors if we are not interested in the links # obviously, or if we are running a recursive checking because we # might need this information later $p->{only_anchors} = !($links || $rec_needs_links); # Transform into for parsing # Processing instructions are not parsed by process, but in this case # it should be. It's expensive, it's horrible, but it's the easiest way # for right now. $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors}; $p->parse($document); if (! $Opts{Summary_Only}) { my $stop = &get_timestamp(); print "\r" if $Opts{Progress}; &hprintf(" done (%d lines in %ss).\n", $p->{Total}, &time_diff($start, $stop)); } # Save the results before exiting $results{$uri}{parsing}{base} = $p->{base}; $results{$uri}{parsing}{Anchors} = $p->{Anchors}; $results{$uri}{parsing}{Links} = $p->{Links}; return $p; } #################################### # Constructor for W3C::LinkChecker # #################################### sub new { my $p = HTML::Parser::new(@_, api_version => 3); eval { local $SIG{__DIE__}; $p->utf8_mode(1); }; # Start tags $p->handler(start => 'start', 'self, tagname, attr, text, line'); # Declarations $p->handler(declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, 'self, text, line'); # Other stuff $p->handler(default => 'parse_progress', 'self, line') if $Opts{Progress}; # Check ? $p->{check_name} = 1; # Check <[..] id="..">? $p->{check_id} = 1; # Don't interpret comment loosely $p->strict_comment(1); return $p; } ################################################# # Record or return the doctype of the document # ################################################# sub doctype { my ($self, $dc) = @_; return $self->{doctype} unless $dc; $_ = $self->{doctype} = $dc; # What to look for depending on the doctype $self->{check_name} = 0 if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN'); # Check for the id tag if ( # HTML 2.0 & 3.0 m%^-//IETF//DTD HTML [23]\.0//% || # HTML 3.2 m%^-//W3C//DTD HTML 3\.2//%) { $self->{check_id} = 0; } # Enable XML extensions $self->xml_mode(1) if (m%^-//W3C//DTD XHTML %); } ################################### # Print parse progress indication # ################################### sub parse_progress { my ($self, $line) = @_; printf("\r%4d%%", int($line/$self->{Total}*100)) if (defined($line) && $line >= 0); } ############################# # Extraction of the anchors # ############################# sub get_anchor { my ($self, $tag, $attr) = @_; my $anchor = $attr->{id} if $self->{check_id}; if ($self->{check_name} && ($tag eq 'a')) { # @@@@ In XHTML, is mandatory # Force an error if it's not the case (or if id's and name's values # are different) # If id is defined, name if defined must have the same value $anchor ||= $attr->{name}; } return $anchor; } ############################# # W3C::LinkChecker handlers # ############################# sub add_link { my ($self, $uri, $line) = @_; $self->{Links}{$uri}{$line}++ if defined($uri); } sub start { my ($self, $tag, $attr, $text, $line) = @_; $line = LINE_UNKNOWN() unless defined($line); # Anchors my $anchor = $self->get_anchor($tag, $attr); $self->{Anchors}{$anchor}{$line}++ if defined($anchor); # Links if (!$self->{only_anchors}) { # Here, we are checking too many things # The right thing to do is to parse the DTD... if ($tag eq 'base') { # Treat (without href) or as if it didn't exist. if (defined($attr->{href}) && $attr->{href} ne '') { $self->{base} = $attr->{href}; } } else { $self->add_link($attr->{href}, $line); } $self->add_link($attr->{src}, $line); $self->add_link($attr->{data}, $line) if ($tag eq 'object'); $self->add_link($attr->{cite}, $line) if ($tag eq 'blockquote'); } $self->parse_progress($line) if $Opts{Progress}; } sub declaration { my ($self, $text, $line) = @_; $line = LINE_UNKNOWN() unless defined($line); # Extract the doctype my @declaration = split(/\s+/, $text, 4); if (($#declaration >= 3) && ($declaration[0] eq 'DOCTYPE') && (lc($declaration[1]) eq 'html')) { # Parse the doctype declaration $text =~ m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i; # Store the doctype $self->doctype($1) if $1; # If there is a link to the DTD, record it $self->{Links}{$3}{$line}++ if (!$self->{only_anchors} && $3); } return unless !$self->{only_anchors}; $self->text($text); } ################################ # Check the validity of a link # ################################ sub check_validity ($$$\%\%) { my ($testing, $uri, $want_links, $links, $redirects) = @_; # $testing is the URI of the document checked # $uri is the URI of the target that we are verifying # $want_links is true if we're interested in links in the target doc # $links is a hash of the links in the documents checked # $redirects is a map of the redirects encountered # Checking file: URI's is not allowed with a CGI # TODO: bug 29 if ($testing ne $uri) { if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) { my $msg = 'Error: \'file:\' URI not allowed'; # Can't test? Return 400 Bad request. $results{$uri}{location}{code} = 400; $results{$uri}{location}{record} = 400; $results{$uri}{location}{success} = 0; $results{$uri}{location}{message} = $msg; &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose}; return; } } # Get the document with the appropriate method # Only use GET if there are fragments. HEAD is enough if it's not the # case. my @fragments = keys %{$links->{$uri}{fragments}}; my $method = scalar(@fragments) ? 'GET' : 'HEAD'; my $response; my $being_processed = 0; if ((! defined($results{$uri})) || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $being_processed = 1; $response = &get_uri($method, $uri); # Get the information back from get_uri() &record_results($uri, $method, $response); # Record the redirects &record_redirects($redirects, $response); } # We got the response of the HTTP request. Stop here if it was a HEAD. return if ($method eq 'HEAD'); # There are fragments. Parse the document. my $p; if ($being_processed) { # Can we really parse the document? return unless defined($results{$uri}{location}{type}); if ($results{$uri}{location}{type} !~ $ContentTypes) { &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n", $uri, $results{$uri}{location}{type}) if ($Opts{Verbose}); return; } # Do it then $p = &parse_document($uri, $response->base(), $response->content(), 0, $want_links); } else { # We already had the information $p->{Anchors} = $results{$uri}{parsing}{Anchors}; } # Check that the fragments exist foreach my $fragment (keys %{$links->{$uri}{fragments}}) { if (defined($p->{Anchors}{$fragment}) || &escape_match($fragment, $p->{Anchors})) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; } } } sub escape_match ($\%) { my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); foreach my $b (keys %$hash) { return 1 if ($a eq URI::Escape::uri_unescape($b)); } return 0; } ########################## # Ask for authentication # ########################## sub authentication ($) { my $r = $_[0]; $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; my $realm = $1; $realm = '' unless defined($realm); if ($Opts{Command_Line}) { printf STDERR <request()->url(), $realm; Authentication is required for %s. The realm is "%s". Use the -u and -p options to specify a username and password and the -d option to specify trusted domains. EOF } else { printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", $r->headers->www_authenticate); printf("%s W3C Link Checker: 401 Authorization Required %s ", $DocType, $Head); &banner(': 401 Authorization Required'); printf("

You need \"%s\" access to %s to perform link checking.
", &encode($realm), (&encode($r->request()->url())) x 2); if ($Opts{Trusted}) { printf <%s EOF } print "

\n"; } } ################## # Get statistics # ################## sub get_timestamp () { return pack('LL', Time::HiRes::gettimeofday()); } sub time_diff ($$) { my @start = unpack('LL', $_[0]); my @stop = unpack('LL', $_[1]); for ($start[1], $stop[1]) { $_ /= 1_000_000; } return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1]))); } ######################## # Handle the redirects # ######################## # Record the redirects in a hash sub record_redirects (\%$) { my ($redirects, $response) = @_; for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { $redirects->{$prev->request()->url()} = $response->request()->url(); } } # Determine if a request is redirected sub is_redirected ($%) { my ($uri, %redirects) = @_; return(defined($redirects{$uri})); } # Get a list of redirects for a URI sub get_redirects ($%) { my ($uri, %redirects) = @_; my @history = ($uri); my %seen = ($uri => 1); # for tracking redirect loops my $loop = 0; while ($redirects{$uri}) { $uri = $redirects{$uri}; push(@history, $uri); if ($seen{$uri}) { $loop = 1; last; } else { $seen{$uri}++; } } return ($loop, @history); } #################################################### # Tool for sorting the unique elements of an array # #################################################### sub sort_unique (@) { my %saw; @saw{@_} = (); return (sort { $a <=> $b } keys %saw); } ##################### # Print the results # ##################### sub line_number ($) { my $line = shift; return $line if ($line >= 0); return "(N/A)"; } sub http_rc ($) { my $rc = shift; return $rc if ($rc >= 0); return "(N/A)"; } sub anchors_summary (\%\%) { my ($anchors, $errors) = @_; # Number of anchors found. my $n = scalar(keys(%$anchors)); if (! $Opts{Quiet}) { if ($Opts{HTML}) { print("

Anchors

\n

"); } else { print("Anchors\n\n"); } &hprintf("Found %d anchor%s.", $n, ($n == 1) ? '' : 's'); print('

') if $Opts{HTML}; print("\n"); } # List of the duplicates, if any. my @errors = keys %{$errors}; if (! scalar(@errors)) { print("

Valid anchors!

\n") if (! $Opts{Quiet} && $Opts{HTML} && $n); return; } undef $n; print('

') if $Opts{HTML}; print('List of duplicate and empty anchors'); print < EOF print("\n"); foreach my $anchor (@errors) { my $format; my @unique = &sort_unique(map { line_number($_) } keys %{$anchors->{$anchor}}); if ($Opts{HTML}) { $format = "\n"; } else { my $s = (scalar(@unique) > 1) ? 's' : ''; $format = "\t%s\tLine$s: %s\n"; } printf($format, &encode($anchor eq '' ? 'Empty anchor' : $anchor), join(', ', @unique)); } print("\n
Anchors Lines
%s%s
\n") if $Opts{HTML}; } sub show_link_report (\%\%\%\%\@;$\%) { my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; print("\n

") if $Opts{HTML}; print("\n"); # Process each URL my ($c, $previous_c); foreach my $u (@$urls) { my @fragments = keys %{$broken->{$u}{fragments}}; # Did we get a redirect? my $redirected = &is_redirected($u, %$redirects); # List of lines my @total_lines; foreach my $l (keys %{$links->{$u}{location}}) { push (@total_lines, $l); } foreach my $f (keys %{$links->{$u}{fragments}}) { next if ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()})); foreach my $l (keys %{$links->{$u}{fragments}{$f}}) { push (@total_lines, $l); } } my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects); my $currloc = $results->{$u}{location}; # Error type $c = &code_shown($u, $results); # What to do my $whattodo; my $redirect_too; if ($todo) { my $currmsg = $currloc->{message} || ''; if ($u =~ m/^javascript:/) { if ($Opts{HTML}) { $whattodo = 'You must change this link: people using a browser without JavaScript support will not be able to follow this link. See the Web Content Accessibility Guidelines on the use of scripting on the Web and the techniques on how to solve this.'; } else { $whattodo = 'Change this link: people using a browser without JavaScript support will not be able to follow this link.'; } } elsif ($c == RC_ROBOTS_TXT()) { $whattodo = 'The link was not checked due to robots exclusion ' . 'rules. Check the link manually.'; } elsif ($redirect_loop) { $whattodo = 'Retrieving the URI results in a redirect loop, that should be ' . 'fixed. Examine the redirect sequence to see where the loop ' . 'occurs.'; } else { $whattodo = $todo->{$c}; } # @@@ 303 and 307 ??? if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) { $redirect_too = 'The original request has been redirected.'; $whattodo .= ' '.$redirect_too unless $Opts{HTML}; } } else { # Directory redirects $whattodo = 'Add a trailing slash to the URL.'; } my @unique = &sort_unique(map { line_number($_) } @total_lines); my $lines_list = join(', ', @unique); my $s = (scalar(@unique) > 1) ? 's' : ''; undef @unique; my @http_codes = ($currloc->{code}); unshift(@http_codes, $currloc->{orig}) if $currloc->{orig}; @http_codes = map { http_rc($_) } @http_codes; if ($Opts{HTML}) { # Style stuff my $idref = ''; if ($codes && (!defined($previous_c) || ($c != $previous_c))) { $idref = ' id="d'.$doc_count.'code_'.$c.'"'; $previous_c = $c; } # Main info for (@redirects_urls) { $_ = &show_url($_); } # HTTP message my $http_message; if ($currloc->{message}) { $http_message = &encode($currloc->{message}); if ($c == 404 || $c == 500) { $http_message = ''. $http_message.''; } } my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; printf(" %s
What to do: %s%s
Response status code: %s
Response message: %s%s%s
Line%s: %s
\n", # Anchor for return codes $idref, # List of redirects $redirected ? join(' redirected to
', @redirects_urls) . $redirmsg : &show_url($u), # Color &bgcolor($c), # What to do $whattodo, # Redirect too? $redirect_too ? sprintf(' %s', &bgcolor(301), $redirect_too) : '', # Response code chain join(' -> ', map { &encode($_) } @http_codes), # Realm defined($currloc->{realm}) ? sprintf('Realm: %s
', &encode($currloc->{realm})) : '', # HTTP original message defined($currloc->{orig_message}) ? &encode($currloc->{orig_message}). ' -> ' : '', # HTTP final message $http_message, $s, # List of lines $lines_list); if ($#fragments >= 0) { my $fragment_direction = ''; if ($currloc->{code} == 200) { $fragment_direction = ' They need to be fixed!'; } printf("
Broken fragments and their line numbers: %s
\n", $fragment_direction); } } else { my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; printf("\n%s\t%s\n Code: %s %s\n%s\n", # List of redirects $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u, # List of lines $lines_list ? "Line$s: $lines_list" : '', # Response code chain join(' -> ', @http_codes), # HTTP message $currloc->{message} || '', # What to do wrap(' To do: ', ' ', $whattodo)); if ($#fragments >= 0) { if ($currloc->{code} == 200) { print("The following fragments need to be fixed:\n"); } else { print("Fragments:\n"); } } } # Fragments foreach my $f (@fragments) { if ($Opts{HTML}) { printf("
%s: %s
\n", # Broken fragment &show_url($u, $f), # List of lines join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}}))); } else { my @unq = &sort_unique(keys %{$links->{$u}{fragments}{$f}}); printf("\t%-30s\tLine%s: %s\n", # Fragment $f, # Multiple? (scalar(@unq) > 1) ? 's' : '', # List of lines join(', ', @unq)); } } print("
\n") if ($Opts{HTML} && scalar(@fragments)); } # End of the table print("
\n") if $Opts{HTML}; } sub code_shown ($$) { my ($u, $results) = @_; if ($results->{$u}{location}{record} == 200) { return $results->{$u}{location}{orig} || $results->{$u}{location}{record}; } else { return $results->{$u}{location}{record}; } } # # Checks whether we're allowed to retrieve the document based on it's IP # address. Takes an URI object and returns a HTTP::Response containing the # appropriate status and error message if the IP was disallowed, undef # otherwise. URIs without hostname or IP address are always allowed, # including schemes where those make no sense (eg. data:, often javascript:). # sub ip_allowed ($) { my ($uri) = @_; return undef if $Opts{Allow_Private_IPs}; # Short-circuit my $hostname = undef; eval { $hostname = $uri->host() }; # Not all URIs implement host()... return undef unless $hostname; my $addr = my $iptype = my $resp = undef; if (my $host = Net::hostent::gethostbyname($hostname)) { $addr = inet_ntoa($host->addr()) if $host->addr(); if ($addr && (my $ip = Net::IP->new($addr))) { $iptype = $ip->iptype(); } } if ($iptype && $iptype ne 'PUBLIC') { $resp = HTTP::Response->new(403, 'Checking non-public IP address disallowed by link checker configuration'); } return $resp; } sub links_summary (\%\%\%\%) { # Advices to fix the problems my %todo = ( 200 => 'There are broken fragments which must be fixed.', 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!', 301 => 'You should update the link.', 302 => 'Usually nothing.', 303 => 'Usually nothing.', 307 => 'Usually nothing.', 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.', 401 => "The link is not public. You'd better specify it.", 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 404 => 'The link is broken. Fix it NOW!', 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.', 406 => "The server isn't capable of responding according to the Accept* headers sent. Check it out.", 407 => 'The link is a proxy, but requires Authentication.', 408 => 'The request timed out.', 410 => 'The resource is gone. You should remove this link.', 415 => 'The media type is not supported.', 500 => 'This is a server side problem. Check the URI.', 501 => 'Could not check this link: method not implemented or scheme not supported.', 503 => 'The server cannot service the request, for some unknown reason.', # Non-HTTP codes: RC_ROBOTS_TXT() => "The link was not checked due to robots exclusion rules. Check the link manually, and see also the link checker documentation on robots exclusion.", RC_DNS_ERROR() => 'The hostname could not be resolved. This link needs to be fixed.', ); my %priority = ( 410 => 1, 404 => 2, 403 => 5, 200 => 10, 300 => 15, 401 => 20 ); my ($links, $results, $broken, $redirects) = @_; # List of the broken links my @urls = keys %{$broken}; my @dir_redirect_urls = (); if ($Opts{Redirects}) { # Add the redirected URI's to the report for my $l (keys %$redirects) { next unless (defined($results->{$l}) && defined($links->{$l}) && !defined($broken->{$l})); # Check whether we have a "directory redirect" # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ my ($redirect_loop, @redirects) = get_redirects($l, %$redirects); if (($#redirects == 1) && (($redirects[0].'/') eq $redirects[1])) { push(@dir_redirect_urls, $l); next; } push(@urls, $l); } } # Broken links and redirects if ($#urls < 0) { if (! $Opts{Quiet}) { if ($Opts{HTML}) { print "

Links

\n

Valid links!

"; } else { print "\nValid links."; } print "\n"; } } else { print('

') if $Opts{HTML}; print("\nList of broken links"); print(' and redirects') if $Opts{Redirects}; # Sort the URI's by HTTP Code my %code_summary; my @idx; foreach my $u (@urls) { if (defined($results->{$u}{location}{record})) { my $c = &code_shown($u, $results); $code_summary{$c}++; push(@idx, $c); } } my @sorted = @urls[ sort { defined($priority{$idx[$a]}) ? defined($priority{$idx[$b]}) ? $priority{$idx[$a]} <=> $priority{$idx[$b]} : -1 : defined($priority{$idx[$b]}) ? 1 : $idx[$a] <=> $idx[$b] } 0 .. $#idx ]; @urls = @sorted; undef(@sorted); undef(@idx); if ($Opts{HTML}) { # Print a summary print <

Fragments listed are broken. See the table below to know what action to take.

EOF foreach my $code (sort(keys(%code_summary))) { printf('', &bgcolor($code)); printf('', $doc_count, $code, http_rc($code)); printf('', $code_summary{$code}); printf('', $todo{$code}); print "\n"; } print "\n
Code Occurrences What to do
%s%s%s
\n"; } else { print(':'); } &show_link_report($links, $results, $broken, $redirects, \@urls, 1, \%todo); } # Show directory redirects if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) { print('

') if $Opts{HTML}; print("\nList of directory redirects"); print("

\n

The links below are not broken, but the document does not use the exact URL.

") if $Opts{HTML}; &show_link_report($links, $results, $broken, $redirects, \@dir_redirect_urls); } } ############################################################################### ################ # Global stats # ################ sub global_stats () { my $stop = &get_timestamp(); my $n_docs = ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents}; return sprintf('Checked %d document%s in %s seconds.', $n_docs, ($n_docs == 1) ? '' : 's', &time_diff($timestamp, $stop)); } ################## # HTML interface # ################## sub html_header ($;$$) { my ($uri, $doform, $cookie) = @_; my $title = defined($uri) ? $uri : ''; $title = ': ' . $title if ($title =~ /\S/); # mod_perl 1.99_05 doesn't seem to like if the "\n\n" isn't in the same # print() statement as the last header... my $headers = ''; if (! $Opts{Command_Line}) { $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform; $headers .= "Content-Type: text/html; charset=utf-8\n"; $headers .= "Set-Cookie: $cookie\n" if $cookie; $headers .= "Content-Language: en\n\n"; } my $script = my $onload = ''; if ($doform) { $script = <<'EOF'; EOF $onload = ' onload="document.forms[0].uri.focus()"'; } print $headers, $DocType, " W3C Link Checker", &encode($title), " ", $Head, $script, " '; &banner($title); } sub banner ($) { my ($title) = @_; printf(<<'EOF', &encode($title), $Cfg{Doc_URI}, $Cfg{Doc_URI});
EOF } sub bgcolor ($) { my ($code) = @_; my $class; my $r = HTTP::Response->new($code); if ($r->is_success()) { return ''; } elsif ($code == RC_ROBOTS_TXT()) { $class = 'dubious'; } elsif ($code == 300) { $class = 'multiple'; } elsif ($code == 401) { $class = 'unauthorized'; } elsif ($r->is_redirect()) { $class = 'redirect'; } elsif ($r->is_error()) { $class = 'broken'; } else { $class = 'broken'; } return(' class="'.$class.'"'); } sub show_url ($;$) { my ($url, $fragment) = @_; if (defined($fragment)) { my $u = URI->new($url); $u->fragment($fragment); $url = $u->as_string(); } $url = &encode($url); return sprintf('%s', $url, defined($fragment) ? &encode($fragment) : $url); } sub html_footer () { printf("

%s

\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet}); printf(<<'EOF', $PACKAGE, $REVISION);
%s
%s
EOF } sub file_uri ($) { my ($uri) = @_; &html_header($uri); printf(<<'EOF', &encode($uri));

Forbidden

You cannot check such a URI (%s).

EOF &html_footer(); exit; } sub print_form ($) { my ($q) = @_; # Override undefined values from the cookie, if we got one. my $got_cookie = 0; if (my %cookie = $q->cookie($PROGRAM)) { $got_cookie = 1; while (my ($key, $value) = each %cookie) { $q->param($key, $value) unless defined($q->param($key)); } } my $chk = ' checked="checked"'; $q->param('hide_type', 'all') unless $q->param('hide_type'); my $sum = $q->param('summary') ? $chk : ''; my $red = $q->param('hide_redirects') ? $chk : ''; my $all = ($q->param('hide_type') ne 'dir') ? $chk : ''; my $dir = $all ? '' : $chk; my $acc = $q->param('no_accept_language') ? $chk : ''; my $rec = $q->param('recursive') ? $chk : ''; my $dep = &encode($q->param('depth') || ''); my $cookie_options = ''; if ($got_cookie) { $cookie_options = " "; } else { $cookie_options = " "; } print "

Options




,

", $cookie_options, "

"; } sub encode (@) { return $Opts{HTML} ? HTML::Entities::encode(@_) : @_; } sub hprintf (@) { if (! $Opts{HTML}) { printf(@_); } else { print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1])); } } # Local Variables: # mode: perl # indent-tabs-mode: nil # tab-width: 2 # perl-indent-level: 2 # End: # ex: ts=2 sw=2 et