# Copyright (c) 2002-2005 the World Wide Web Consortium : # Keio University, # European Research Consortium for Informatics and Mathematics # Massachusetts Institute of Technology. # written by Olivier Thereaux for W3C # # $Id: Basic.pm,v 1.15 2006/06/23 03:53:38 ot Exp $ package W3C::LogValidator::Basic; use strict; use warnings; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw() ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = qw(); our $VERSION = sprintf "%d.%03d",q$Revision: 1.15 $ =~ /(\d+)\.(\d+)/; ########################### # usual package interface # ########################### our $verbose = 1; our %config; sub new { my $self = {}; my $proto = shift; my $class = ref($proto) || $proto; # mandatory vars for the API @{$self->{URIs}} = undef; # don't change this if (@_) {%config = %{(shift)};} if (exists $config{verbose}) {$verbose = $config{verbose}} bless($self, $class); return $self; } sub uris { my $self = shift; if (@_) { @{$self->{URIs}} = @_ } return @{$self->{URIs}}; } sub trim_uris { my $self = shift; my @trimmed_uris; my $exclude_regexp = ""; my @exclude_areas; $exclude_regexp = $config{ExcludeAreas}; if ($exclude_regexp){ $exclude_regexp =~ s/\//\\\//g ; @exclude_areas = split(" ", $exclude_regexp); } else { print "nothing to exclude\n" if ($verbose >2);} my $uri; while ($uri = shift) { my $acceptable = 1; foreach my $area (@exclude_areas) { if ($uri =~ /$area/) { my $slasharea = $area; $slasharea =~ s/\\\//\//g; $slasharea =~ s/\\././g; print "Ignoring $uri matching $slasharea \n" if ($verbose > 2) ; $acceptable = 0; } } push @trimmed_uris,$uri if ($acceptable); } return @trimmed_uris; } ######################################### # Actual subroutine to check the list of uris # ######################################### sub process_list { my $self = shift; my $max_invalid = undef; my $max_documents = undef; if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}} else {$max_documents = 0} # This basic module does not actually "validates" # so MaxInvalid is not relevant... Keeping it anyway if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}} else {$max_invalid = 0} my $name = ""; if (exists $config{ServerName}) {$name = $config{ServerName}} print "Now Using the Basic module... \n" if $verbose; my %hits; my %HTTPcodes; my @uris = undef; use DB_File; if (defined ($config{tmpfile})) { my $tmp_file = $config{tmpfile}; tie (%hits, 'DB_File', "$tmp_file", O_RDONLY) || die ("Cannot create or open $tmp_file"); @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits; } elsif ($self->uris()) { @uris = $self->uris(); foreach my $uri (@uris) { $hits{$uri} = 0 } } @uris = $self->trim_uris(@uris); if (defined ($config{tmpfile_HTTP_codes})) { my $tmp_file_HTTP_codes = $config{tmpfile_HTTP_codes}; tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes", O_RDONLY) || die ("Cannot create or open $tmp_file_HTTP_codes"); } my $intro="Here are the most popular documents overall for $name."; my @result; my @result_head; push @result_head, "Rank"; push @result_head, "Hits"; push @result_head, "Address"; my $census = 0; while ( (@uris) and (($census < $max_documents) or (!$max_documents)) ) { my $uri = shift (@uris); chomp ($uri); my @result_tmp; if (!defined $HTTPcodes{$uri}) #(!defined $HTTPcodes{$uri}) or ($HTTPcodes{$uri} eq "") or ( $HTTPcodes{$uri} =~ /^[2-3]/)) { # This module should ignore requests that resulted in 4XX and 5XX codes $census++; push @result_tmp, "$census"; push @result_tmp, "$hits{$uri}"; push @result_tmp, "$uri"; push @result, [@result_tmp]; } elsif ($HTTPcodes{$uri} eq "200") # should perhaps make a subroutine for that instead of DUPing code { $census++; push @result_tmp, "$census"; push @result_tmp, "$hits{$uri}"; push @result_tmp, "$uri"; push @result, [@result_tmp]; } elsif ((defined $HTTPcodes{$uri}) and ($verbose > 1)) { print "$uri returned code $HTTPcodes{$uri}, ignoring \n"; } } print "Done!\n" if $verbose; if ($census eq 1) # let's repect grammar here { $intro=~ s/are/is/; $intro=~ s/ //; $intro=~ s/document\(s\)/document/; } else { $intro=~ s//$census/; } if (defined ($config{tmpfile})) { untie %hits; } my $outro=""; my %returnhash; $returnhash{"name"}="basic"; $returnhash{"intro"}=$intro; $returnhash{"outro"}=$outro; @{$returnhash{"thead"}}=@result_head; @{$returnhash{"trows"}}=@result; return %returnhash; } package W3C::LogValidator::Basic; 1; __END__ =head1 NAME W3C::LogValidator::Basic - [W3C Log Validator] Sort Web server log entries by popularity (hits) =head1 SYNOPSIS use W3C::LogValidator::Basic; my $b = new W3C::LogValidator::Basic; $b->uris('http://www.w3.org/Overview.html', 'http://www.yahoo.com/index.html'); my $result_string= $b->process_list(); =head1 DESCRIPTION This module is part of the W3C::LogValidator suite, and simply gives back pages sorted by popularity. This is an example of simple module for LogValidator. =head1 API =head2 Constructor =over 2 =item $b = W3C::LogValidator::Basic->new Constructs a new C processor. You might pass it a configuration hash reference (see L and L) Particularly relevant for this module are the "verbose", "MaxDocuments" and obviously "tmpfile" (see C). Pass the configuration hash ref as follows: $b = W3C::LogValidator::HTMLValidator->new(\%config); =back =head2 General Methods =over 4 =item b->uris Returns a list of URIs to be processed (unless the configuration gives the location for the hash of URI/hits berkeley file, see C If an array is given as a parameter, also sets the list of URIs and returns it. Note: while this method is useful in other modules of L, this basic module is here to sort URIs extracted from Log Files by popularity, this method is hence rather useless for L. =item b->trim_uris Given a list of URIs of documents to process, returns a subset of this list containing the URIs of documents the module supposedly can handle. For this module, the decision is made based on the setting for ExcludedAreas only =item b->process_list Formats the list of URIs sorted by popularity. Returns a result hash. Keys for this hash are: name (string): the name of the module, i.e "Basic" intro (string): introduction to the processing results thead (array): headers of the results table trows (array of arrays): rows of the results table outro (string): conclusion of the processing results =back =head1 BUGS Public bug-tracking interface at http://www.w3.org/Bugs/Public/ =head1 AUTHOR Olivier Thereaux for W3C =head1 SEE ALSO W3C::LogValidator, perl(1). Up-to-date complete info at http://www.w3.org/QA/Tools/LogValidator/ =cut