package Lire::W3CExtendedLog;
use strict;
use base qw/ Exporter /;
use vars qw/ @EXPORT_OK %DEFAULTS_TYPE2REGEX %DEFAULTS_IDENTIFIER2TYPE /;
use Exporter;
use Time::Local;
use Lire::Logger;
BEGIN {
@EXPORT_OK = qw/ w3c_time uri_decode string_decode /;
# Maps W3C types to regex
%DEFAULTS_TYPE2REGEX =
(
'integer' => '(\d+|-)',
'fixed' => '(\d+(?:\.\d+)?|-)',
'uri' => '(\S+)',
'date' => '(\d\d\d\d-\d\d-\d\d)',
'time' => '(\d\d:\d\d(?::\d\d(?:\.\d+)?)?)',
# Match anything beween a starting " and another followed by a space
# (Embedded " are doubled) -> "String with a "" in it"
'string' => '"((?:[^"]|"")*)"',
'name' => '([-_.0-9a-zA-Z]+)', # This also match invalid domain names
'address' => '(\d+\.\d+\.\d+\.\d+|-)',
);
# Maps identifier to type
%DEFAULTS_IDENTIFIER2TYPE =
(
'count' => 'integer',
'time-from' => 'fixed',
'time-to' => 'fixed',
'interval' => 'integer',
'ip' => 'address',
'dns' => 'name',
'status' => 'integer',
'comment' => 'string',
'uri' => 'uri',
'uri-stem' => 'uri',
'uri-query' => 'uri',
'method' => 'name',
'username' => 'uri',
'date' => 'date',
'time' => 'time',
'port' => 'integer',
'time-taken' => 'fixed',
'bytes' => 'integer',
'cached' => 'integer',
);
}
my $debug = 0;
sub debug {
$debug and lr_debug($_[0]);
}
########################################################################
# UTILITY FUNCTIONS
########################################################################
sub uri_decode {
$_[0] =~ s/\%(..)/hex( '0x' . $1)/ge;
$_[0] =~ tr/+/ /;
}
sub string_decode {
# Transform "" into "
$_[0] =~ tr/"/"/s;
}
sub w3c_time {
my ( $date, $time ) = @_;
my ($year, $month, $mday ) = $date =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/
or die "invalid date string: $date\n";
my ($hours, $min, $sec, $msec ) =
$time =~ /^(\d\d):(\d\d)(?::(\d\d)(?:\.(\d+))?)?$/
or die "invalid time string: $time\n";
$month--;
$year -= 1900;
# Yes, W3C Extended Log time are in UTC time.
return timegm( $sec || 0, $min, $hours, $mday, $month, $year );
}
########################################################################
# PARSER METHODS
########################################################################
sub new {
my $self = shift;
my $class = ref($self) || $self;
bless $self =
{
# Copy in attributes so that they can be overidden
'type2regex' => { %DEFAULTS_TYPE2REGEX },
'identifier2type' => { %DEFAULTS_IDENTIFIER2TYPE },
'log_date' => undef,
'log_time' => undef,
'version' => undef,
'sofware' => undef,
'fields' => undef,
'is_iis' => undef,
'in_header' => 1,
'skip_to_next_header' => 0,
}, $class;
return $self;
}
sub field2re {
my ( $self, $field ) = @_;
if ( $field =~ /^\w+\(.*\)$/ ) {
# Header
if ( $self->{'is_iis'} ) {
return $self->{'type2regex'}{'uri'};
} else {
return $self->{'type2regex'}{'string'};
}
} else {
# csrx are prefixes used in field identifiers, cf W3C specs
my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/;
my $type = $self->{'identifier2type'}{$identifier};
unless ( defined $type ) {
lr_warn( "identifier '$identifier' (constructed from '$field') " .
"not found in identifier2type property. Will use uri type" );
$debug and do {
debug( "dumping identifier2type hash" );
while ((my $k, my $v) = each %{ $self->{'identifier2type'} }) {
if (defined $v) {
debug( "identifier2type{'$k'} = '$v'" );
} else {
debug( "identifier2type{'$k'} undefined" );
}
}
};
return $self->{'type2regex'}{'uri'};
} else {
if ( $type eq 'string' && $self->{'is_iis'} ) {
return $self->{'type2regex'}{'uri'};
} elsif (defined $self->{'type2regex'}{$type}) {
return $self->{'type2regex'}{$type};
} else {
lr_warn( "unknown type: $type. Will return uri type" );
return $self->{'type2regex'}{'uri'};
}
}
}
}
sub field2decoder {
my ( $self, $field ) = @_;
if ( $field =~ /^\w+\(.*\)$/ ) {
# Header
if ( $self->{'is_iis'} ) {
return \&uri_decode;
} else {
return \&string_decode;
}
} else {
my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/;
my $type = $self->{'identifier2type'}{$identifier};
return undef unless defined $type;
if ( $type eq 'string' && $self->{'is_iis'} ) {
return \&uri_decode;
} elsif ( $type eq 'string' ) {
return \&string_decode;
}
return undef;
}
}
sub build_parser {
my ( $self ) = @_;
my @fields = split /\s+/, $self->{'fields'};
# Quick check
die( "we don't support aggregated records: $self->{'fields'}\n" )
if $fields[0] eq 'count';
my @re = ();
my %decoders = ();
foreach my $f ( @fields ) {
if (defined $self->field2re( $f )) {
push @re, $self->field2re( $f );
} else {
die( "internal inconsistency: field2re undefined in '$f'. " .
"aborting.\n")
}
my $decoder = $self->field2decoder( $f );
$decoders{$f} = $decoder if $decoder;
}
my $sep = $self->{'tab_sep'} ? '\t' : '\s+';
my $re = "^" . join ( $sep, @re) . "\$";
lr_info( "will use $re as lexer" );
lr_info( "to parse fields: ", join ", ", @fields );
my $rx = qr/$re/;
$self->{'w3c_parser'} = sub {
# Remove potential extra CR of DOS line ending
$_[0] =~ s/\r?\n?$//;
my @rec = $_[0] =~ /$rx/
or die "lexer failed\n";
my $i = 0;
my %w3c = map { $fields[$i++] => $_ } @rec;
# Decode fields
foreach my $name ( keys %decoders ) {
$decoders{$name}->( $w3c{$name} );
}
# Decode timestamp
my $date = $w3c{'date'} || $self->{'log_date'};
if ( defined $date && exists $w3c{'time'} ) {
$w3c{'lire_time'} = w3c_time( $date, $w3c{'time'} );
}
return \%w3c;
}
}
sub parse_directive {
my ( $self, $line ) = @_;
$line =~ s/\r?\n?$//; # To remove potential extra CR of DOS line ending
die "not a directive line: $line\n"
unless $line =~ /^#/;
my ( $directive, $text ) = $line =~ /^#(\w+): (.*)/
or die( "error parsing directive: $line\n" );
SWITCH:
for ( $directive ) {
/^Version$/i && do {
$self->{'version'} = $text + 0;
die( "Unsupported format version: $text != 1.0\n" )
unless $self->{'version'} == 1;
lr_info( "W3C Extended Log Format ", $text );
last SWITCH;
};
/^Fields$/i && do {
if ( $self->{'fields'} ) {
if ( $self->{'fields'} ne $text ) {
$self->{'skip_to_next_header'} = 1;
lr_warn( "we don't support switching Fields directive: $text.Ignoring those log records\n");
}
last SWITCH;
}
$self->{'fields'} = $text;
# this is reported to occur in MS ISA logs (the one IIS logfiles
# we've seen use space as separator in both Fields and the log
# itself. Unfortunately, this logfile does _not_ escape spaces
# in its values :( )
$self->{'tab_sep'} = $text =~ /\t/;
lr_info( "found tabcharacter in Fields declaration; therefore " .
"using tab as field separator and allowing non-escaped " .
"spaces" ) if $self->{'tab_sep'};
$self->build_parser();
last SWITCH;
};
/^Software$/i && do {
lr_info( "Log generated by $text" );
$self->{'is_iis'} = $text =~ /Microsoft Internet/;
lr_info( "activating IIS specific support" )
if $self->{'is_iis'};
last SWITCH;
};
/^Date$/ && do {
my $date_re = $self->{'type2regex'}{'date'};
my $time_re = $self->{'type2regex'}{'time'};
($self->{'log_date'}, $self->{'log_time'} ) =
$text =~ /$date_re $time_re/o;
if ( defined $self->{'log_date'} && defined $self->{'log_time'} ) {
lr_info( "Date: ", $text );
} else {
lr_warn( "Invalid date directive: $text" );
}
last SWITCH;
};
/^(Start-Date|End-Date|Date|Remark)$/ && do {
# Ignore those directive
lr_info( $directive, ": ", $text );
last SWITCH;
};
/^SubComponent$/ && do {
lr_info( "ignoring log of SubComponent $text" );
$self->{'skip_to_next_header'} = 1;
last SWITCH;
};
# Defaults
lr_warn( "unknown directive: $line" );
}
}
sub parse {
my ($self, $line) = @_;
if ( $line =~ /^#/ ) {
if ( $self->{'skip_to_next_header'} && ! $self->{'in_header'} ) {
$self->{'skip_to_next_header'} = 0;
}
$self->{'in_header'} = 1;
$self->parse_directive( $line );
} elsif ( $self->{'version'} && $self->{'fields'} ) {
$self->{'in_header'} = 0;
return undef
if $self->{'skip_to_next_header'};
$self->parse_record( $line );
} else {
lr_err( "invalid W3C extended log file: must start by Version and Fields directives" );
}
}
sub parse_record {
$_[0]->{'w3c_parser'}->( $_[1] );
}
1;
__END__
=pod
=head1 NAME
Lire::W3CExtendedLog - Base implementation of a W3C Extended Log parser
=head1 SYNOPSIS
use Lire::W3CExtendedLog;
my $parser = new Lire::W3CExtendedLog;
my $w3c_rec = $parser->parse( $line );
=head1 DESCRIPTION
This module defines objects able to parse W3C Extended Log Format.
This log format is defined at http://www.w3.org/TR/WD-logfile.html
All attributes of the created object can be overriden by e.g. modules extending
the object. The attributes are:
=head2 type2regex
I<type2regex> is a hash containing key-value pairs like
'name' => '([-_.0-9a-zA-Z]+)'
Keys are all data formats for log file field entries as defined in the W3C
specification: 'integer', 'fixed', 'uri', 'date', 'time' and 'string', along
with 'name' and 'address' types.
=head2 identifier2type
I<identifier2type> is a hash containing key-value pairs like
'dns' => 'name',
'uri-query' => 'uri',
'ip' =>
Keys are the W3C defined Field identifiers, with their prefixes stripped off.
=head2 field2re
I<field2re> is subroutine; when called as
$self->{field2re('c-ip')}
it will return e.g.
'(\d+\.\d+\.\d+\.\d+|-)'
Arguments are as found in the Fields directive, so, in an ideal world, should
be identifiers. It uses I<type2regex>.
=head2 field2decoder
I<field2decoder> is a subroutine; it returns one of I<\&uri_decode> ,
I<\&string_decode> or I<undef>, depending on, a.o., I<is_iis>. It is
used by I<build_parser>.
=head2 parse
I<parse> is the preferred interface to this module. It expects a line as its
argument, and returns a reference to a hash (like I<&w3c_parser>), or executes
I<&parse_directive>.
=head2 parse_directive
I<parse_directive> expects a directive in its argument, it fills the object.
=head2 w3c_parser
I<w3c_parser> is a subroutine; it expects a logline as argument, and returns a
reference to a hash, mapping $self->{'fields'} entries to their decoded values.
It uses the I<&field2re> and I<&field2decoder> routines. It is build in
build_parser.
=head2 build_parser
I<build_parser> is a subroutine, it builds and returns I<&w3c_parser>.
It is called in I<&parse_directive>.
=head2 log_date and log_time
I<log_date> and I<log_time> contain strings constructed from the Date
directive.
=head2 version and sofware
I<version> and I<software> contain strings constructed from the Version and
Software directives, respectively.
=head2 fields
I<fields> contains the entire string from the Fields directive.
=head2 is_iis
I<is_iis> is set in case the Software directive contains 'Microsoft Internet'
as a substring. It is used to enable IIS specific support.
=head2 tab_sep
I<tab_sep> is set in case tabs are found in the Fields directive. We assume
these will be used in the log itself too, and allow unescaped spaces in the
log.
Summarizing:
&parse --calls--> &parse_directive
`--calls--> &w3c_parser
&parse_directive --calls--> &build_parser
&build_parser --calls--> &field2decoder
`--calls--> &field2re
`--returns--> &w3c_parser
&field2decoder --returns--> &uri_decode, &string_decode
&field2re --uses--> %type2regex
`--uses--> %identifier2type
=head1 BUILDING INHERITING MODULES
FIXME . Needs to be written. Steal from w3c_extended2dlf's
Lire::WWW::ExtendedLog, which ISA Lire::W3CExtendedLog.
=head1 SEE ALSO
w3c_extended2dlf(1), ms_isa2dlf(1)
=head1 AUTHOR
Francis J. Lacoste <flacoste@logreport.org>
=head1 VERSION
$Id: W3CExtendedLog.pm,v 1.18 2006/07/23 13:16:30 vanbaal Exp $
=head1 COPYRIGHT
Copyright (C) 2001-2002 Stichting LogReport Foundation LogReport@LogReport.org
This file is part of Lire.
Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.
=cut
syntax highlighted by Code2HTML, v. 0.9.1