package Lire::W3CExtendedLog; use strict; use base qw/ Exporter /; use vars qw/ @EXPORT_OK %DEFAULTS_TYPE2REGEX %DEFAULTS_IDENTIFIER2TYPE /; use Exporter; use Time::Local; use Lire::Logger; BEGIN { @EXPORT_OK = qw/ w3c_time uri_decode string_decode /; # Maps W3C types to regex %DEFAULTS_TYPE2REGEX = ( 'integer' => '(\d+|-)', 'fixed' => '(\d+(?:\.\d+)?|-)', 'uri' => '(\S+)', 'date' => '(\d\d\d\d-\d\d-\d\d)', 'time' => '(\d\d:\d\d(?::\d\d(?:\.\d+)?)?)', # Match anything beween a starting " and another followed by a space # (Embedded " are doubled) -> "String with a "" in it" 'string' => '"((?:[^"]|"")*)"', 'name' => '([-_.0-9a-zA-Z]+)', # This also match invalid domain names 'address' => '(\d+\.\d+\.\d+\.\d+|-)', ); # Maps identifier to type %DEFAULTS_IDENTIFIER2TYPE = ( 'count' => 'integer', 'time-from' => 'fixed', 'time-to' => 'fixed', 'interval' => 'integer', 'ip' => 'address', 'dns' => 'name', 'status' => 'integer', 'comment' => 'string', 'uri' => 'uri', 'uri-stem' => 'uri', 'uri-query' => 'uri', 'method' => 'name', 'username' => 'uri', 'date' => 'date', 'time' => 'time', 'port' => 'integer', 'time-taken' => 'fixed', 'bytes' => 'integer', 'cached' => 'integer', ); } my $debug = 0; sub debug { $debug and lr_debug($_[0]); } ######################################################################## # UTILITY FUNCTIONS ######################################################################## sub uri_decode { $_[0] =~ s/\%(..)/hex( '0x' . $1)/ge; $_[0] =~ tr/+/ /; } sub string_decode { # Transform "" into " $_[0] =~ tr/"/"/s; } sub w3c_time { my ( $date, $time ) = @_; my ($year, $month, $mday ) = $date =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/ or die "invalid date string: $date\n"; my ($hours, $min, $sec, $msec ) = $time =~ /^(\d\d):(\d\d)(?::(\d\d)(?:\.(\d+))?)?$/ or die "invalid time string: $time\n"; $month--; $year -= 1900; # Yes, W3C Extended Log time are in UTC time. return timegm( $sec || 0, $min, $hours, $mday, $month, $year ); } ######################################################################## # PARSER METHODS ######################################################################## sub new { my $self = shift; my $class = ref($self) || $self; bless $self = { # Copy in attributes so that they can be overidden 'type2regex' => { %DEFAULTS_TYPE2REGEX }, 'identifier2type' => { %DEFAULTS_IDENTIFIER2TYPE }, 'log_date' => undef, 'log_time' => undef, 'version' => undef, 'sofware' => undef, 'fields' => undef, 'is_iis' => undef, 'in_header' => 1, 'skip_to_next_header' => 0, }, $class; return $self; } sub field2re { my ( $self, $field ) = @_; if ( $field =~ /^\w+\(.*\)$/ ) { # Header if ( $self->{'is_iis'} ) { return $self->{'type2regex'}{'uri'}; } else { return $self->{'type2regex'}{'string'}; } } else { # csrx are prefixes used in field identifiers, cf W3C specs my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/; my $type = $self->{'identifier2type'}{$identifier}; unless ( defined $type ) { lr_warn( "identifier '$identifier' (constructed from '$field') " . "not found in identifier2type property. Will use uri type" ); $debug and do { debug( "dumping identifier2type hash" ); while ((my $k, my $v) = each %{ $self->{'identifier2type'} }) { if (defined $v) { debug( "identifier2type{'$k'} = '$v'" ); } else { debug( "identifier2type{'$k'} undefined" ); } } }; return $self->{'type2regex'}{'uri'}; } else { if ( $type eq 'string' && $self->{'is_iis'} ) { return $self->{'type2regex'}{'uri'}; } elsif (defined $self->{'type2regex'}{$type}) { return $self->{'type2regex'}{$type}; } else { lr_warn( "unknown type: $type. Will return uri type" ); return $self->{'type2regex'}{'uri'}; } } } } sub field2decoder { my ( $self, $field ) = @_; if ( $field =~ /^\w+\(.*\)$/ ) { # Header if ( $self->{'is_iis'} ) { return \&uri_decode; } else { return \&string_decode; } } else { my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/; my $type = $self->{'identifier2type'}{$identifier}; return undef unless defined $type; if ( $type eq 'string' && $self->{'is_iis'} ) { return \&uri_decode; } elsif ( $type eq 'string' ) { return \&string_decode; } return undef; } } sub build_parser { my ( $self ) = @_; my @fields = split /\s+/, $self->{'fields'}; # Quick check die( "we don't support aggregated records: $self->{'fields'}\n" ) if $fields[0] eq 'count'; my @re = (); my %decoders = (); foreach my $f ( @fields ) { if (defined $self->field2re( $f )) { push @re, $self->field2re( $f ); } else { die( "internal inconsistency: field2re undefined in '$f'. " . "aborting.\n") } my $decoder = $self->field2decoder( $f ); $decoders{$f} = $decoder if $decoder; } my $sep = $self->{'tab_sep'} ? '\t' : '\s+'; my $re = "^" . join ( $sep, @re) . "\$"; lr_info( "will use $re as lexer" ); lr_info( "to parse fields: ", join ", ", @fields ); my $rx = qr/$re/; $self->{'w3c_parser'} = sub { # Remove potential extra CR of DOS line ending $_[0] =~ s/\r?\n?$//; my @rec = $_[0] =~ /$rx/ or die "lexer failed\n"; my $i = 0; my %w3c = map { $fields[$i++] => $_ } @rec; # Decode fields foreach my $name ( keys %decoders ) { $decoders{$name}->( $w3c{$name} ); } # Decode timestamp my $date = $w3c{'date'} || $self->{'log_date'}; if ( defined $date && exists $w3c{'time'} ) { $w3c{'lire_time'} = w3c_time( $date, $w3c{'time'} ); } return \%w3c; } } sub parse_directive { my ( $self, $line ) = @_; $line =~ s/\r?\n?$//; # To remove potential extra CR of DOS line ending die "not a directive line: $line\n" unless $line =~ /^#/; my ( $directive, $text ) = $line =~ /^#(\w+): (.*)/ or die( "error parsing directive: $line\n" ); SWITCH: for ( $directive ) { /^Version$/i && do { $self->{'version'} = $text + 0; die( "Unsupported format version: $text != 1.0\n" ) unless $self->{'version'} == 1; lr_info( "W3C Extended Log Format ", $text ); last SWITCH; }; /^Fields$/i && do { if ( $self->{'fields'} ) { if ( $self->{'fields'} ne $text ) { $self->{'skip_to_next_header'} = 1; lr_warn( "we don't support switching Fields directive: $text.Ignoring those log records\n"); } last SWITCH; } $self->{'fields'} = $text; # this is reported to occur in MS ISA logs (the one IIS logfiles # we've seen use space as separator in both Fields and the log # itself. Unfortunately, this logfile does _not_ escape spaces # in its values :( ) $self->{'tab_sep'} = $text =~ /\t/; lr_info( "found tabcharacter in Fields declaration; therefore " . "using tab as field separator and allowing non-escaped " . "spaces" ) if $self->{'tab_sep'}; $self->build_parser(); last SWITCH; }; /^Software$/i && do { lr_info( "Log generated by $text" ); $self->{'is_iis'} = $text =~ /Microsoft Internet/; lr_info( "activating IIS specific support" ) if $self->{'is_iis'}; last SWITCH; }; /^Date$/ && do { my $date_re = $self->{'type2regex'}{'date'}; my $time_re = $self->{'type2regex'}{'time'}; ($self->{'log_date'}, $self->{'log_time'} ) = $text =~ /$date_re $time_re/o; if ( defined $self->{'log_date'} && defined $self->{'log_time'} ) { lr_info( "Date: ", $text ); } else { lr_warn( "Invalid date directive: $text" ); } last SWITCH; }; /^(Start-Date|End-Date|Date|Remark)$/ && do { # Ignore those directive lr_info( $directive, ": ", $text ); last SWITCH; }; /^SubComponent$/ && do { lr_info( "ignoring log of SubComponent $text" ); $self->{'skip_to_next_header'} = 1; last SWITCH; }; # Defaults lr_warn( "unknown directive: $line" ); } } sub parse { my ($self, $line) = @_; if ( $line =~ /^#/ ) { if ( $self->{'skip_to_next_header'} && ! $self->{'in_header'} ) { $self->{'skip_to_next_header'} = 0; } $self->{'in_header'} = 1; $self->parse_directive( $line ); } elsif ( $self->{'version'} && $self->{'fields'} ) { $self->{'in_header'} = 0; return undef if $self->{'skip_to_next_header'}; $self->parse_record( $line ); } else { lr_err( "invalid W3C extended log file: must start by Version and Fields directives" ); } } sub parse_record { $_[0]->{'w3c_parser'}->( $_[1] ); } 1; __END__ =pod =head1 NAME Lire::W3CExtendedLog - Base implementation of a W3C Extended Log parser =head1 SYNOPSIS use Lire::W3CExtendedLog; my $parser = new Lire::W3CExtendedLog; my $w3c_rec = $parser->parse( $line ); =head1 DESCRIPTION This module defines objects able to parse W3C Extended Log Format. This log format is defined at http://www.w3.org/TR/WD-logfile.html All attributes of the created object can be overriden by e.g. modules extending the object. The attributes are: =head2 type2regex I is a hash containing key-value pairs like 'name' => '([-_.0-9a-zA-Z]+)' Keys are all data formats for log file field entries as defined in the W3C specification: 'integer', 'fixed', 'uri', 'date', 'time' and 'string', along with 'name' and 'address' types. =head2 identifier2type I is a hash containing key-value pairs like 'dns' => 'name', 'uri-query' => 'uri', 'ip' => Keys are the W3C defined Field identifiers, with their prefixes stripped off. =head2 field2re I is subroutine; when called as $self->{field2re('c-ip')} it will return e.g. '(\d+\.\d+\.\d+\.\d+|-)' Arguments are as found in the Fields directive, so, in an ideal world, should be identifiers. It uses I. =head2 field2decoder I is a subroutine; it returns one of I<\&uri_decode> , I<\&string_decode> or I, depending on, a.o., I. It is used by I. =head2 parse I is the preferred interface to this module. It expects a line as its argument, and returns a reference to a hash (like I<&w3c_parser>), or executes I<&parse_directive>. =head2 parse_directive I expects a directive in its argument, it fills the object. =head2 w3c_parser I is a subroutine; it expects a logline as argument, and returns a reference to a hash, mapping $self->{'fields'} entries to their decoded values. It uses the I<&field2re> and I<&field2decoder> routines. It is build in build_parser. =head2 build_parser I is a subroutine, it builds and returns I<&w3c_parser>. It is called in I<&parse_directive>. =head2 log_date and log_time I and I contain strings constructed from the Date directive. =head2 version and sofware I and I contain strings constructed from the Version and Software directives, respectively. =head2 fields I contains the entire string from the Fields directive. =head2 is_iis I is set in case the Software directive contains 'Microsoft Internet' as a substring. It is used to enable IIS specific support. =head2 tab_sep I is set in case tabs are found in the Fields directive. We assume these will be used in the log itself too, and allow unescaped spaces in the log. Summarizing: &parse --calls--> &parse_directive `--calls--> &w3c_parser &parse_directive --calls--> &build_parser &build_parser --calls--> &field2decoder `--calls--> &field2re `--returns--> &w3c_parser &field2decoder --returns--> &uri_decode, &string_decode &field2re --uses--> %type2regex `--uses--> %identifier2type =head1 BUILDING INHERITING MODULES FIXME . Needs to be written. Steal from w3c_extended2dlf's Lire::WWW::ExtendedLog, which ISA Lire::W3CExtendedLog. =head1 SEE ALSO w3c_extended2dlf(1), ms_isa2dlf(1) =head1 AUTHOR Francis J. Lacoste =head1 VERSION $Id: W3CExtendedLog.pm,v 1.18 2006/07/23 13:16:30 vanbaal Exp $ =head1 COPYRIGHT Copyright (C) 2001-2002 Stichting LogReport Foundation LogReport@LogReport.org This file is part of Lire. Lire is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program (see COPYING); if not, check with http://www.gnu.org/copyleft/gpl.html. =cut