package Lire::W3CExtendedLog;

use strict;

use base qw/ Exporter /;
use vars qw/ @EXPORT_OK %DEFAULTS_TYPE2REGEX %DEFAULTS_IDENTIFIER2TYPE /;

use Exporter;
use Time::Local;
use Lire::Logger;

BEGIN {
    @EXPORT_OK = qw/ w3c_time uri_decode string_decode /;

    # Maps W3C types to regex
    %DEFAULTS_TYPE2REGEX =
      (
       'integer'	=> '(\d+|-)',
       'fixed'	=> '(\d+(?:\.\d+)?|-)',
       'uri'	=> '(\S+)',
       'date'	=> '(\d\d\d\d-\d\d-\d\d)',
       'time'	=> '(\d\d:\d\d(?::\d\d(?:\.\d+)?)?)',

       # Match anything beween a starting " and another followed by a space
       # (Embedded " are doubled) -> "String with a "" in it"
       'string'	=> '"((?:[^"]|"")*)"',
       'name'	=> '([-_.0-9a-zA-Z]+)',  # This also match invalid domain names
       'address'	=> '(\d+\.\d+\.\d+\.\d+|-)',
      );

    # Maps identifier to type
    %DEFAULTS_IDENTIFIER2TYPE =
      (
       'count'      => 'integer',
       'time-from'  => 'fixed',
       'time-to'    => 'fixed',
       'interval'   => 'integer',
       'ip'	    => 'address',
       'dns'	    => 'name',
       'status'     => 'integer',
       'comment'    => 'string',
       'uri'	    => 'uri',
       'uri-stem'   => 'uri',
       'uri-query'  => 'uri',
       'method'     => 'name',
       'username'   => 'uri',
       'date'	    => 'date',
       'time'	    => 'time',
       'port'	    => 'integer',
       'time-taken' => 'fixed',
       'bytes'      => 'integer',
       'cached'     => 'integer',
      );
}

my $debug = 0;
sub debug {
    $debug and lr_debug($_[0]);
}

########################################################################
#			  UTILITY FUNCTIONS
########################################################################

sub uri_decode {
    $_[0] =~ s/\%(..)/hex( '0x' . $1)/ge;
    $_[0] =~ tr/+/ /;
}

sub string_decode {
    # Transform "" into "
    $_[0] =~ tr/"/"/s;
}

sub w3c_time {
    my ( $date, $time ) = @_;

    my ($year, $month, $mday ) = $date =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/
      or die "invalid date string: $date\n";

    my ($hours, $min, $sec, $msec ) =
      $time =~ /^(\d\d):(\d\d)(?::(\d\d)(?:\.(\d+))?)?$/
      or die "invalid time string: $time\n";

    $month--;
    $year -= 1900;

    # Yes, W3C Extended Log time are in UTC time.
    return timegm( $sec || 0, $min, $hours, $mday, $month, $year );
}

########################################################################
#			    PARSER METHODS
########################################################################

sub new {
    my $self = shift;
    my $class = ref($self) || $self;
    bless $self = 
      {
       # Copy in attributes so that they can be overidden
       'type2regex'	=> { %DEFAULTS_TYPE2REGEX },
       'identifier2type'	=> { %DEFAULTS_IDENTIFIER2TYPE },
       'log_date'		=> undef,
       'log_time'		=> undef,
       'version'		=> undef,
       'sofware'		=> undef,
       'fields'		=> undef,
       'is_iis'		=> undef,
       'in_header'	=> 1,
       'skip_to_next_header' => 0,
      }, $class;

    return $self;
}

sub field2re {
    my ( $self, $field ) = @_;

    if ( $field =~ /^\w+\(.*\)$/ ) {
	# Header
	if ( $self->{'is_iis'} ) {
	    return $self->{'type2regex'}{'uri'};
	} else {
	    return $self->{'type2regex'}{'string'};
	}
    } else {
        # csrx are prefixes used in field identifiers, cf W3C specs
	my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/;

	my $type = $self->{'identifier2type'}{$identifier};
	unless ( defined $type ) {
	    lr_warn( "identifier '$identifier' (constructed from '$field') " .
              "not found in identifier2type property. Will use uri type" );
            $debug and do {
                debug( "dumping identifier2type hash" );
                while ((my $k, my $v) = each %{ $self->{'identifier2type'} }) {
                    if (defined $v) {
                        debug( "identifier2type{'$k'} = '$v'" );
                    } else {
                        debug( "identifier2type{'$k'} undefined" );
                    }
                }
            };
	    return $self->{'type2regex'}{'uri'};
	} else {
	    if ( $type eq 'string' && $self->{'is_iis'} ) {
		return $self->{'type2regex'}{'uri'};
	    } elsif (defined $self->{'type2regex'}{$type}) {
                return $self->{'type2regex'}{$type};
            } else {
                lr_warn( "unknown type: $type.  Will return uri type" );
                return $self->{'type2regex'}{'uri'};
	    }
	}
    }
}

sub field2decoder {
    my ( $self, $field ) = @_;

    if ( $field =~ /^\w+\(.*\)$/ ) {
	# Header
	if ( $self->{'is_iis'} ) {
	    return \&uri_decode;
	} else {
	    return \&string_decode;
	}
    } else {
	my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/;

	my $type = $self->{'identifier2type'}{$identifier};
	return undef unless defined $type;

	if ( $type eq 'string' && $self->{'is_iis'} ) {
	    return \&uri_decode;
	} elsif ( $type eq 'string' ) {
	    return \&string_decode;
	}
	return undef;
    }
}

sub build_parser {
    my ( $self ) = @_;
    my @fields = split /\s+/, $self->{'fields'};

    # Quick check
    die( "we don't support aggregated records: $self->{'fields'}\n" )
      if $fields[0] eq 'count';

    my @re = ();

    my %decoders = ();
    foreach my $f ( @fields ) {
	if (defined $self->field2re( $f )) {
            push @re, $self->field2re( $f );
        } else {
            die( "internal inconsistency: field2re undefined in '$f'. " .
                "aborting.\n")
        }
	my $decoder = $self->field2decoder( $f );
	$decoders{$f} = $decoder if $decoder;
    }
    my $sep = $self->{'tab_sep'} ? '\t' : '\s+';
    my $re = "^" . join ( $sep, @re) . "\$";
    lr_info( "will use $re as lexer" );
    lr_info( "to parse fields: ", join ", ", @fields );
    my $rx = qr/$re/;

    $self->{'w3c_parser'} = sub {
	# Remove potential extra CR of DOS line ending
	$_[0] =~ s/\r?\n?$//; 

	my @rec = $_[0] =~ /$rx/
	  or die "lexer failed\n";

	my $i = 0;
	my %w3c = map { $fields[$i++] => $_ } @rec;

	# Decode fields
	foreach my $name ( keys %decoders ) {
	    $decoders{$name}->( $w3c{$name} );
	}

	# Decode timestamp
	my $date = $w3c{'date'} || $self->{'log_date'};
	if ( defined $date  && exists $w3c{'time'} ) {
	    $w3c{'lire_time'} = w3c_time( $date, $w3c{'time'} );
	}

	return \%w3c;
    }
}

sub parse_directive {
    my ( $self, $line ) = @_;
    $line =~ s/\r?\n?$//; # To remove potential extra CR of DOS line ending

    die "not a directive line: $line\n"
      unless $line =~ /^#/;

    my ( $directive, $text ) = $line =~ /^#(\w+): (.*)/
      or die( "error parsing directive: $line\n" );

  SWITCH:
    for ( $directive ) {
	/^Version$/i && do {
	    $self->{'version'} = $text + 0;
	    die( "Unsupported format version: $text != 1.0\n" )
	      unless $self->{'version'} == 1;

	    lr_info( "W3C Extended Log Format ", $text );
	    last SWITCH;
	};
	/^Fields$/i && do {
	    if ( $self->{'fields'} ) {
		if ( $self->{'fields'} ne $text ) {
		    $self->{'skip_to_next_header'} = 1;
		    lr_warn( "we don't support switching Fields directive: $text.Ignoring those log records\n");
		}

		last SWITCH;
	    }
	    $self->{'fields'} = $text;
            # this is reported to occur in MS ISA logs (the one IIS logfiles
            # we've seen use space as separator in both Fields and the log
            # itself.  Unfortunately, this logfile does _not_ escape spaces
            # in its values :(  )
	    $self->{'tab_sep'} = $text =~ /\t/;
	    lr_info( "found tabcharacter in Fields declaration; therefore " .
              "using tab as field separator and allowing non-escaped " .
              "spaces" ) if $self->{'tab_sep'};
	    $self->build_parser();
	    last SWITCH;
	};
	/^Software$/i && do {
	    lr_info( "Log generated by $text" );
	    $self->{'is_iis'} = $text =~ /Microsoft Internet/;
	    lr_info( "activating IIS specific support" )
	      if $self->{'is_iis'};
	    last SWITCH;
	};
	/^Date$/ && do {
	    my $date_re = $self->{'type2regex'}{'date'};
	    my $time_re = $self->{'type2regex'}{'time'};
	    ($self->{'log_date'}, $self->{'log_time'} ) =
	      $text =~ /$date_re $time_re/o;
	    if ( defined $self->{'log_date'} && defined $self->{'log_time'} ) {
		lr_info( "Date: ", $text );
	    } else {
		lr_warn( "Invalid date directive: $text" );
	    }
	    last SWITCH;
	};
	/^(Start-Date|End-Date|Date|Remark)$/ && do {
	    # Ignore those directive
	    lr_info( $directive, ": ", $text );
	    last SWITCH;
	};
	/^SubComponent$/ && do  {
	    lr_info( "ignoring log of SubComponent $text" );
	    $self->{'skip_to_next_header'} = 1;
	    last SWITCH;
	};

	# Defaults
	lr_warn( "unknown directive: $line" );
    }
}

sub parse {
    my ($self, $line) = @_;

    if ( $line =~ /^#/ ) {
	if ( $self->{'skip_to_next_header'} && ! $self->{'in_header'} ) {
	    $self->{'skip_to_next_header'} = 0;
	}
	$self->{'in_header'} = 1;
	$self->parse_directive( $line );
    } elsif ( $self->{'version'} && $self->{'fields'} ) {
	$self->{'in_header'} = 0;
	return undef 
	  if $self->{'skip_to_next_header'};

	$self->parse_record( $line );
    } else {
	lr_err( "invalid W3C extended log file: must start by Version and Fields directives" );
    }
}

sub parse_record {
    $_[0]->{'w3c_parser'}->( $_[1] );
}

1;

__END__

=pod

=head1 NAME

Lire::W3CExtendedLog - Base implementation of a W3C Extended Log parser

=head1 SYNOPSIS

use Lire::W3CExtendedLog;

my $parser = new Lire::W3CExtendedLog;

my $w3c_rec = $parser->parse( $line );

=head1 DESCRIPTION

This module defines objects able to parse W3C Extended Log Format.
This log format is defined at http://www.w3.org/TR/WD-logfile.html

All attributes of the created object can be overriden by e.g. modules extending
the object.  The attributes are:

=head2 type2regex

I<type2regex> is a hash containing key-value pairs like

 'name' => '([-_.0-9a-zA-Z]+)'

Keys are all data formats for log file field entries as defined in the W3C
specification: 'integer', 'fixed', 'uri', 'date', 'time' and 'string', along
with 'name' and 'address' types.

=head2 identifier2type

I<identifier2type> is a hash containing key-value pairs like

 'dns'        => 'name',
 'uri-query'  => 'uri',
 'ip'         =>

Keys are the W3C defined Field identifiers, with their prefixes stripped off.

=head2 field2re

I<field2re> is subroutine; when called as

 $self->{field2re('c-ip')}

it will return e.g.

 '(\d+\.\d+\.\d+\.\d+|-)'

Arguments are as found in the Fields directive, so, in an ideal world, should
be identifiers.  It uses I<type2regex>.

=head2 field2decoder

I<field2decoder> is a subroutine; it returns one of I<\&uri_decode> ,
I<\&string_decode> or I<undef>, depending on, a.o., I<is_iis>.  It is
used by I<build_parser>.

=head2 parse

I<parse> is the preferred interface to this module.  It expects a line as its
argument, and returns a reference to a hash (like I<&w3c_parser>), or executes
I<&parse_directive>.

=head2 parse_directive

I<parse_directive> expects a directive in its argument, it fills the object.

=head2 w3c_parser

I<w3c_parser> is a subroutine; it expects a logline as argument, and returns a
reference to a hash, mapping $self->{'fields'} entries to their decoded values.
It uses the I<&field2re> and I<&field2decoder> routines.  It is build in
build_parser.

=head2 build_parser

I<build_parser> is a subroutine, it builds and returns I<&w3c_parser>.
It is called in I<&parse_directive>.

=head2 log_date and log_time

I<log_date> and I<log_time> contain strings constructed from the Date
directive.

=head2 version and sofware

I<version> and I<software> contain strings constructed from the Version and
Software directives, respectively.

=head2 fields

I<fields> contains the entire string from the Fields directive.

=head2 is_iis

I<is_iis> is set in case the Software directive contains 'Microsoft Internet'
as a substring.  It is used to enable IIS specific support.

=head2 tab_sep

I<tab_sep> is set in case tabs are found in the Fields directive.  We assume
these will be used in the log itself too, and allow unescaped spaces in the
log.


Summarizing:

 &parse --calls--> &parse_directive
        `--calls--> &w3c_parser

 &parse_directive --calls--> &build_parser

 &build_parser --calls--> &field2decoder
              `--calls--> &field2re
              `--returns--> &w3c_parser

 &field2decoder --returns--> &uri_decode, &string_decode

 &field2re --uses--> %type2regex
           `--uses--> %identifier2type


=head1 BUILDING INHERITING MODULES

FIXME .  Needs to be written.  Steal from w3c_extended2dlf's
Lire::WWW::ExtendedLog, which ISA Lire::W3CExtendedLog.

=head1 SEE ALSO

w3c_extended2dlf(1), ms_isa2dlf(1)

=head1 AUTHOR

  Francis J. Lacoste <flacoste@logreport.org>

=head1 VERSION

$Id: W3CExtendedLog.pm,v 1.18 2006/07/23 13:16:30 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2001-2002 Stichting LogReport Foundation LogReport@LogReport.org

This file is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html. 

=cut