#!/usr/local/bin/perl
#
###
# Project:     pflogstats
# Program:     apachelogiostats.pl
# Description: Main program for extracting accounting information from Apache2 logio
#
#              Copyright (C) 2003-2003 by Dr. Peter Bieringer <pbieringer at aerasec dot de>
#               ftp://ftp.aerasec.de/pub/linux/postfix/pflogsumm/
#
# License:     GNU GPL v2
# CVS:         $Id: apachelogiostats.pl,v 1.12 2005/04/26 16:02:13 peter Exp $
#
# See also following files: LICENSE, ChangeLog
###

###
# ChangeLog
#	0.01
#	 - copy from sqwmstats.pl and adapt it
#	0.02
#	 - fix bug for matching in common log lines
#	 - add optional (default) TCP overhead accounting (IPv6-enabled)
#	0.03
#	 - implement caching of IP address version (speed-up: x3)
#	0.04
#	 - fix layout problem on accounting > 1 GB
#	0.05
#	 - fix typo introduced in 0.04
#	0.06
#	 - fix parser to accept user with whitespaces
#	0.07
#	 - replace number format function call
###
# ToDo
#	- timerange (set)
#	- implement "format"
###

use strict;
use Getopt::Long;
use Net::IP;

## Program identity (name and release)
use vars qw{$release $progName};
($progName, $release) = ("apachelogiostats.pl", "0.07");


## Package-global variables

# Option handling: %options maps each Getopt::Long specification to a
# reference into %opts, which receives the parsed value.
use vars qw{%options %opts};

for my $spec ('help|h|?', "version", 'acc_notcpoverhead') {
	# Values are stored under the primary option name (text before the first '|')
	my ($name) = split /\|/, $spec;
	$options{$spec} = \$opts{$name};
};

# Module hooks: hook type => { hook name => code reference }
# (never filled in this file; presumably registered by the required modules)
use vars qw{%hooks};

# Number formats (not referenced in this file)
use vars qw{%numberformat};

## Module loader
# Append the module search locations in order of preference:
#  1st: current directory
#  2nd: /usr/local/lib/perl5/site_perl/5.8.8/Pflogstats
#  3rd: /usr/lib/pflogstats
push @INC,
	".",
	"/usr/local/lib/perl5/site_perl/5.8.8/Pflogstats",
	"/usr/lib/pflogstats";

# General support modules, loaded at run time in this order
for my $module_file ("pflogstats-common-support.pm", "pflogstats-extensions-networking.pm") {
	require $module_file;
};

## Print options (debug)
#for my $key (keys %options) {
#       print $key . "\n";
#};
#exit 0;


# Local variables

# Byte counters filled by the main loop and printed by print_apachelogio_stats()
my %accounting = (
	'sent' => 0,
	'rcvd' => 0,
);

my @mainhelptext;	# not referenced in the visible code
my $p_hook;		# name of the hook currently being processed

# Time range of logdata
use vars qw{$timemin $timemax};
my ($time);

# Month abbreviation => zero-based month number (as passed to timelocal())
my %monthNums;
@monthNums{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (0 .. 11);


# Local functions prototyping
sub print_apachelogio_stats();


# Help
# Returns (does not print) the option summary of this program as one
# multi-line string with a leading and a trailing newline.
sub help() {
	return join("\n",
		"",
		"    Type: accpopimap",
		"    [--acc_notcpoverhead]        Don't account estimated TCP overhead",
		"    [--debug <debug>]            Debug value",
		"                                  | 0x0020 : display extracted data from log line",
		"",
	);
}


## Help function
# Prints the program banner (STDOUT), then the option summary of this program
# followed by the option summaries returned by every registered 'help' hook
# (both on STDERR). Takes no arguments.
sub print_help() {
	print "$progName $release\n\n";

	# help() only returns its text, so it has to be printed explicitly;
	# previously the return value was discarded and never shown.
	print STDERR help();

	print STDERR "  Options from included modules:\n\n";

	## Hook 'help'
	# Sorted by hook name so the output order is stable between runs
	for my $p_hook (sort keys %{$hooks{'help'}}) {
		my $helpstring = $hooks{'help'}->{$p_hook}->();
		print STDERR "    Options from module '" . $p_hook . "':";
		print STDERR $helpstring . "\n";
	};
}

## Hook 'early_begin': run every registered hook before the options are parsed
for my $hook_name (keys %{$hooks{'early_begin'}}) {
	my $hook = $hooks{'early_begin'}->{$hook_name};
	&$hook;
};

## Get options; an unparsable command line shows the help and fails
unless (GetOptions(%options)) {
	print_help();
	exit 1;
};

# Print help or version, then stop successfully
if (defined $opts{'help'}) {
	print_help();
	exit 0;
} elsif (defined $opts{'version'}) {
	print "$progName $release\n";
	exit 0;
};


## Hooks 'checkoptions' and 'beforemainloopstarts', in this order
for my $phase ('checkoptions', 'beforemainloopstarts') {
	for my $hook_name (keys %{$hooks{$phase}}) {
		my $hook = $hooks{$phase}->{$hook_name};
		&$hook;
	};
};

print "DEBUG: start parsing logfile\n" if ($opts{'debug'});

## Start parsing logfile #################################################
# Reads log lines from the files named on the command line (or STDIN) and,
# for every line that can be parsed:
#  - tracks the oldest/newest timestamp in $timemin/$timemax
#  - adds the received/sent byte counts to %accounting
#  - unless --acc_notcpoverhead is given, adds an estimated TCP/IP overhead

# Combined IP + TCP header size per IP version:
#  IPv4[20] + TCP[20] = 40, IPv6[40] + TCP[20] = 60
my %tcpip_header_bytes = (4 => 40, 6 => 60);

# MTU assumed when estimating the number of packets
my $assumed_mtu = 1500;

# IP version per client address, cached because building a Net::IP object
# for every log line is expensive
my %cache_ip_version;

LINE: while (<>) {
	chomp;
	s/\r$//; # Remove trailing CR
	s/^[[:space:][:cntrl:]]+$//; # Blank out lines of spaces and ctrl chars only

	next LINE if (length($_) == 0); # skip empty lines


	# Parsing web log

	# Todo: Datematching!!!!

	# Logline: 1.2.3.4 - - [01/Sep/2003:00:38:27 +0200] "GET /path/to/logo.gif HTTP/1.0" 401 401 "https://smtp2.aerasec.de/webmail/cgi-bin/sqwebmail?noframes=1" "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.3) Gecko/20030312" IN=536 OUT=698

	# Get content
	# (print, not printf: log lines regularly contain '%' from URL encoding,
	#  which must not be interpreted as format directives)
	print STDERR "DEBUG/apachelogio: line: " . $_ . "\n" if ( $opts{'debug'} & 0x0020 );

	# All six variables stay undefined if the line does not match
	my ($ip_string, $user, $date, $request, $returncode, $size) = /^([^\s]+)\s+[^\s]+\s+([^\[]+)\s+\[(.*)\]\s+\"(.*)\"\s+(\d+)\s+(\d+|\-)/i;

	print STDERR "DEBUG/apachelogio: ip='" . $ip_string . "' user='" . $user . "' date='" . $date . "' request='" . $request . "' returncode='" . $returncode . "' size='" . $size . "'\n" if ( $opts{'debug'} & 0x0020 );

	# Calculate Unixtime
	# The captures are copied immediately and only used after a successful
	# match; after a failed match $1..$6 still hold values of an older match.
	my ($mday, $month, $year, $hour, $min, $sec);
	if (defined $date && $date =~ /^(\d+)\/(.*)\/(\d+):(\d+):(\d+):(\d+) /) {
		($mday, $month, $year, $hour, $min, $sec) = ($1, $2, $3, $4, $5, $6);
	};

	if (defined $month && exists $monthNums{$month}) {
		# NOTE(review): timelocal() is not imported in this file; presumably
		# it is made available by the modules required above - confirm
		my $stamp = timelocal($sec, $min, $hour, $mday, $monthNums{$month}, $year);

		# Catch min/max times for late timerange display
		if (! defined $timemin || ! defined $timemax) {
			# initial values
			$timemin = $stamp if (! defined $timemin);
			$timemax = $stamp if (! defined $timemax);
		} elsif ($stamp < $timemin) {
			$timemin = $stamp;
		} elsif ($stamp > $timemax) {
			$timemax = $stamp;
		};
	} else {
		# Report the problem, but leave the time range untouched
		print STDERR "ERROR/apachelogio: line contains no valid date: " . (defined $date ? $date : "") . "\n";
	};


	# Log line containing prefix for IN and OUT bytes from logio
	my ($rcvd, $sent);
	if ( /\s+IN=([0-9]+).*$/ ) {
		$rcvd = $1;
	};
	if ( /\s+OUT=([0-9]+).*$/ ) {
		$sent = $1;
	};

	if ((! defined $sent) && (! defined $rcvd)) {
		# logio values without any prefix tokens at the end of the log line
		# NOTE(review): a plain common-format line also ends in two numbers
		# (status and size) and would match here as well - confirm intended
		if ( /([0-9]+)\s+([0-9]+)$/ ) {
			$rcvd = $1;
			$sent = $2;
		};
	};

	if ((! defined $sent) && (! defined $rcvd)) {
		# use size, we still have nothing else
		if ($size =~ /^[0-9]+$/) {
			$sent = $size;
		};
		# use length of request, we still have nothing else
		#  Rest of HTTP header cannot be estimated
		if ($request ne "-") {
			# Request + CRLF + CRLF
			$rcvd = length($request) + 4;
		} else {
			# No request, at least 2x CRLF
			$rcvd = 4;
		};
	};

	if ( ! (defined $user && defined $ip_string && defined $rcvd && defined $sent ) ) {
		# not a proper accounting line
		print STDERR "DEBUG/apachelogio: not a proper line\n" if ( $opts{'debug'} & 0x0010 );
		next LINE;
	};

	# Hook "testipaddress": any hook returning non-zero excludes the line
	for my $p_hook (keys %{$main::hooks{'testipaddress'}}) {
		if ( $main::hooks{'testipaddress'}->{$p_hook}->($ip_string, 'returnonerror') != 0 ) {
			# excluded
			print STDERR "DEBUG/apachelogio: excluded from accounting\n" if ( $opts{'debug'} & 0x0010 );
			next LINE;
		};
	};

	$accounting{'sent'} += $sent;
	$accounting{'rcvd'} += $rcvd;

	# Payload only, if TCP overhead accounting is switched off
	next LINE if (defined $opts{'acc_notcpoverhead'});

	# Determine IP version of the client address
	my $ip_version = $cache_ip_version{$ip_string};
	if (! defined $ip_version) {
		# not in cache, retrieve information
		my $ip = Net::IP->new($ip_string);

		if (defined $ip) {
			my $version = $ip->version();
			if (defined $version && exists $tcpip_header_bytes{$version}) {
				$ip_version = $version;
			} else {
				$ip_version = 0; # dummy, die later
			};
		} else {
			# Hostname instead of IP address, assume IPv4
			$ip_version = 4;
		};

		# Fill cache
		$cache_ip_version{$ip_string} = $ip_version;
	};

	my $header = $tcpip_header_bytes{$ip_version};
	if (! defined $header) {
		die "Unsupported IP version: $ip_version (address: $ip_string)";
	};

	# Usable payload per packet: 1460 for IPv4, 1440 for IPv6
	my $payload = $assumed_mtu - $header;

	# Packets received / sent (rounded up)
	my $pr = int(($rcvd + $payload - 1) / $payload);
	my $ps = int(($sent + $payload - 1) / $payload);

	# Assume 10% ACKs for the packets of the opposite direction
	my $ack = $header / 10;

	# Received: 2xSYN + 2xFIN = 4 headers (IPv4: 160, IPv6: 240),
	#  one header per received packet, ACKs for sent packets
	$accounting{'rcvd'} += 4 * $header + $pr * $header + $ps * $ack;

	# Sent: 1xSYN + 2xFIN = 3 headers (IPv4: 120, IPv6: 180),
	#  one header per sent packet, ACKs for received packets
	$accounting{'sent'} += 3 * $header + $ps * $header + $pr * $ack;
};

if ($opts{'debug'}) {
	print "DEBUG/apachelogio: end parsing logfile\n";
};

# Hook 'printstatistics': only run when the option of the same name is set
if (defined $main::opts{'printstatistics'}) {
	my $stat_hooks = \%{$main::hooks{'printstatistics'}};
	for my $hook_name (keys %$stat_hooks) {
		my $hook = $stat_hooks->{$hook_name};
		&$hook;
	};
};

# Own statistics are always printed last
print_apachelogio_stats();

exit 0;

#### END

# statistics
# Prints the accounting summary on STDOUT: headline, a note about included
# overheads (unless --acc_notcpoverhead was given), the time range and one
# row each for received bytes, sent bytes and their total. Returns 0.
sub print_apachelogio_stats() {
	my $double_rule = '=' x 75 . "\n";
	my $single_rule = '-' x 75 . "\n";
	my $row_format  = "%-50s: %11u   %9s\n";

	print_headline("Apache logio accounting statistics", "default");

	# TCP overhead is the only overhead that is accounted
	unless (defined $main::opts{'acc_notcpoverhead'}) {
		print "\n# Accounting data also contains following overheads:\n";
		print "#  + TCP overhead (partially estimated)\n";
	};

	print $double_rule;
	printf "%-50s: %6s\n", "", "BytesTraffic";
	print_timerange_normal();
	print $single_rule;

	# One row per direction: raw byte count plus formatted value
	for my $row (["received (requests)", 'rcvd'], ["sent (data)", 'sent']) {
		my ($label, $direction) = @{$row};
		printf $row_format,
			$label,
			$accounting{$direction},
			format_number($accounting{$direction});
	};

	print $single_rule;

	my $total = $accounting{'rcvd'} + $accounting{'sent'};
	printf $row_format, "Total", $total, format_number($total);

	print $double_rule;
	print "\n";

	return 0;
}


# syntax highlighted by Code2HTML, v. 0.9.1