#!/usr/bin/perl -w

#
# This script reads a Squid access log and computes test configuration
# values (suitable for use in Polygraph PGL configuration files) that
# mimic the logged traffic.
#
# NOTE: request interarrival time distribution calculation assumes that
# the first field in access log is request arrival time (not response departure
# time recorded in standard Squid logs), and that the log is sorted by that
# first field; use the following command or equivalent to convert standard
# access log:
#
#  % access2order access.log | sort -t' ' -n -k1
#

use strict;

# A gap between two requests of one client that reaches this value ends the
# client's busy period; it is compared against 1000 x (timestamp difference)
# in the main loop. Leave it undefined to treat every gap as busy time.
my $SessionIdleTout = 60 * 1000.0;

# Every distribution the script collects, keyed by an internal name.
# Each constructor receives the identifier used in the generated output
# and a human-readable title for the report header.
my %Ds = (
	# session structure
	InterArrival     => newTimeDistr('my_req_inter_arrival', 'Request interarrival times during busy periods'),
	SessionBusyDur   => newTimeDistr('my_session_busy_period', 'Duration of a busy session period'),
	SessionBusyCount => newNumDistr('my_session_busy_count', 'Number of requests per busy session period'),
	SessionIdleDur   => newTimeDistr('my_session_idle_period', 'Duration of an idle session period'),

	# per-request properties
	Rptm              => newTimeDistr('my_think_time', 'Response times'),
	RequestHeaderSize => newSizeDistr('my_req_header_size', 'Request header sizes'),
	RequestBodySize   => newSizeDistr('my_req_content_size', 'Request body sizes'),
	ResponseSize      => newSizeDistr('my_resp_size', 'Response sizes'),
	StatusCodes       => newEventsDistr('my_resp_codes', 'Response status codes'),
#	RequestTypes      => newEventsDistr('my_req_types', 'Request types'),
	RequestMethods    => newEventsDistr('my_req_methods', 'Request methods'),
);

# Per-client session state, keyed by the client IP field of the log.
my %Ips = ();

# Progress counters: log entries read and distinct client IPs seen.
my ($cntEntry, $cntIp) = (0, 0);

# Unbuffered STDOUT.
$| = 1;

# Main pass over the access log (files named on the command line, or STDIN).
#
# Fields used, after splitting each line on whitespace:
#   [0]  request time; differences are multiplied by 1000 before being
#        compared with $SessionIdleTout
#   [1]  response time
#   [2]  client IP; sessions are tracked per IP
#   [3]  "result/status-code"
#   [4]  response size
#   [5]  request method
#   [10] request header size
#   [11] request body size
#
# Blank, truncated or non-numerically timestamped lines are skipped with a
# warning. Dies if the timestamps of one client ever go backwards.
while (my $line = <>) {
	chomp $line;
	++$cntEntry;
	reportProgress() if $cntEntry % 1000 == 0;

	my @fields = split ' ', $line;
	my ($time, $rptm, $ip, $status) = @fields;

	# Without these four fields nothing below is meaningful; skipping also
	# avoids creating a bogus session keyed on an undefined IP.
	unless (defined $status && $time =~ /^\d+(?:\.\d*)?$/) {
		warn("skipping malformed access log entry at input line $.\n");
		next;
	}
	my (undef, $scode) = split(m|/|, $status);

	my $session = $Ips{$ip};
	die("access log not sorted by request time, stopped")
		if defined $session && $time < $session->{last};

	# Per-request distributions cover every entry, including the first one
	# seen from each client. A missing or non-numeric status code is
	# treated as "neither 200 nor 304".
	my $code = (defined $scode && $scode =~ /^\d+$/) ? $scode : 0;
	updateDistr($Ds{Rptm}, $rptm) if $code == 200 || $code == 304;
	updateDistr($Ds{RequestHeaderSize}, $fields[10]);
	updateDistr($Ds{RequestBodySize}, $fields[11]);
	updateDistr($Ds{ResponseSize}, $fields[4]) if $code == 200;
	updateDistr($Ds{StatusCodes}, $scode);
	updateDistr($Ds{RequestMethods}, $fields[5]);

	# First entry from this client: open its first busy period.
	unless (defined $session) {
		++$cntIp;
		$Ips{$ip} = {
			last => $time,
			busy_start => $time,
			busy_count => 1,
		};
		next;
	}

	my $gap = 1000.*($time - $session->{last});
	if (!defined $SessionIdleTout || $gap < $SessionIdleTout) {
		# still inside the current busy period
		updateDistr($Ds{InterArrival}, $gap);
	} else {
		# The client was idle long enough: record the busy period that
		# ended at the previous request, record the idle gap, and start
		# a new busy period with the current request.
		updateDistr($Ds{SessionBusyCount}, $session->{busy_count});
		updateDistr($Ds{SessionBusyDur},
			1000.*($session->{last} - $session->{busy_start}));
		updateDistr($Ds{SessionIdleDur}, $gap);
		$session->{busy_start} = $time;
		$session->{busy_count} = 0;
	}
	$session->{last} = $time;
	$session->{busy_count}++;
}
reportProgress();

# Print every collected distribution, ordered by its output identifier.
for my $distr (sort { $a->{id} cmp $b->{id} } values %Ds) {
	reportDistr($distr);
}

exit(0);

# Create a distribution of discrete events (status codes, methods, ...).
# It has a single area named 'all' with no upper bound and no factor; the
# absence of a factor is what marks the distribution as non-numeric.
#   $id   - identifier used in the generated output
#   $name - human-readable title
# Returns the distribution hash reference built by newDistr().
sub newEventsDistr {
	my ($id, $name) = @_;
	my $catchAll = newArea('all', undef, undef);
	return newDistr($id, $name, [ $catchAll ]);
}

# Create a distribution of time values with three areas of increasing
# upper bound and increasingly coarse bins (the third area element is the
# bin width used by updateDistr()).
#   $id   - identifier used in the generated output
#   $name - human-readable title
# Returns the distribution hash reference, tagged as 'time_distr' and set
# up to be reported in seconds (stored values are divided by 1000).
sub newTimeDistr {
	my ($id, $name) = @_;

	my @layout = (
		[ 'frequent',   1000,       1 ],
		[ 'medium',     10 * 1000,  10 ],
		[ 'occasional', 100 * 1000, 100 ],
	);
	my @areas = map { newArea(@$_) } @layout;

	my $distr = newDistr($id, $name, \@areas);
	@{$distr}{qw(pgl_type report_factor report_unit)} =
		('time_distr', 1000.0, 'seconds');
	return $distr;
}

# Create a distribution of sizes with five areas, from 'tiny' to 'huge',
# each with a ten times larger upper bound and bin width than the previous.
#   $id   - identifier used in the generated output
#   $name - human-readable title
# Returns the distribution hash reference, tagged as 'size_distr' and
# reported in bytes (no report factor is set).
sub newSizeDistr {
	my ($id, $name) = @_;

	my @layout = (
		[ 'tiny',   1024,         1 ],
		[ 'small',  10 * 1024,    10 ],
		[ 'medium', 100 * 1024,   100 ],
		[ 'large',  1000 * 1024,  1000 ],
		[ 'huge',   10000 * 1024, 10000 ],
	);
	my @areas = map { newArea(@$_) } @layout;

	my $distr = newDistr($id, $name, \@areas);
	@{$distr}{qw(pgl_type report_unit)} = ('size_distr', 'bytes');
	return $distr;
}

# Create a distribution of plain counts. It reuses the area layout of
# newSizeDistr() and only overrides the output type and the unit label.
# Arguments are passed through unchanged: ($id, $name).
# Returns the distribution hash reference.
sub newNumDistr {
	my $distr = newSizeDistr(@_);
	@{$distr}{qw(pgl_type report_unit)} = ('num_distr', 'number');
	return $distr;
}

# Build the common distribution record.
#   $id    - identifier used in the generated output
#   $name  - human-readable title
#   $areas - array reference of area records (see newArea()), ordered by
#            increasing upper bound; the records are modified in place
# Each area after the first gets its 'min' set to the 'max' of the area
# before it, provided that 'max' is defined.
# Returns a hash reference with zeroed counters; pgl_type, report_factor
# and report_unit are left undefined for the caller to fill in.
sub newDistr {
	my ($id, $name, $areas) = @_;

	my %distr = (
		id            => $id,
		name          => $name,
		areas         => $areas,

		# filled in by the type-specific constructors
		pgl_type      => undef,
		report_factor => undef,
		report_unit   => undef,

		# running totals maintained by updateDistr()
		count         => 0,
		sum           => 0,
		sqSum         => 0,
	);

	# chain the areas: the lower bound of one is the upper bound of the previous
	my $prevMax;
	for my $area (@{ $distr{areas} }) {
		if (defined $prevMax) {
			$area->{min} = $prevMax;
		}
		$prevMax = $area->{max};
	}

	return \%distr;
}

# Build one area (value range) of a distribution.
#   $name   - label of the area
#   $max    - upper bound of the area, or undef
#   $factor - bin width used when storing values, or undef
# Returns a hash reference; 'min' starts undefined (newDistr() assigns it)
# and 'values' starts as an empty hash of per-bin counters.
sub newArea {
	my ($name, $max, $factor) = @_;

	my %area = (
		name   => $name,
		min    => undef,
		max    => $max,
		factor => $factor,
		values => {},
	);
	return \%area;
}

# Record one observation in a distribution.
#   $distr - distribution record (see newDistr())
#   $value - observed value; undef and '-' are silently ignored
# The value goes into the last area whose 'min' is defined and not greater
# than the value, or into the first area if there is none. When the area
# has a factor, the raw value is added to the running sum and sum of
# squares and is then reduced to a bin key by integer division with that
# factor; otherwise the value itself is the key.
# Dies if the distribution has no areas.
sub updateDistr {
	my ($distr, $value) = @_;
	return if !defined $value || $value eq '-';

	# pick the area: the first one by default, a later one if it covers $value
	my $area;
	for my $candidate (@{ $distr->{areas} }) {
		if (!defined $area) {
			$area = $candidate;
			next;
		}
		next unless defined $candidate->{min};
		$area = $candidate if $value >= $candidate->{min};
	}
	$area or die("no matching area for $value in $distr->{name} distro, stopped");

	my $factor = $area->{factor};
	if (defined $factor) {
		$distr->{sum}   += $value;
		$distr->{sqSum} += $value * $value;

		$value = int($value / $factor);
	}

	$distr->{count}++;
	$area->{values}->{$value}++;
}

# Print one distribution to STDOUT: a comment header with the title and
# the number of observations, then the body, then an empty line.
# A distribution whose first area has a factor is printed as a numeric
# distribution; any other one is printed as an event list.
#   $distr - distribution record (see newDistr())
sub reportDistr {
	my ($distr) = @_;

	printf("# %s\n", $distr->{name});
	printf("#\tcount:   %10d\n", $distr->{count});

	my $isNumeric = defined $distr->{areas}->[0]->{factor};
	my $reporter = $isNumeric ? \&reportNumDistr : \&reportEventDistr;
	$reporter->($distr);

	printf("\n");
}

# Print a numeric distribution to STDOUT.
#   $distr - distribution record (see newDistr())
# Output: comment lines with median, mean, standard deviation and relative
# deviation (only when there are observations), the unit, and then a
# "<pgl_type> <id> = { ... }" block with the bins of every area.
# The standard deviation needs at least two observations; with a single
# one both std_dev and rel_dev are omitted.
sub reportNumDistr {
	my ($distr) = @_;

	my $count = $distr->{count};
	if ($count) {
		my $mean = $distr->{sum}/$count;
		my $dev;
		if ($count > 1) {
			my $diff = $distr->{sqSum} - $distr->{sum}*$distr->{sum}/$count;
			# Rounding can push this slightly below zero when all values
			# are (nearly) equal, and sqrt() of a negative number is fatal.
			$diff = 0 if $diff < 0;
			$dev = sqrt($diff / ($count - 1));
		}
		my $median = distrPercentile($distr, 50.0);

		printf("#\tmedian:  %s\n", distrValue($distr, $median));
		printf("#\tmean:    %s\n", distrValue($distr, $mean));
		if (defined $dev) {
			printf("#\tstd_dev: %s\n", distrValue($distr, $dev));
			printf("#\trel_dev: %14.3f%%\n", percent($dev, $mean)) if $mean > 0;
		}
	}
	printf("#\tunit:    %10s\n", $distr->{report_unit});

	printf("%s %s = {\n", $distr->{pgl_type}, $distr->{id});

	# running number of observations already printed, shared by all areas
	my $sum = 0;
	foreach my $area (@{$distr->{areas}}) {
		reportNumArea($distr, $area, \$sum);
	}

	printf("}\n");
}

# Print the bins of one area of a numeric distribution.
#   $distr  - distribution record the area belongs to
#   $area   - area record (see newArea())
#   $sumPtr - reference to the running count of observations handled so
#             far across all areas; updated in place
# Stored keys are visited in numeric order and merged into an output bin.
# The pending bin is printed (via nextBin()) before a key is added whenever
# adding it would bring the bin to at least 1% of all observations, and
# once more at the end if anything is left.
sub reportNumArea {
	my ($distr, $area, $sumPtr) = @_;

	my $counts = $area->{values};
	my %bin = (min => undef, max => undef, count => 0);

	for my $key (sort { $a <=> $b } keys %$counts) {
		my $hits = $counts->{$key};
		# undo the scaling applied when the value was stored
		my $value = int($key * $area->{factor});

		if ($bin{count} + $hits >= $distr->{count}/100.) {
			nextBin($distr, \%bin, $$sumPtr);
		}

		$bin{min} = $value unless defined $bin{min};
		$bin{max} = $value;
		$bin{count} += $hits;
		$$sumPtr += $hits;
	}

	nextBin($distr, \%bin, $$sumPtr) if $bin{count};
}

# Print the pending output bin and reset it; does nothing for an empty bin.
#   $distr - distribution record (for formatting and the total count)
#   $bin   - hash reference with min, max and count; reset in place
#   $sum   - number of observations handled so far, used for the
#            cumulative percentage printed after the '#'
# The line has the form "min : max share # cumulative", both percentages
# being relative to the total number of observations.
sub nextBin {
	my ($distr, $bin, $sum) = @_;
	return unless $bin->{count};

	my $total = $distr->{count};
	my $lo = distrValue($distr, $bin->{min});
	my $hi = distrValue($distr, $bin->{max});

	printf("\t%s : %s %10.3f # %10.3f\n", $lo, $hi,
		percent($bin->{count}, $total),
		percent($sum, $total));

	# start over with an empty bin
	$bin->{count} = 0;
	$bin->{max} = undef;
	$bin->{min} = undef;
}

# Find the value at a given percentile of a numeric distribution.
#   $distr - distribution record (see newDistr())
#   $level - percentile, 0..100
# Walks the areas in order and the stored keys of each in numeric order,
# accumulating counts. Returns the (rescaled) value at which the cumulative
# share first reaches $level, the last value seen if it never does, or
# undef when nothing is stored.
sub distrPercentile {
	my ($distr, $level) = @_;

	my ($seen, $latest) = (0, undef);
	for my $area (@{ $distr->{areas} }) {
		my $counts = $area->{values};
		for my $key (sort { $a <=> $b } keys %$counts) {
			$seen += $counts->{$key};
			$latest = int($key * $area->{factor});
			if (percent($seen, $distr->{count}) >= $level) {
				return $latest;
			}
		}
	}
	return $latest;
}

# Format a value of a distribution for output.
#   $distr - distribution record; only report_factor is consulted
#   $v     - value to format
# With a (true) report factor the value is divided by it and printed with
# three decimals in a 14-character field; otherwise it is printed as an
# integer in a 10-character field.
sub distrValue {
	my ($distr, $v) = @_;

	my $factor = $distr->{report_factor};
	return sprintf('%10d', $v) unless $factor;
	return sprintf('%14.3f', $v/$factor);
}

# Print an event distribution to STDOUT as "<id> = [ ... ];".
#   $distr - distribution record; only its first area is used
# Events are listed from most to least frequent; events with equal counts
# are ordered by name so that the output is reproducible. The most
# frequent event is printed without a percentage, every other one with its
# share of all observations.
sub reportEventDistr {
	my ($distr) = @_;

	printf("%s = [\n", $distr->{id});
	my $counts = $distr->{areas}->[0]->{values};

	my @events = sort { $counts->{$b} <=> $counts->{$a} || $a cmp $b }
		keys %$counts;

	my $printed = 0;
	foreach my $event (@events) {
		my $c = $counts->{$event} or next;
		my $label = sprintf('"%s"', $event);
		if ($printed == 0) {
			printf("\t%-10s", $label); # let most frequent entry absorb cal mistakes
		} else {
			printf(",\n");
			printf("\t%-10s : %.3f%%", $label, percent($c, $distr->{count}));
		}
		++$printed;
	}

	# no arguments here: the format has no conversions
	printf("\n];\n");
}

# Write a progress line to STDERR: the number of log entries read so far,
# in thousands, and the number of distinct client IPs seen.
sub reportProgress {
	my $thousands = $cntEntry/1000;
	printf(STDERR "#%03dK IPs: %3d\n", $thousands, $cntIp);
}

# Express $part as a percentage of $whole.
#   $part  - numerator; may be undef
#   $whole - denominator; must be defined
# Returns 100 * $part / $whole, or -1 when $whole is not positive or
# $part is undefined. Dies when $whole is undefined.
sub percent {
	my ($part, $whole) = @_;
	die() unless defined $whole;

	if (!($whole > 0) || !defined($part)) {
		return -1;
	}

	no integer; # the division must be floating-point
	my $ratio = $part/$whole;
	return 100. * $ratio;
}


# syntax highlighted by Code2HTML, v. 0.9.1