#!/usr/bin/perl -w

#
# This script reads Squid access log and downloads referenced objects,
# stuffing them into Polygraph Content Database (.cdb) files, based on
# reported or guessed content type. The user specifies the directory
# where the files should be created or updated.
#

use strict;

# content group entries are checked in the order they are listed here
# last group always matches
my @ContentGroups = (
	{
		name => 'images',
		ctypes => [ qr|image/|i ],
		extensions => [ qr(jpeg|jpg|gif|png)i ],
		format => 'verbatim',
	},

	{
		name => 'htmls',
		ctypes => [ qr|text/\w*html|i ],
		extensions => [ qr(html?)i ],
		format => 'linkonly',
	},

	{
		name => 'downloads',
		ctypes => [ qr|application/(?!\w*java)/|i ],
		extensions => [ qr(zip|tar|tgz|gz|exe)i ],
	},

	{
		name => 'other',
		ctypes => [ qr|.|i ],
		extensions => [ qr|.|i ],
	},
);

my ($opt, $Dir) = @ARGV;
die("usage: $0 --cdbs <existing directory to create or update .cdb files in>\n")
	unless defined $Dir && -d $Dir && $opt eq '--cdbs';
$Dir =~ s|/*$||g;
shift @ARGV; shift @ARGV;
	
# init groups
foreach my $g (@ContentGroups) {
	$g->{hits} = 0;
	$g->{ctypes_stats} = {};
	$g->{extensions_stats} = {};
	$g->{format} = 'verbatim' unless exists $g->{format};
}

$| = 1;

my $cntEntry = 0;
while (<>) {
	chomp;
	++$cntEntry;
	&reportProgress() if $cntEntry % 1000 == 0;

	my @fields = (split);
	next unless @fields >= 10;
	my $url = $fields[6];
	my $type = $fields[9];

	# find matching content group
	my $match;
	foreach my $g (@ContentGroups) {
		last if $match = &groupMatch($g, $url, $type);
	}
	# last resort
	$match = $ContentGroups[$#ContentGroups] unless $match;

	&get($match, $url);
}
&reportProgress();

map { &reportGroup($_) } sort { $b->{hits} <=> $a->{hits} } @ContentGroups;

exit(0);

sub groupMatch {
	my ($g, $url, $type) = @_;

	my $match;
	if (defined $type && $type ne '-') {
		$match = &listMatch($g, 'ctypes', $type);
	} 

	if (!$match && defined $url) {
		my ($ext) = ($url =~ m|/.*\w+\.([^\.]+)|);
		$match = &listMatch($g, 'extensions', $ext) if defined $ext;
	}

	return $match;
}

sub listMatch {
	my ($g, $name, $text) = @_;
	my $list = $g->{$name};

	foreach my $e (@{$list}) {
		if ($text =~ m/$e/) {
			my $stats = $g->{"${name}_stats"};
			if (exists $stats->{$e}) {
				++$stats->{$e};
			} else {
				$stats->{$e} = 1;
			}
			return $g;
		}
	}
	return undef();
}

sub get {
	my ($g, $url) = @_;

	my $tmp = sprintf('%s/wget-%d.out', $Dir, $$);
	my $wget = sprintf("wget --output-document=%s --server-response '%s'",
		$tmp, $url);
	if (system($wget)) {
		warn("failed to fetch '$url'\n");
		return;
	}

	my $db = sprintf('%s/%s.cdb', $Dir, $g->{name});
	my $cdb = sprintf("cdb %s add --format %s %s",
		$db, $g->{format}, $tmp);
	if (system($cdb)) {
		die("failed to add '$url' to $db: $!\n");
	}

	unlink $tmp;

	++$g->{hits};
}

sub reportGroup {
	my ($g) = @_;

	printf("Group: %s\n", $g->{name});
	printf("\thits:  %10d\n", $g->{hits});

	&reportList($g, 'ctypes');
	&reportList($g, 'extensions');

	printf("\n");
}

sub reportList {
	my ($g, $name) = @_;
	my $stats = $g->{"${name}_stats"};

	my $total = 0;
	map { $total += $_ } values %{$stats};

	printf("\t%-10s: %10d\n", $name, $total);

	while (my ($key, $value) = each %{$stats}) {
		printf("\t\t%-20s %5d %10.3f\n", $key, $value, &percent($value, $total));
	}
}

sub reportProgress {
	printf(STDERR "#lines: %03dK\n", $cntEntry/1000);
}

sub percent {
	my ($part, $whole) = @_;
	die() unless defined $whole;
	return -1 unless $whole > 0 && defined($part);
	no integer;
	return 100. * $part/$whole;
}


syntax highlighted by Code2HTML, v. 0.9.1