#!/usr/bin/perl

# Set $WINDOWS to 1 if running on Windows
# (mainloop() puts STDIN into binmode when this is true).
$WINDOWS = 0;
# $WINDOWS = 1;

# Fields to include in the log file, in the order they are written;
# log_entry() walks this list to build each log line.
# XXX should be configurable somewhere
@log_format = qw(action mid reason groups bytes binary);

# Set before the filter code is loaded below.
# NOTE(review): presumably tells the filter code which front end it is
# running under -- confirm in cleanfeed.
$MODE = 'highwind';

# load the filter code and run get_config() and some other initialization code
require './cleanfeed'; # XXX path is relative to the current working directory

# set up our signal handlers.
# The handlers only raise a flag; mainloop() checks the flags after it has
# finished processing an article and does the actual work there.
$got_usr1 = 0;
$got_usr2 = 0;
$got_hup = 0;
$got_term = 0;
$SIG{USR1} = sub { $got_usr1 = 1 };	# use SIGUSR1 to write statfile
$SIG{USR2} = sub { $got_usr2 = 1 };	# use SIGUSR2 to dump EMP history
$SIG{HUP}  = sub { $got_hup  = 1 };	# use SIGHUP to reload configuration
$SIG{TERM} = sub { $got_term = 1 };	# catch SIGTERM so we can clean up

# Check for the PID file existence and create it
# (does nothing unless $config{pid_file} is set; dies if the file exists).
pid_file();

# Process articles until STDIN is exhausted or SIGTERM is caught, then
# exit explicitly so the statements at the very end of this file are
# never reached.
mainloop();
exit 0;

# Main loop for standalone mode.
#
# Reads articles from STDIN one at a time, each terminated by the
# "\r\n.\r\n" end-of-article marker. The headers are parsed into the
# global %hdr (the body goes under the __BODY__ key) and the article is
# handed to filter_art(). The verdict is reported on STDOUT: "435" when
# filter_art() returns a true value (which is also logged as the reason),
# "335" otherwise. Flags raised by the signal handlers are serviced after
# each article. When input ends or SIGTERM was caught, the logfile is
# closed, the EMP history is dumped if configured, and the pid file is
# removed.
sub mainloop {
	my $article;

	$| = 1; # Flush STDOUT
	binmode STDIN if $WINDOWS;

	open_logfile();

	# Lowercased header name => canonical capitalisation, for the headers
	# that get stored in %hdr under one fixed spelling.
	my %canonical = map { ( lc($_), $_ ) } qw(
		Approved
		Content-Base
		Content-Disposition
		Content-Type
		Control
		Date
		Distribution
		Followup-To
		From
		Lines
		Message-ID
		Newsgroups
		NNTP-Posting-Host
		Organization
		Path
		References
		Reply-To
		Sender
		Subject
		Supersedes
		User-Agent
		X-Trace
		X-Newsreader
		X-Newsposter
		X-Mailer
		X-Poster
		X-Cancelled-By
		X-Canceled-By
	);

	$/ = "\r\n.\r\n";	# every read returns one complete article

	%hdr = ();
	while (defined ($article = <STDIN>)) {
		$article =~ s/\r\n/\n/g;
		$bytes = length $article;	# size of the article

		# Everything before the first blank line is the header block.
		my ($head, $body) = split(/\n\n/, $article, 2);
		$hdr{__BODY__} = $body;
		$head =~ s/\n\s+/ /g;	# handle continuation headers

		# Store each "Name: value" line in %hdr; known headers are
		# filed under their canonical spelling, others as received.
		foreach my $line (split (/\n/, $head)) {
			next unless $line =~ /^([^ ]+): (.*)$/;
			my ($name, $value) = ($1, $2);
			my $lcname = lc $name;
			$name = $canonical{$lcname} if exists $canonical{$lcname};
			$hdr{$name} = $value;
		}

		my $verdict = filter_art();	# the real work

		if (not $verdict) {
			print "335\r\n"; # accepted
			log_entry() if $config{log_accepts};
		} else {
			print "435\r\n"; # rejected
			log_entry($verdict);
		}

		%hdr = ();

		# Service whatever signals arrived while we were busy.
		# SIGHUP: reload our config
		re_configure() if $got_hup;
		# SIGUSR1: write stats file
		if ($got_usr1) {
			writestats(1);
			$got_usr1 = 0;
		}
		# SIGTERM: terminate cleanly
		last if $got_term;
		# SIGUSR2: dump EMP histories
		if ($got_usr2) {
			dump_emp();
			$got_usr2 = 0;
		}

	} # stdin loop

	# Cleanup
	close_logfile();

	dump_emp() if $config{do_emp_dump};

	unlink $config{pid_file} if $config{pid_file};
}

# When running standalone, SIGHUP eventually brings us here: the signal
# handler only sets $got_hup, and mainloop() calls this sub once the
# current article has been processed.
# Reload the config, then close the logfile and reopen it if we still
# want one.
sub re_configure {
	$got_hup = 0;	# acknowledge the signal

	get_config();
	setup_stuff();

	# Close the logfile.
	close_logfile();
	# If we still want logfiles, open again
	# (open_logfile() does nothing when $Log_File is not set).
	open_logfile();
}

# Create a pid file containing our process id.
#
# Does nothing (returns undef) when $config{pid_file} is not set.
# Dies with "Cleanfeed already running (pid file)" if the file already
# exists. Any other failure to create or write the file only produces a
# warning, so a missing pid file never prevents the filter from starting.
sub pid_file {
	use Fcntl qw(O_WRONLY O_CREAT O_EXCL);

	return undef unless $config{pid_file};

	die "Cleanfeed already running (pid file)\n" if -e $config{pid_file};

	# O_EXCL makes "create only if absent" a single atomic step, so two
	# instances started at the same moment cannot both pass the existence
	# check above and then both write the file.
	if (sysopen(my $pidfh, $config{pid_file}, O_WRONLY | O_CREAT | O_EXCL)) {
		print $pidfh "$$\n";
		# Buffered write errors only show up at close time.
		close $pidfh
			or warn "cleanfeed can't write pid file: $!\n";
	} elsif ($!{EEXIST}) {
		# Somebody created it between the -e test and the sysopen.
		die "Cleanfeed already running (pid file)\n";
	} else {
		warn "cleanfeed can't create pid file: $!\n";
	}
}

##############################################################################
# logging
##############################################################################

# Rotate the logfile.
#
# Old logs live in $config{log_directory} and are named
# "$config{log_name}.<n>", with ".0" being the most recent one. Rotation
# removes old logs whose number is $config{keep_old_logs} - 1 or higher
# (nothing is removed when keep_old_logs is unset or 0), bumps the number
# of every remaining old log by one, moves the current logfile to ".0"
# and opens a fresh logfile.
#
# Returns 0 when logging is not configured, 1 otherwise (also when the
# log directory cannot be read, in which case nothing is rotated).
sub rotate_log {
	# Make sure logging is actually set up.
	return 0 unless $config{log_directory} and $config{log_name};

	my $current = "$config{log_directory}/$config{log_name}";

	# Delete files older than we want to keep, remember the others.
	# A lexical loop variable is used on purpose: a bare readdir in the
	# loop condition would overwrite the caller's $_, and before
	# perl 5.12 it would not set $_ at all.
	opendir(my $dirh, $config{log_directory}) or return 1;
	my @survivors;	# [ file name, number ] pairs
	while (defined(my $entry = readdir $dirh)) {
		next unless $entry =~ /^\Q$config{log_name}\E\.(\d+)$/;
		my $number = $1;
		if ($config{keep_old_logs}
				and $number >= $config{keep_old_logs} - 1) {
			unlink "$current.$number";
		} else {
			push @survivors, [ $entry, $number ];
		}
	}
	closedir $dirh;

	# Increment all the numbers by one. Highest number first, so that no
	# rename overwrites a file that still has to be moved.
	foreach my $old (sort { $b->[1] <=> $a->[1] } @survivors) {
		my ($name, $number) = @$old;
		rename("$config{log_directory}/$name",
			"$current." . ($number + 1));
	}

	# Close the logfile before moving it: some platforms (Windows among
	# them) refuse to rename a file that is still open.
	close_logfile();

	rename($current, "$current.0");

	# Open the new logfile.
	open_logfile();

	return 1;
}

# Sort comparator (reads the sort variables $a and $b): orders names
# numerically by their trailing ".<digits>" extension. A name without
# such an extension contributes undef to the comparison.
sub extension_sort {
	my ($left)  = $a =~ /\.(\d+)$/;
	my ($right) = $b =~ /\.(\d+)$/;

	return $left <=> $right;
}

# Write a log entry.
# The format for log entries is defined in @log_format. Each
# element in that array is an element for the log. Elements
# are tab-separated. Each element consists of the name of
# the element, a colon, and the contents. Tabs inside free-text
# values are replaced by spaces so they cannot break the format.
#
# Pass the rejection reason when the article was filtered. Call without
# an argument (or with a false one) to log an article that was accepted.
#
# Returns 0 without writing anything when logging is not active
# ($Do_Log is false); otherwise returns the result of the print.
#

# Possible elements for log entries (elements marked [*] are left out
# of the line when there is nothing to report):
#
# action - 'filter' or 'accept'
# localtime - current time in local timezone
# gmtime - current time in GMT
# time - current time in unix format
# reason - reason for rejection [*]
# mid - article's message-id
# groups - newsgroups the article was posted to
# groupcount - how many newsgroups the article was posted to
# lines - article's line count (not from the Lines header)
# bytes - article size
# pathtail - Path tail, second to last entry in the Path header [*]
# peer - first entry in the Path header (who sent us the article) [*]
# nntpph - NNTP-Posting-Host header [*]
# from - From header
# subject - Subject header [*]
# reader - either User-Agent, X-Newsreader, X-Mailer, X-Poster [*]
# date - Date header
# control - Control header [*]
# controltype - Type of control message, i.e. the first word of the
#               Control header [*]
# supersedes - Supersedes header [*]
# org - Organization header ('organization' is accepted too) [*]
# refcount - how many ID's in References header
# urls - list of urls in the body, first three only [*]
# binary - 'yes' if the article is a binary [*]
sub log_entry {
	return 0 unless $Do_Log;

	my ($reason) = @_;

	my $entry = '';
	foreach my $item (@log_format) {
		my $junk;

		if ($item eq 'action') {
			$entry .= 'action:'. ($reason ? 'filter' : 'accept') ."\t";
		} elsif ($item eq 'localtime') {
			$entry .= 'localtime:'. scalar localtime() ."\t";
		} elsif ($item eq 'gmtime') {
			$entry .= 'gmtime:'. scalar gmtime() ."\t";
		} elsif ($item eq 'time') {
			$entry .= "time:$now\t";
		} elsif ($item eq 'reason' and $reason) {
			($junk = $reason) =~ s/\t/ /g;
			$entry .= "reason:$junk\t";
		} elsif ($item eq 'mid') {
			$entry .= "mid:$hdr{'Message-ID'}\t";
		} elsif ($item eq 'groups') {
			$entry .= 'groups:'. join(',', @groups) ."\t";
		} elsif ($item eq 'groupcount') {
			$entry .= 'groupcount:'. (scalar @groups) ."\t";
		} elsif ($item eq 'lines') {
			$entry .= "lines:$lines\t";
		} elsif ($item eq 'bytes') {
			$entry .= "bytes:$bytes\t";
		} elsif ($item eq 'pathtail') {
			$junk = path_tail();
			$entry .= "pathtail:$junk\t" if defined $junk;
		} elsif ($item eq 'peer') {
			$junk = what_peer();
			$entry .= "peer:$junk\t" if defined $junk;
		} elsif ($item eq 'nntpph' and $hdr{'NNTP-Posting-Host'}) {
			($junk = $hdr{'NNTP-Posting-Host'}) =~ s/\t/ /g;
			$entry .= "nntpph:$junk\t";
		} elsif ($item eq 'from') {
			($junk = $hdr{From}) =~ s/\t/ /g;
			$entry .= "from:$junk\t";
		} elsif ($item eq 'subject' and $hdr{Subject}) {
			($junk = $hdr{Subject}) =~ s/\t/ /g;
			$entry .= "subject:$junk\t";
		} elsif ($item eq 'reader') {
			($junk = ($XReader || x_reader())) =~ s/\t/ /g;
			$entry .= "reader:$junk\t" if $junk;
		} elsif ($item eq 'date') {
			$entry .= "date:$hdr{Date}\t";
		} elsif ($item eq 'control' and $hdr{Control}) {
			($junk = $hdr{Control}) =~ s/\t/ /g;
			$entry .= "control:$junk\t";
		} elsif ($item eq 'controltype' and $hdr{Control}) {
			# The type is the whole first word ("cancel", "newgroup",
			# ...), not just a single character.
			$entry .= "controltype:$1\t" if $hdr{Control} =~ /^\s*(\w+)/;
		} elsif ($item eq 'supersedes' and $hdr{Supersedes}) {
			($junk = $hdr{Supersedes}) =~ s/\t/ /g;
			$entry .= "supersedes:$junk\t";
		} elsif (($item eq 'org' or $item eq 'organization')
				and $hdr{Organization}) {
			($junk = $hdr{Organization}) =~ s/\t/ /g;
			$entry .= "org:$junk\t";
		} elsif ($item eq 'refcount') {
			$entry .= 'refcount:' . reference_count() . "\t";
		} elsif ($item eq 'urls') {
			$junk = body_urls();
			# Keep the first three URLs (array indices 0 to 2).
			$junk = [ @$junk[0..2] ] if scalar @$junk > 3;
			$entry .= 'urls:' . join(' ', @$junk) . "\t" if scalar @$junk > 0;
		} elsif ($item eq 'binary') {
			$entry .= "binary:yes\t" if is_binary();
		} # what kind of entry?
	} # foreach format entry

	# Every element was appended with a trailing tab; drop the last one.
	$entry =~ s/\t+$//;
	$logfh->print("$entry\n");
}

# Open the logfile named by $Log_File for appending.
#
# The global handle $logfh is created on first use and reused afterwards.
# On success $Do_Log is set to 1 so that log_entry() starts writing; on
# failure it is set to 0 and the error is reported through slog().
#
# Returns 0 when no logfile is configured or it could not be opened,
# and 1 when the logfile is open (including when it already was).
sub open_logfile {
	return 0 unless $Log_File;

	$logfh = IO::File->new unless ref($logfh) eq 'IO::File';

	return 1 if $logfh->opened;	# logfile is already open (?)

	# Pass the mode separately so the file name is taken literally,
	# whatever characters it contains.
	if ($logfh->open($Log_File, '>>')) {
		$Do_Log = 1;
	} else {
		$Do_Log = 0;
		slog('E', "failed to open logfile $Log_File: $!");
	}

	return $Do_Log;
}

# Close the logfile and switch logging off.
# Returns 1 if the logfile was closed, 0 if logging was not active or
# the handle was not open (in which case nothing is changed).
sub close_logfile {
	if ($Do_Log and $logfh->opened) {
		$logfh->close;
		$Do_Log = 0;

		return 1;
	}

	return 0;
}

# Second-to-last entry in the Path header.
# Returns undef when the Path header has fewer than three entries.
sub path_tail {
	my ($tail) = $hdr{Path} =~ /!([^!]+)![^!]+$/;
	return $tail;
}

# First entry in the Path header (the peer who sent us this article).
# Returns undef when the Path header does not contain a "!" separator
# preceded by at least one character.
sub what_peer {
	my ($peer) = $hdr{Path} =~ /^([^!]+)!/;
	return $peer;
}

# Extract URLs from the article body and return a reference to a
# list of them (an empty list when nothing matches).
# The global $body is filled on demand with the lowercased first 50000
# characters of the body, and is reused as-is when already defined.
sub body_urls {
	unless (defined $body) {
		$body = lc substr($hdr{__BODY__}, 0, 50000);
	}

	return [ $body =~ /$fullURL/og ];
}

# Number of entries in the References header.
# Returns 0 when the header is missing or empty, otherwise the number of
# "<...>" message-ids found in it.
sub reference_count {
	return 0 if not $hdr{References};

	# Match in list context and count the results: in scalar context
	# m//g only reports whether there was a match (and leaves pos() set
	# on the header), it does not count the matches.
	my $count = () = $hdr{References} =~ /<[^>]*>/g;
	return $count;
}

# Never executed: the script calls "exit 0" right after mainloop() near
# the top of this file. Presumably these globals are mentioned here a
# second time only to keep perl's "used only once" warnings quiet.
print $MODE.$fullURL.$XReader.$now.$lines; # lint food

1;


# syntax highlighted by Code2HTML, v. 0.9.1