#!/usr/bin/perl
# Platform switch: change this to 1 when running on Windows.
$WINDOWS = 0;
# $WINDOWS = 1;

# Names of the fields written to each log line, in output order.
# XXX should be configurable somewhere
@log_format = qw(action mid reason groups bytes binary);

# Operating mode; set before the filter code is loaded.
# NOTE(review): presumably read by ./cleanfeed -- confirm there.
$MODE = 'highwind';

# Load the filter code; loading it also runs get_config() and some other
# initialization code.
require './cleanfeed'; # XXX

# Signal handling: each handler only raises a flag, and the main loop acts
# on the flags between articles.
$got_usr1 = $got_usr2 = $got_hup = $got_term = 0;
$SIG{USR1} = sub { $got_usr1 = 1 };    # SIGUSR1: write the stats file
$SIG{USR2} = sub { $got_usr2 = 1 };    # SIGUSR2: dump the EMP history
$SIG{HUP}  = sub { $got_hup  = 1 };    # SIGHUP:  reload the configuration
$SIG{TERM} = sub { $got_term = 1 };    # SIGTERM: leave the loop and clean up

# Refuse to start if a pid file already exists, otherwise create it.
pid_file();

# Process articles until input ends or SIGTERM is seen, then exit.
mainloop();
exit 0;
# main loop for standalone mode.
sub mainloop {
    # Read articles from STDIN one at a time, pass each one to filter_art()
    # and answer on STDOUT with "435" (rejected) or "335" (accepted).
    # Signal flags are acted on between articles.  When input ends, or once
    # SIGTERM has been seen, the logfile is closed and the pid file removed.
    my ($article, $header_text);

    $| = 1;                         # flush STDOUT after every write
    binmode STDIN if $WINDOWS;
    open_logfile();

    # Canonical spelling of the headers we know about, keyed by the
    # lowercased header name.
    my %known_headers = map { ( lc($_), $_ ) } qw(
        Approved
        Content-Base
        Content-Disposition
        Content-Type
        Control
        Date
        Distribution
        Followup-To
        From
        Lines
        Message-ID
        Newsgroups
        NNTP-Posting-Host
        Organization
        Path
        References
        Reply-To
        Sender
        Subject
        Supersedes
        User-Agent
        X-Trace
        X-Newsreader
        X-Newsposter
        X-Mailer
        X-Poster
        X-Cancelled-By
        X-Canceled-By
    );

    # Use the end-of-article marker as the record separator so that a single
    # read returns a whole article.
    $/ = "\r\n.\r\n";
    %hdr = ();

    while (defined($article = <STDIN>)) {
        $article =~ s/\r\n/\n/g;
        $bytes = length $article;   # article size, measured after CRLF -> LF

        # The first empty line separates the header from the body.
        ($header_text, $hdr{__BODY__}) = split(/\n\n/, $article, 2);
        $header_text =~ s/\n\s+/ /g;    # join continuation lines

        # Store every "Name: value" line in %hdr, using the canonical
        # spelling of the name when it is a known header.
        foreach my $line (split /\n/, $header_text) {
            next unless $line =~ /^([^ ]+): (.*)$/;
            my ($name, $value) = ($1, $2);
            my $folded = lc $name;
            my $key = (exists $known_headers{$folded})
                ? $known_headers{$folded}
                : $name;
            $hdr{$key} = $value;
        }

        # The real work: a true return value is the reason for rejection.
        my $verdict = filter_art();
        if (not $verdict) {
            print "335\r\n";        # accepted
            log_entry() if $config{log_accepts};
        } else {
            print "435\r\n";        # rejected
            log_entry($verdict);
        }
        %hdr = ();

        # SIGHUP: reload the configuration.
        re_configure() if $got_hup;

        # SIGUSR1: write the stats file.
        if ($got_usr1) {
            writestats(1);
            $got_usr1 = 0;
        }

        # SIGTERM: stop reading and fall through to the cleanup below.
        last if $got_term;

        # SIGUSR2: dump the EMP histories.
        if ($got_usr2) {
            dump_emp();
            $got_usr2 = 0;
        }
    }

    # Cleanup
    close_logfile();
    dump_emp() if $config{do_emp_dump};
    unlink $config{pid_file} if $config{pid_file};
}
# When running standalone, HUP brings us here.
# Reload config, close logfiles and reopen if we still want them.
sub re_configure {
# Clear the flag raised by the SIGHUP handler; the main loop calls this sub
# when that flag is set.
$got_hup = 0;
# Re-read the configuration and redo the dependent setup.
# NOTE(review): get_config() and setup_stuff() are defined outside this
# file; what exactly they reset is not visible here.
get_config();
setup_stuff();
# Close the logfile.
close_logfile();
# If we still want logfiles, open again.
# open_logfile() returns without opening anything when $Log_File is unset.
open_logfile();
}
# Create a pid file. If it already exists, complain and die.
sub pid_file {
    # Write our process id to the file named by $config{pid_file}.
    #
    # Returns undef straight away when no pid file is configured.
    # Dies when the file already exists.
    # Only warns when the file cannot be created or written, so a bad
    # pid_file setting does not stop the filter from running.
    return undef unless $config{pid_file};
    die "Cleanfeed already running (pid file)\n" if -e $config{pid_file};

    # Three-argument open keeps the configured name from being parsed as
    # part of the open mode, and the lexical handle cannot collide with a
    # PIDFILE handle used elsewhere.
    my $pid_fh;
    if (open($pid_fh, '>', $config{pid_file})) {
        print {$pid_fh} "$$\n";
        # Buffered write errors (full disk, ...) only show up at close.
        close($pid_fh)
            or warn "cleanfeed can't write pid file: $!\n";
    } else {
        warn "cleanfeed can't create pid file: $!\n";
    }
}
##############################################################################
# logging
##############################################################################
# Rotate the logfile
sub rotate_log {
    # Rotate the logfile: NAME becomes NAME.0, NAME.0 becomes NAME.1, and so
    # on.  When $config{keep_old_logs} is set, numbered files that would end
    # up at or beyond that count are deleted instead of renamed.
    #
    # Returns 0 when logging is not configured.  Returns 1 otherwise, also
    # when the log directory cannot be read (nothing is rotated then).

    # Make sure logging is actually set up.
    return 0 unless $config{log_directory} and $config{log_name};

    my $base = "$config{log_directory}/$config{log_name}";

    # Collect the numeric suffixes of the old logs we keep, deleting the
    # ones that are too old.  The entry is read into a lexical: relying on
    # "while (readdir DIR)" to fill $_ needs perl 5.12 and overwrites the
    # caller's $_.
    opendir(my $dir_handle, $config{log_directory}) or return 1;
    my @kept_numbers;
    while (defined(my $file = readdir $dir_handle)) {
        next unless $file =~ /^\Q$config{log_name}\E\.(\d+)$/;
        my $number = $1;
        if ($config{keep_old_logs}
            and $number >= $config{keep_old_logs} - 1) {
            unlink "$base.$number";
        } else {
            push @kept_numbers, $number;
        }
    }
    closedir $dir_handle;

    # Close the logfile before moving it: renaming a file that is still
    # open fails on Windows.
    close_logfile();

    # Increment all the numbers by one, highest first so nothing is
    # overwritten, then move the main logfile to .0
    foreach my $number (sort { $b <=> $a } @kept_numbers) {
        rename("$base.$number", "$base." . ($number + 1));
    }
    rename($base, "$base.0");

    # Open the new logfile.
    open_logfile();
    return 1;
}
sub extension_sort {
    # Sort comparator: orders two file names ($a and $b, as set by sort)
    # by the number that follows their last dot, ascending.
    my ($left)  = $a =~ /\.(\d+)$/;
    my ($right) = $b =~ /\.(\d+)$/;
    return $left <=> $right;
}
# Write a log entry.
# The format for log entries is defined in @log_format. Each
# element in that array is an element for the log. Elements
# are tab-separated. Each element consists of the name of
# the element, a colon, and the contents.
#
# Pass the rejection reason as the argument. Call with no argument to log
# an article that is being accepted.
#
# Possible elements for log entries:
#
# action - 'filter' or 'accept'
# localtime - current time in local timezone
# gmtime - current time in GMT
# time - current time in unix format
# reason - reason for rejection
# mid - article's message-id
# groups - newsgroups the article was posted to
# groupcount - how many newsgroups the article was posted to
# lines - article's line count (not from the Lines header)
# bytes - article size
# pathtail - Path tail, second to last entry in the Path header
# peer - first entry in the Path header (who sent us the article)
# nntpph - NNTP-Posting-Host header
# from - From header
# subject - Subject header
# reader - either User-Agent, X-Newsreader, X-Mailer, X-Poster
# date - Date header
# control - Control header
# controltype - Type of control message
# supersedes - Supersedes header
# org - Organization header
# refcount - how many ID's in References header
# urls - list of urls in the body, first three only
# binary - 'yes' if the article is a binary
sub log_entry {
    # Write one line to the logfile, made of the fields listed in
    # @log_format, each as "name:value", separated by tabs.
    #
    # Argument: the reason the article was rejected; omit it (or pass a
    # false value) when logging an accepted article.
    # Returns 0 without writing when logging is off ($Do_Log false),
    # otherwise the result of the print on $logfh.
    return 0 unless $Do_Log;
    my ($reason) = @_;

    # Tabs separate the fields, so a tab inside a value becomes a space.
    my $clean = sub {
        my ($text) = @_;
        $text =~ s/\t/ /g;
        return $text;
    };

    my @fields;
    foreach my $item (@log_format) {
        if ($item eq 'action') {
            push @fields, 'action:' . ($reason ? 'filter' : 'accept');
        } elsif ($item eq 'localtime') {
            push @fields, 'localtime:' . scalar(localtime());
        } elsif ($item eq 'gmtime') {
            push @fields, 'gmtime:' . scalar(gmtime());
        } elsif ($item eq 'time') {
            push @fields, "time:$now";
        } elsif ($item eq 'reason' and $reason) {
            push @fields, 'reason:' . $clean->($reason);
        } elsif ($item eq 'mid') {
            # Sanitised like the other header fields, so a tab in the
            # header cannot start a bogus log field.
            push @fields, 'mid:' . $clean->($hdr{'Message-ID'});
        } elsif ($item eq 'groups') {
            push @fields, 'groups:' . join(',', @groups);
        } elsif ($item eq 'groupcount') {
            push @fields, 'groupcount:' . scalar(@groups);
        } elsif ($item eq 'lines') {
            push @fields, "lines:$lines";
        } elsif ($item eq 'bytes') {
            push @fields, "bytes:$bytes";
        } elsif ($item eq 'pathtail') {
            my $tail = path_tail();
            push @fields, "pathtail:$tail" if defined $tail;
        } elsif ($item eq 'peer') {
            my $peer = what_peer();
            push @fields, "peer:$peer" if defined $peer;
        } elsif ($item eq 'nntpph' and $hdr{'NNTP-Posting-Host'}) {
            push @fields, 'nntpph:' . $clean->($hdr{'NNTP-Posting-Host'});
        } elsif ($item eq 'from') {
            push @fields, 'from:' . $clean->($hdr{From});
        } elsif ($item eq 'subject' and $hdr{Subject}) {
            push @fields, 'subject:' . $clean->($hdr{Subject});
        } elsif ($item eq 'reader') {
            my $reader = $clean->($XReader || x_reader());
            push @fields, "reader:$reader" if $reader;
        } elsif ($item eq 'date') {
            push @fields, 'date:' . $clean->($hdr{Date});
        } elsif ($item eq 'control' and $hdr{Control}) {
            push @fields, 'control:' . $clean->($hdr{Control});
        } elsif ($item eq 'controltype' and $hdr{Control}) {
            # The type is the first word of the Control header.  (\w+ is
            # needed here: a single \w followed by whitespace only matches
            # one-letter words, so "cancel" and friends were never logged.)
            push @fields, "controltype:$1" if $hdr{Control} =~ /^\s*(\w+)/;
        } elsif ($item eq 'supersedes' and $hdr{Supersedes}) {
            push @fields, 'supersedes:' . $clean->($hdr{Supersedes});
        } elsif (($item eq 'org' or $item eq 'organization')
                 and $hdr{Organization}) {
            push @fields, 'org:' . $clean->($hdr{Organization});
        } elsif ($item eq 'refcount') {
            push @fields, 'refcount:' . reference_count();
        } elsif ($item eq 'urls') {
            # First three URLs only.  The slice starts at index 0; starting
            # at 1 would drop the first URL of a long list.
            my $found = body_urls();
            my @first = (@$found > 3) ? @$found[0 .. 2] : @$found;
            push @fields, 'urls:' . join(' ', @first) if @first;
        } elsif ($item eq 'binary') {
            push @fields, 'binary:yes' if is_binary();
        } # what kind of entry?
    } # foreach format entry

    my $entry = join("\t", @fields);
    $entry =~ s/\t+$//;             # no trailing tabs on the line
    $logfh->print("$entry\n");
}
sub open_logfile {
    # Open $Log_File for appending on the global handle $logfh and switch
    # logging on ($Do_Log).
    #
    # Returns 0 when no logfile is configured or the open fails (the
    # failure is reported through slog), 1 when the log is open.
    return 0 unless $Log_File;

    # Direct method call instead of the indirect "new IO::File" form.
    $logfh = IO::File->new unless ref $logfh eq 'IO::File';
    return 1 if $logfh->opened;     # logfile is already open (?)

    # Pass the mode separately so that nothing in the file name can be
    # taken as part of the open mode.
    if ($logfh->open($Log_File, '>>')) {
        $Do_Log = 1;
    } else {
        $Do_Log = 0;
        slog('E', "failed to open logfile $Log_File: $!");
    }
    return $Do_Log;
}
sub close_logfile {
    # Close the logfile and switch logging off.
    # Returns 1 when a log was closed, 0 when logging was already off or
    # the handle was not open.
    if ($Do_Log and $logfh->opened) {
        $logfh->close;
        $Do_Log = 0;
        return 1;
    }
    return 0;
}
# Second-to-last entry in the Path header.
sub path_tail {
    # Returns the second-to-last "!"-separated entry of $hdr{Path}, or
    # undef when the header does not have that shape.
    my ($tail) = $hdr{Path} =~ /!([^!]+)![^!]+$/;
    return $tail;
}
# First entry in the Path header (the peer who sent us this article).
sub what_peer {
    # Returns everything before the first "!" in $hdr{Path}, or undef when
    # the header has no "!" or starts with one.
    my ($peer) = $hdr{Path} =~ /^([^!]+)!/;
    return $peer;
}
# Extract URLs from the article body and return a reference to a
# list of them.
sub body_urls {
    # Returns a reference to the list of matches of $fullURL in the
    # lowercased first 50000 characters of the article body.
    # The lowercased text is kept in the global $body and only computed
    # when $body is not already defined.
    if (not defined $body) {
        $body = lc substr($hdr{__BODY__}, 0, 50000);
    }
    return [ $body =~ /$fullURL/og ];
}
# Number of entries in the References header
sub reference_count {
    # Returns the number of <...> message-ids in $hdr{References}, or 0
    # when the header is missing or empty.
    return 0 if not $hdr{References};

    # Assigning the match list to () and then to a scalar yields the number
    # of matches.  A //g match evaluated directly in scalar context only
    # says whether the next match succeeded, so it never counted above 1.
    my $count = () = $hdr{References} =~ /<[^>]*>\s*/g;
    return $count;
}
# Mentions these globals one more time ("lint food"), presumably to keep
# perl from warning that they are used only once -- confirm.
# The script calls "exit 0" before control reaches this statement, so it
# does not print anything when the file is run as a program.
print $MODE.$fullURL.$XReader.$now.$lines; # lint food
# End the file with a true value.
1;
# Syntax highlighted by Code2HTML, v. 0.9.1