#!/usr/bin/perl -w -T
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
my $PREFIX = '@@PREFIX@@'; # substituted at 'make' time
my $DEF_RULES_DIR = '@@DEF_RULES_DIR@@'; # substituted at 'make' time
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@'; # substituted at 'make' time
my $LOCAL_STATE_DIR = '@@LOCAL_STATE_DIR@@'; # substituted at 'make' time
use lib '@@INSTALLSITELIB@@'; # substituted at 'make' time
use File::Spec;
# Compile-time @INC setup: decide whether to load Mail::SpamAssassin from a
# directory relative to this script instead of the installed site library.
BEGIN { # see comments in "spamassassin.raw" for doco
# Split $0 into (volume, directories, file); $bin becomes the directory part
# of the script path, or the current directory when $0 has no directory.
my @bin = File::Spec->splitpath($0);
my $bin = ($bin[0] ? File::Spec->catpath(@bin[0..1]) : $bin[1])
|| File::Spec->curdir;
# Only search relative paths when a lib/ copy sits next to the script, or
# when the installed site library does not contain Mail/SpamAssassin.pm.
if (-e $bin.'/lib/Mail/SpamAssassin.pm'
|| !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
{
my $searchrelative;
# The trailing marker on the next line is part of the build process (per
# its own comment, the assignment is disabled during "make install").
$searchrelative = 1; # disabled during "make install": REMOVEFORINST
# Script run from a direct subdirectory of a build tree: use ../blib/lib.
if ($searchrelative && $bin eq '../' && -e '../blib/lib/Mail/SpamAssassin.pm')
{
unshift ( @INC, '../blib/lib' );
} else {
# Otherwise probe a fixed list of directories relative to $bin and put the
# first one that contains Mail/SpamAssassin.pm at the front of @INC.
foreach ( qw(lib ../lib/site_perl
../lib/spamassassin ../share/spamassassin/lib))
{
my $dir = File::Spec->catdir( $bin, split ( '/', $_ ) );
if ( -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" ) )
{ unshift ( @INC, $dir ); last; }
}
}
}
}
use strict;
use warnings;
use Mail::SpamAssassin;
use Getopt::Long;
use File::Copy;
use File::Path;
use Pod::Usage;
use Data::Dumper;
use vars qw( %opt );
# The script runs with -T (see the shebang line). These two helpers live in
# Mail::SpamAssassin::Util and are not visible here; presumably they sanitise
# PATH and untaint the environment so that external commands can be run
# later -- TODO confirm against that module.
Mail::SpamAssassin::Util::clean_path_in_taint_mode();
Mail::SpamAssassin::Util::untaint_var( \%ENV );
##############################################################################
# testing purposes only
# When $fixup_re_test is true, fixup_re() prints its INPUT and OUTPUT strings.
# Uncomment one of the lines below to exercise fixup_re() on a sample and stop.
my $fixup_re_test;
#$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
#$fixup_re_test = 1; fixup_re("\\33\$b"); die;
#$fixup_re_test = 1; fixup_re("[link]"); die;
#$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
##############################################################################
# Command-line parsing. Options are case-sensitive, may be bundled and may be
# interleaved with other arguments; abbreviations are not accepted.
Getopt::Long::Configure(
    'bundling',
    'no_getopt_compat',
    'permute',
    'no_auto_abbrev',
    'no_ignore_case',
);

my $options_ok = GetOptions(
    'list'                                     => \$opt{list},
    'sudo'                                     => \$opt{sudo},
    'keep-tmps'                                => \$opt{'keep-tmps'},
    'configpath|config-file|config-dir|c|C=s' => \$opt{configpath},
    'prefspath|prefs-file|p=s'                 => \$opt{prefspath},
    'siteconfigpath=s'                         => \$opt{siteconfigpath},
    'updatedir=s'                              => \$opt{updatedir},
    'cf=s'                                     => \@{$opt{'cf'}},
    'debug|D:s'                                => \$opt{debug},
    'help|h|?'                                 => \$opt{help},
    'version|V'                                => \$opt{version},
);

# Both of these end the program inside usage().
usage( 0, "Unknown option!" ) unless $options_ok;
usage( 0, "For more information read the manual page" ) if defined $opt{help};

# --version: report the library version and stop successfully.
if ( defined $opt{version} ) {
    print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
    exit 0;
}
# usage($exitval, $message)
#
# Print $message followed by the brief usage text taken from this script's
# POD, then terminate the program (pod2usage() exits with -exitval).
#
#   $exitval - requested exit status. Any false value, including an explicit
#              0, is replaced by 64 (EX_USAGE), so this never exits with 0.
#   $message - text shown before the usage summary.
sub usage {
    my ( $exitval, $message ) = @_;

    $exitval ||= 64;

    # An "if ($exitval == 0)" branch used to follow here. It could never run
    # after the defaulting above, and it called print_version(), which this
    # script does not define, so it has been dropped.

    pod2usage(
        -verbose => 0,
        -message => $message,
        -exitval => $exitval,
    );
}
# set debug areas, if any specified (only useful for command-line tools)
# "-D" without a value leaves an empty string, which is promoted to 'all'.
if (defined $opt{'debug'}) {
    $opt{'debug'} ||= 'all';
}
# at least info
$opt{'debug'} ||= 'info';

# ensure the body-rule base extractor plugin is loaded, we use that;
# any --cf lines are appended after it, one per line
my $post_config = q(
loadplugin Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor
).join("\n", @{$opt{'cf'}})."\n";

# Direct method call rather than indirect object syntax ("new Class(...)"),
# which is parsed heuristically and can be mis-resolved.
my $spamtest = Mail::SpamAssassin->new(
    {
        rules_filename      => $opt{'configpath'},
        site_rules_filename => $opt{'siteconfigpath'},
        userprefs_filename  => $opt{'prefspath'},
        debug               => $opt{'debug'},
        local_tests_only    => 1,
        dont_copy_prefs     => 1,
        PREFIX              => $PREFIX,
        DEF_RULES_DIR       => $DEF_RULES_DIR,
        LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
        LOCAL_STATE_DIR     => $LOCAL_STATE_DIR,
        post_config_text    => $post_config,
    }
);

# appropriate BodyRuleBaseExtractor settings for rule2xs usage
$spamtest->{base_extract} = 1;
$spamtest->{bases_must_be_casei} = 1;
$spamtest->{bases_can_use_alternations} = 0;
$spamtest->{bases_can_use_quantifiers} = 0;
$spamtest->{bases_can_use_char_classes} = 0;
$spamtest->{bases_split_out_alternations} = 1;

# Destination for the compiled modules: --updatedir if given, otherwise the
# versioned "compiled" directory under the local state dir.
if (defined $opt{'updatedir'}) {
    $opt{'updatedir'} = Mail::SpamAssassin::Util::untaint_file_path($opt{'updatedir'});
}
else {
    $opt{'updatedir'} = $spamtest->sed_path('__local_state_dir__/compiled/__version__');
}
my $installdir = $opt{'updatedir'};

# Fail early when we are going to install (not --list), are not using sudo,
# and the existing install directory is not writable.
if ((!defined $opt{'list'})
    && !$opt{'sudo'}
    && -d $installdir && !-w $installdir)
{
    die "sa-compile: cannot write to $installdir, aborting\n";
}

$spamtest->init(1);
my $conf = $spamtest->{conf};

# this actually extracts the base rules in the plugin, as a side-effect
my $res = $spamtest->lint_rules();
if ($res) {
    die "sa-compile: not compiling; 'spamassassin --lint' check failed!\n";
}

if ( defined $opt{'list'} ) {
    foreach my $ruletype (sort keys %{$conf->{base_orig}}) {
        print dump_base_strings($ruletype);
    }
}
else {
    compile_base_strings();
}

$spamtest->finish();
exit;
##############################################################################
# dump_base_strings($ruletype)
#
# Render one rule type from $conf as text: a "name" line, then one "orig"
# line per original rule, then one "r" line per extracted base string, each
# group sorted by key. Returns the whole text as a single string.
sub dump_base_strings {
    my ($ruletype) = @_;

    my @orig_lines =
        map { 'orig ' . $_ . ' ' . $conf->{base_orig}->{$ruletype}->{$_} . "\n" }
        sort keys %{$conf->{base_orig}->{$ruletype}};

    my @base_lines =
        map { 'r ' . $_ . ':' . $conf->{base_string}->{$ruletype}->{$_} . "\n" }
        sort keys %{$conf->{base_string}->{$ruletype}};

    return join('', "name $ruletype\n", @orig_lines, @base_lines);
}
##############################################################################
# dump_as_perl($ruletype)
#
# Serialise the same data as dump_base_strings() with Data::Dumper, as Perl
# source assigning a hash reference to a variable named $bases. Returns the
# generated source as one string.
sub dump_as_perl {
    my ($ruletype) = @_;

    my %bases_for_type;
    $bases_for_type{name}        = $ruletype;
    $bases_for_type{base_orig}   = $conf->{base_orig}->{$ruletype};
    $bases_for_type{base_string} = $conf->{base_string}->{$ruletype};

    # Assign to a scalar first so Dump() is called in scalar context.
    my $perl_source = Data::Dumper->Dump([ \%bases_for_type ], [ 'bases' ]);
    return $perl_source;
}
##############################################################################
# compile_base_strings()
#
# For every rule type in $conf->{base_orig}: write the base strings to a
# "bases_<type>.in" file in a fresh temporary directory, turn that into an XS
# module with rule2xs(), build and install it into $installdir, then write
# and copy the matching "bases_<type>.pl" dump. The temporary directory is
# removed afterwards unless --keep-tmps was given.
#
# Install, copy and cleanup are prefixed with "sudo" when --sudo was given.
# Dies if any step fails.
sub compile_base_strings {
    my $dirpath = Mail::SpamAssassin::Util::secure_tmpdir();
    die "secure_tmpdir failed" unless $dirpath && -w $dirpath;

    # Commands are passed to run() as lists so that no shell is involved and
    # paths containing spaces or shell metacharacters are handled correctly.
    my @sudo = $opt{sudo} ? ('sudo') : ();

    foreach my $ruletype (sort keys %{$conf->{base_orig}})
    {
        # create the bases.in file
        my $basespath = "bases_$ruletype.in";
        $basespath =~ s/[^A-Za-z0-9_\.]/_/gs;

        open(my $bases_fh, '>', "$dirpath/$basespath")
            or die "cannot write to $dirpath/$basespath: $!";
        print {$bases_fh} dump_base_strings($ruletype);
        close($bases_fh)
            or die "cannot write to $dirpath/$basespath: $!";

        # compile it...
        chdir($dirpath) or die "cannot chdir to $dirpath: $!";
        print "cd $dirpath\n";

        # rule2xs() leaves us inside the generated module directory, which is
        # where the Makefile.PL / make steps below have to run.
        rule2xs($basespath);

        run(get_perl(), 'Makefile.PL',
            "PREFIX=$dirpath/ignored", "INSTALLSITEARCH=$installdir");
        run('make');
        run(@sudo, 'make', 'install');    # into $installdir

        # and generate the bases.pl file, for perl consumers
        my $plpath = "bases_$ruletype.pl";
        $plpath =~ s/[^A-Za-z0-9_\.]/_/gs;

        open(my $pl_fh, '>', "$dirpath/$plpath")
            or die "cannot write to $dirpath/$plpath: $!";
        print {$pl_fh} dump_as_perl($ruletype);
        close($pl_fh)
            or die "cannot write to $dirpath/$plpath: $!";

        run(@sudo, 'cp', "$dirpath/$plpath", "$installdir/$plpath");
    }

    if (!$opt{'keep-tmps'}) {
        # Best effort only: step out of the directory before deleting it.
        chdir '/'; print "cd /\n";    # saves trouble on MacOS, possibly
        run(@sudo, 'rm', '-rf', $dirpath);    # cleanup
    }
    else {
        print "temporary dir left due to --keep-tmps: $dirpath\n";
    }
}
# run(@cmd)
#
# Echo the command to STDOUT, execute it with system(), and die with
# "command failed!" unless it completed successfully.
#
# The whole of $? is checked, not just the exit code in its high byte: a
# command killed by a signal leaves the exit-code byte at 0, and a command
# that could not be started sets $? to -1. Both count as failures.
sub run {
    my @cmd = @_;
    print join(' ',@cmd)."\n";
    system(@cmd);
    $? == 0 or die "command failed!";
    return;
}
# get_perl()
#
# Return the path of the perl interpreter to use for child processes.
# An absolute $^X is used as-is; otherwise the last path component of
# $Config{perlpath} is replaced with $^X. The result is passed through a
# capturing match before being returned, which untaints it.
sub get_perl {
    use Config;

    my $perl = $^X;
    unless ($perl =~ m|^/|) {
        my $interpreter_name = $perl;
        $perl = $Config{perlpath};
        $perl =~ s|/[^/]*$|/$interpreter_name|;
    }

    $perl =~ /^(.*)$/;
    return $1;
}
##############################################################################
# Upper bound on the number of base-string rules written to one re2c scanner
# source file; further rules go to the next scannerN.re file.
use constant MAX_RULES_PER_C_FILE => 200;

# rule2xs($FILE)
#
# Turn a "bases" file (as written by dump_base_strings) into the sources of
# an XS module named Mail::SpamAssassin::CompiledRegexps::<ruleset>.
#
#   $FILE - path of the bases file, relative to the current directory.
#
# The file is read through sort(1), so the single "name" line comes first,
# followed by the "orig" lines and then the "r" lines. The sub:
#   * recreates a directory named after the module and chdir()s into it
#     (it deliberately stays there: the caller runs the build in that dir);
#   * writes scannerN.re files and runs re2c on each;
#   * writes Makefile.PL, MANIFEST.SKIP, the .xs file and the .pm file.
#
# Dies on any I/O failure, when the "name" line is missing, when a line is
# neither "orig ..." nor "r REGEXP:REASON", or when re2c fails. A base
# string that fixup_re() rejects is skipped with a warning.
sub rule2xs {
    my $FILE  = shift;
    my $force = 1;    # always remove and recreate the module directory

    # List-form pipe open: no shell is involved in running sort.
    open(my $fh, '-|', 'sort', $FILE) or die "open($FILE): $!";

    # read ruleset name from the first line in the file
    my $first_line = <$fh>;
    my $ruleset_name;
    if (defined $first_line && $first_line =~ /^name\s+(\S+)/) {
        $ruleset_name = $1;
    }
    defined $ruleset_name
        or die "no 'name RULESET' line at the start of sorted $FILE\n";

    my $modname = "Mail::SpamAssassin::CompiledRegexps::$ruleset_name";

    (my $PATH = $modname) =~ s/::/-/g;
    (my $PMFILE = $modname) =~ s/.*:://;
    $PMFILE .= ".pm";
    (my $XSFILE = $PMFILE) =~ s/\.pm$/.xs/;

    $force and rmtree $PATH;
    mkdir $PATH or (!$force and die "mkdir($PATH): $!");
    chdir($PATH) or die "chdir($PATH): $!";
    print "cd $PATH\n";

    # Prefix for the generated C function names. Every run of characters that
    # is not valid in a C identifier collapses to one underscore; existing
    # underscores are kept as they are.
    (my $cprefix = $modname) =~ s/[^A-Za-z0-9_]+/_/gs;

    my $numscans  = 0;
    my $has_rules = '';

    while (!eof($fh)) {
        $numscans++;
        open(my $re, '>', "scanner${numscans}.re")
            or die "open(>scanner${numscans}.re): $!";

        print $re <<EOT;
#define NULL ((char*) 0)
#define YYCTYPE unsigned char
#define YYCURSOR *p
#define YYLIMIT *p
#define YYMARKER q
#define YYFILL(n)
/* backtrack to return other, semi-overlapped tokens; e.g.
allow "abcdef" to return both "abc" and "cde" as tokens */
#define RET(x) { YYCURSOR = YYMARKER; return (x); }
EOT
        print $re <<EOT;
char *${cprefix}_scan${numscans}(unsigned char **p){
unsigned char *q;
/*!re2c
EOT

        my $rules_written = 0;
        while (defined(my $entry = <$fh>)) {
            next if $entry =~ /^#/;

            # "orig NAME REGEXP" lines are only recorded for the .pm file's
            # $HAS_RULES table; '#' is replaced because q#...# quotes them.
            if ($entry =~ /^orig\s+(\S+)\s+(.*)$/) {
                my $name = $1;
                my $regexp = $2;
                $name =~ s/#/[hash]/gs;
                $regexp =~ s/#/[hash]/gs;
                $has_rules .= " q#$name# => q#$regexp#,\n";
                next;
            }

            my ($regexp, $reason) = $entry =~ /^r (.*):(.*)$/;
            die "no 'r REGEXP:REASON' in $entry" unless defined $regexp;

            # fixup_re() may die on an unusable string; nothing is printed
            # in that case because the arguments are evaluated first.
            eval {
                print $re "\t", fixup_re($regexp), " {RET(\"$reason\");}\n";
                $rules_written++;
            };
            $@ and handle_fixup_error($@, $regexp, $reason);

            last if $rules_written == MAX_RULES_PER_C_FILE;
        }

        print $re <<EOT;
[\\000-\\377] { return NULL; }
*/
}
EOT
        close($re) or die "write scanner${numscans}.re: $!";
    }

    # Closing the pipe also collects sort's exit status.
    close($fh)
        or die "sort $FILE failed: " . ($! ? $! : "exit status $?") . "\n";

    for my $scan_no (1 .. $numscans) {
        my $cmd = "re2c -i -b -o scanner$scan_no.c scanner$scan_no.re";

        # this must be fatal; it can result in corrupt output modules missing
        # scannerN() functions. run() dies on failure, so trap that here in
        # order to give the more helpful message.
        eval { run($cmd); 1 } or do {
            my $cwd = `pwd`;
            chomp $cwd;
            die "'$cmd' failed, dying!\n".
                "Have you got a sufficiently-recent version of re2c?\n".
                "see $cwd/scanner$scan_no.re\n";
        };
    }

    open(my $makefile_fh, '>', 'Makefile.PL') or die "write Makefile.PL: $!";
    print $makefile_fh <<"EOT";
use ExtUtils::MakeMaker;
WriteMakefile(
'NAME' => '$modname',
'VERSION_FROM' => '$PMFILE',
'ABSTRACT_FROM' => '$PMFILE',
'OBJECT' => '\$(O_FILES)',
'OPTIMIZE' => '-O2',
'AUTHOR' => 'A. U. Tomated <automated\@example.com>',
);
EOT
    close($makefile_fh) or die "write Makefile.PL: $!";

    open(my $skip_fh, '>', 'MANIFEST.SKIP') or die "write MANIFEST.SKIP: $!";
    print $skip_fh <<'EOT';
CVS/.*
\.bak$
\.sw[a-z]$
\.tar$
\.tgz$
\.tar\.gz$
\.o$
\.xsi$
\.bs$
^.#
^tmp/
^blib/
^Makefile$
^Makefile\.[a-z]+$
^pm_to_blib$
~$
EOT
    close($skip_fh) or die "write MANIFEST.SKIP: $!";

    open(my $xs_fh, '>', $XSFILE) or die "write $XSFILE: $!";
    print $xs_fh <<"EOT";
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
/* split single-space-separated result string */
static void
split_and_add (AV *results, char *match)
{
char *wordstart, *cp;
for (cp = wordstart = match; *cp != (unsigned char) 0; cp++) {
if (*cp == ' ') {
av_push(results,
newSVpvn_share(wordstart, cp-wordstart, (U32)0));
wordstart = cp + 1;
}
}
av_push(results,
newSVpvn_share(wordstart, cp-wordstart, (U32)0));
}
EOT

    # use a buffer string here instead of writing direct to the file,
    # so we can prepend 'extern' statements (bug 5534)
    my $xscode = <<"EOT";
MODULE = $modname PACKAGE = $modname
PROTOTYPES: DISABLE
SV *
scan(psv)
SV* psv
PREINIT:
int i;
char *match;
unsigned char *cursor;
unsigned char *pstart;
unsigned char *pend;
STRLEN plen;
AV *results;
CODE:
pstart = (unsigned char *) SvPVutf8(psv, plen);
pend = pstart + plen;
results = (AV *) sv_2mortal((SV *) newAV());
EOT

    for my $scan_no (1 .. $numscans) {
        my $funcname = $cprefix."_scan".$scan_no;

        # declaration that goes in front of everything generated so far
        my $extern_decl = qq{
extern char *${funcname} (unsigned char **);
};
        # scanning loop that goes at the end of the CODE: section
        my $scan_loop = qq{
cursor = pstart;
while (cursor < pend) {
while (match = ${funcname} (\&cursor)) {
split_and_add(results, match);
}
}
};
        $xscode = $extern_decl . $xscode . $scan_loop;
    }

    print $xs_fh $xscode;
    print $xs_fh <<EOT;
RETVAL = newRV((SV *) results);
OUTPUT:
RETVAL
EOT
    close($xs_fh) or die "write $XSFILE: $!";

    # The "fnord" prefixes keep the POD directives in this here-document from
    # being treated as POD of this script; they are stripped before writing.
    my $str =<<"EOT";
package $modname;
use strict;
use vars qw(\$VERSION \@ISA \@EXPORT_OK);
use DynaLoader ();
BEGIN {
\$VERSION = '1.0';
\@ISA = qw(DynaLoader);
\@EXPORT_OK = qw();
our \$HAS_RULES = {
$has_rules
};
bootstrap $modname \$VERSION;
}
1;
fnord__END__
fnord=head1 NAME
$modname - Efficient string matching for regexps found in $FILE
fnord=head1 SYNOPSIS
use $modname;
...
my \$match = ${modname}::scan(\$string);
fnord=head1 DESCRIPTION
This module was created by SpamAssassin with the aid of re2xs, which uses re2c
to create an XS library capable of scanning through a bunch of regular
expressions as defined in F<$FILE>.
See C<sa-compile> for more details.
fnord=cut
EOT
    $str =~ s/^fnord//gm;

    open(my $pm_fh, '>', $PMFILE) or die "write $PMFILE: $!";
    print $pm_fh $str;
    close($pm_fh) or die "write $PMFILE: $!";

    return 1;
}
# fixup_re($re)
#
# Convert one base string into the quoted form written to the re2c scanner
# file (the caller prints the result in front of a "{RET(...)}" action).
# Plain text is wrapped in double quotes, and the two special characters
# '"' and '\' are handled token by token:
#   "          -> an escaped quote, "\""
#   \x{HEX}    -> the character with that hexadecimal code
#   \DIGITS    -> the character with that octal code
#   \<other>   -> that character, quoted, without the backslash
# Dies if the string ends with a lone backslash. Returns the converted
# string. When $fixup_re_test is true, the input and output are printed.
#
# NOTE(review): for an escaped backslash the last case appends '"\"', and the
# \x{..} / octal cases insert the decoded character unescaped; whether the
# extractor can produce inputs where that matters is not visible here --
# confirm.
sub fixup_re {
my $re = shift;
$fixup_re_test and print "INPUT: /$re/\n";
my $output = "";
# the only characters that need special treatment: double quote and backslash
my $TOK = qr([\"\\]);
# declared but not used anywhere in this sub
my $STATE;
# walk the string: $1 is the plain text before the next token, $2 the token;
# /c keeps pos() at the end of the last match when the loop finally fails
while ($re =~ /\G(.*?)($TOK)/gc) {
my $pre = $1;
my $tok = $2;
if (length($pre)) {
$output .= "\"$pre\"";
}
if ($tok eq '"') {
$output .= '"\\""';
}
elsif ($tok eq '\\') {
# consume what follows the backslash: x{...}, a run of digits, or one char
$re =~ /\G(x\{[^\}]+\}|\d+|.)/gc or die "\\ at end of string!";
my $esc = $1;
if ($esc eq '"') {
$output .= '"\\""';
} elsif ($esc =~ /^x\{(\S+)\}$/) {
$output .= '"'.chr(hex($1)).'"';
} elsif ($esc =~ /^\d+/) {
$output .= '"'.chr(oct($esc)).'"';
} else {
$output .= "\"$esc\"";
}
}
else {
# $TOK matches only '"' and '\', so this branch looks unreachable; it
# appears to be left-over debugging output
print "PRE: $pre\nTOK: $tok\n";
}
}
if (!defined(pos($re))) {
# no matches
$output .= "\"$re\"";
}
elsif (pos($re) <= length($re)) {
# convert whatever follows the last token by recursing on the remainder
# (this also runs, with an empty string, when nothing is left)
$output .= fixup_re(substr($re, pos($re)));
}
# The pieces above are each quoted separately; the substitutions below merge
# them. They rely on one another, so keep their order.
$output =~ s/^""/"/; # protect start and end quotes
$output =~ s/(?<!\\)""$/"/;
$output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
$fixup_re_test and print "OUTPUT: $output\n";
return $output;
}
# handle_fixup_error($strat, $regexp, $reason)
#
# Report a base string that could not be converted. $strat is the error text
# (the caller passes $@); when it is true a "skipped" warning naming the
# regexp is issued. $reason is accepted but not used.
sub handle_fixup_error {
    my ($strat, $regexp, $reason) = @_;

    return unless $strat;
    warn "skipped: $regexp: $strat";
}
##############################################################################
=cut
=head1 NAME
sa-compile - compile SpamAssassin ruleset into native code
=head1 SYNOPSIS
B<sa-compile> [options]
Options:
--list Output base string list to STDOUT
--sudo Use 'sudo' for privilege escalation
--keep-tmps Keep temporary files instead of deleting
-C path, --configpath=path, --config-file=path
Path to standard configuration dir
-p prefs, --prefspath=file, --prefs-file=file
Set user preferences file
--siteconfigpath=path Path for site configs
(default: @@LOCAL_RULES_DIR@@)
--updatedir=path Directory to place updates
(default: @@LOCAL_STATE_DIR@@/compiled/<version>)
--cf='config line' Additional line of configuration
-D, --debug [area=n,...] Print debugging messages
-V, --version Print version
-h, --help Print usage message
=head1 DESCRIPTION
sa-compile uses C<re2c> to compile the SpamAssassin ruleset. This is then used
by the C<Mail::SpamAssassin::Plugin::Rule2XSBody> plugin to speed up
SpamAssassin's operation, where possible, and when that plugin is loaded.
C<re2c> can match strings much faster than perl code, by constructing a DFA to
match many simple strings in parallel, and compiling that to native object
code. Not all SpamAssassin rules are amenable to this conversion, however.
This requires C<re2c> (see C<http://re2c.org/>), and the C
compiler used to build Perl XS modules, be installed.
Note that running this, and creating a compiled ruleset, will have no
effect on SpamAssassin scanning speeds unless you also edit your C<v320.pre>
file and ensure this line is uncommented:
loadplugin Mail::SpamAssassin::Plugin::Rule2XSBody
=head1 OPTIONS
=over 4
=item B<--list>
Output the extracted base strings to STDOUT, instead of generating
the C extension code.
=item B<--sudo>
Use C<sudo(8)> to run code as 'root' when writing files to the compiled-rules
storage area (which is C<@@LOCAL_STATE_DIR@@/compiled/@@VERSION@@> by default).
=item B<--keep-tmps>
Keep temporary files after the script completes, instead of
deleting them.
=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
Use the specified path for locating the distributed configuration files.
Ignore the default directories (usually C<@@DEF_RULES_DIR@@> or similar).
=item B<--siteconfigpath>=I<path>
Use the specified path for locating site-specific configuration files. Ignore
the default directories (usually C<@@LOCAL_RULES_DIR@@> or similar).
=item B<--updatedir>
By default, C<sa-compile> will use the system-wide rules update directory:
@@LOCAL_STATE_DIR@@/compiled/@@VERSION@@
If the updates should be stored in another location, specify it here.
Note that use of this option is not recommended; if sa-compile is placing the
compiled rules in the wrong directory, you probably need to rebuild SpamAssassin
with different C<Makefile.PL> arguments, instead of overriding sa-compile's
runtime behaviour.
=item B<--cf='config line'>
Add additional lines of configuration directly from the command-line, parsed
after the configuration files are read. Multiple B<--cf> arguments can be
used, and each will be considered a separate line of configuration.
=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
Read user score preferences from I<prefs> (usually
C<$HOME/.spamassassin/user_prefs>).
=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
Produce debugging output. If no areas are listed, all debugging information is
printed. Diagnostic output can also be enabled for each area individually;
I<area> is the area of the code to instrument.
For more information about which areas (also known as channels) are
available, please see the documentation at
L<http://wiki.apache.org/spamassassin/DebugChannels>.
=item B<-h>, B<--help>
Print help message and exit.
=item B<-V>, B<--version>
Print sa-compile version and exit.
=back
=head1 SEE ALSO
Mail::SpamAssassin(3)
spamassassin(1)
spamd(1)
=head1 PREREQUISITES
C<Mail::SpamAssassin>
C<re2c>
C<Mail::SpamAssassin::Plugin::Rule2XSBody>
=head1 BUGS
See <http://issues.apache.org/SpamAssassin/>
=head1 AUTHORS
The Apache SpamAssassin(tm) Project <http://spamassassin.apache.org/>
=head1 COPYRIGHT
SpamAssassin is distributed under the Apache License, Version 2.0, as
described in the file C<LICENSE> included with the distribution.
=cut
# syntax highlighted by Code2HTML, v. 0.9.1