# POPFILE LOADABLE MODULE
package Classifier::Bayes;

use POPFile::Module;
@ISA = ("POPFile::Module");

#----------------------------------------------------------------------------
#
# Bayes.pm --- Naive Bayes text classifier
#
# Copyright (c) 2001-2006 John Graham-Cumming
#
#   This file is part of POPFile
#
#   POPFile is free software; you can redistribute it and/or modify it
#   under the terms of version 2 of the GNU General Public License as
#   published by the Free Software Foundation.
#
#   POPFile is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with POPFile; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#   Modified by              Sam Schinke    (sschinke@users.sourceforge.net)
#   Merged with db code from Scott Leighton (helphand@users.sourceforge.net)
#
#----------------------------------------------------------------------------

use strict;
use warnings;
use locale;
use Classifier::MailParse;
use IO::Handle;
use DBI;
use Digest::MD5 qw( md5_hex );
use MIME::Base64;
use File::Copy;

# This is used to get the hostname of the current machine
# in a cross platform way

use Sys::Hostname;

# A handy variable containing the value of an EOL for networks

my $eol = "\015\012";

# Korean characters definition

my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
my $ksc5601_hanja  = '(?:[\xCA-\xFD][\xA1-\xFE])';
my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";

my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc

#----------------------------------------------------------------------------
# new
#
#   Class new() function
#----------------------------------------------------------------------------
sub new
{
    my $type = shift;
    my $self = POPFile::Module->new();

    # Set this to 1 to get scores for individual words in message detail

    $self->{wordscores__}        = 0;

    # Choice for the format of the "word matrix" display.

    $self->{wmformat__}          = '';

    # Just our hostname

    $self->{hostname__}        = '';

    # File Handle for DBI database

    $self->{db__}                = {};

    $self->{history__}        = 0;

    # To save time we also 'prepare' some commonly used SQL statements
    # and cache them here, see the function db_connect__ for details

    $self->{db_get_buckets__} = 0;
    $self->{db_get_wordid__} = 0;
    $self->{db_get_word_count__} = 0;
    $self->{db_put_word_count__} = 0;
    $self->{db_get_bucket_unique_counts__} = 0;
    $self->{db_get_unique_word_count__} = 0;
    $self->{db_get_bucket_word_counts__} = 0;
    $self->{db_get_full_total__} = 0;
    $self->{db_get_bucket_parameter__} = 0;
    $self->{db_set_bucket_parameter__} = 0;
    $self->{db_get_bucket_parameter_default__} = 0;
    $self->{db_get_buckets_with_magnets__} = 0;
    $self->{db_delete_zero_words__} = 0;

    # Caches the name of each bucket and relates it to both the bucket
    # ID in the database and whether it is pseudo or not
    #
    # Subkeys used are:
    #
    # id     The bucket ID in the database
    # pseudo 1 if this is a pseudo bucket

    $self->{db_bucketid__}       = {};

    # Caches the IDs that map to parameter types

    $self->{db_parameterid__}    = {};

    # Caches looked up parameter values on a per bucket basis

    $self->{db_parameters__}     = {};

    # Used to parse mail messages
    $self->{parser__}            = new Classifier::MailParse;

    # The possible colors for buckets
    $self->{possible_colors__} = [ 'red',       'green',      'blue',       'brown', # PROFILE BLOCK START
                                   'orange',    'purple',     'magenta',    'gray',
                                   'plum',      'silver',     'pink',       'lightgreen',
                                   'lightblue', 'lightcyan',  'lightcoral', 'lightsalmon',
                                   'lightgrey', 'darkorange', 'darkcyan',   'feldspar',
                                   'black' ];                                        # PROFILE BLOCK STOP

    # Precomputed per bucket probabilities
    $self->{bucket_start__}      = {};

    # A very unlikely word
    $self->{not_likely__}        = {};

    # The expected corpus version
    #
    # DEPRECATED  This is only used when upgrading old flat file corpus files
    #             to the database
    $self->{corpus_version__}    = 1;

    # The unclassified cutoff this value means that the top
    # probabilily must be n times greater than the second probability,
    # default is 100 times more likely
    $self->{unclassified__}      = log(100);

    # Used to tell the caller whether a magnet was used in the last
    # mail classification
    $self->{magnet_used__}       = 0;
    $self->{magnet_detail__}     = 0;

    # This maps session keys (long strings) to user ids.  If there's
    # an entry here then the session key is valid and can be used in
    # the POPFile API.  See the methods get_session_key and
    # release_session_key for details

    $self->{api_sessions__}      = {};

    # Used to indicate whether we are using SQLite and what the full
    # path and name of the database is if we are.

    $self->{db_is_sqlite__}      = 0;
    $self->{db_name__}           = '';

    # Must call bless before attempting to call any methods

    bless $self, $type;

    $self->name( 'bayes' );

    return $self;
}

#----------------------------------------------------------------------------
#
# forked
#
# This is called inside a child process that has just forked, since
# the child needs access to the database we open it
#
#----------------------------------------------------------------------------
sub forked
{
    my ( $self ) = @_;

    $self->db_connect__();
}

#----------------------------------------------------------------------------
#
# initialize
#
# Called to set up the Bayes module's parameters
#
#----------------------------------------------------------------------------
sub initialize
{
    my ( $self ) = @_;

    # This is the name for the database

    $self->config_( 'database', 'popfile.db' );

    # This is the 'connect' string used by DBI to connect to the
    # database, if you decide to change from using SQLite to some
    # other database (e.g. MySQL, Oracle, ... ) this *should* be all
    # you need to change.  The additional parameters user and auth are
    # needed for some databases. 
    #
    # Note that the dbconnect string
    # will be interpolated before being passed to DBI and the variable
    # $dbname can be used within it and it resolves to the full path
    # to the database named in the database parameter above.

    $self->config_( 'dbconnect', 'dbi:SQLite2:dbname=$dbname' );
    $self->config_( 'dbuser', '' ); $self->config_( 'dbauth', '' );

    # SQLite 1.05+ have some problems we are resolving.  This lets us
    # give a nice message and then disable the version checking later
    
    $self->config_( 'bad_sqlite_version', '3.0.0' );

    # No default unclassified weight is the number of times more sure
    # POPFile must be of the top class vs the second class, default is
    # 100 times more

    $self->config_( 'unclassified_weight', 100 );

    # The corpus is kept in the 'corpus' subfolder of POPFile
    #
    # DEPRECATED This is only used to find an old corpus that might
    # need to be upgraded

    $self->config_( 'corpus', 'corpus' );

    # The characters that appear before and after a subject
    # modification

    $self->config_( 'subject_mod_left',  '[' );
    $self->config_( 'subject_mod_right', ']' );

    # Get the hostname for use in the X-POPFile-Link header

    $self->{hostname__} = hostname;

    # Allow the user to override the hostname

    $self->config_( 'hostname', $self->{hostname__} );

    # If set to 1 then the X-POPFile-Link will have < > around the URL
    # (i.e. X-POPFile-Link: <http://foo.bar>) when set to 0 there are
    # none (i.e. X-POPFile-Link: http://foo.bar)

    $self->config_( 'xpl_angle', 0 );

    # This parameter is used when the UI is operating in Stealth Mode.
    # If left blank (the default setting) the X-POPFile-Link will use 127.0.0.1
    # otherwise it will use this string instead. The system's HOSTS file should
    # map the string to 127.0.0.1

    $self->config_( 'localhostname', '' );

    # This is a bit mask used to control options when we are using the
    # default SQLite database.  By default all the options are on.
    #
    # 1 = Asynchronous deletes
    # 2 = Backup database every hour
    
    $self->config_( 'sqlite_tweaks', 0xFFFFFFFF );

    $self->mq_register_( 'COMIT', $self );
    $self->mq_register_( 'RELSE', $self );

    # Register for the TICKD message which is sent hourly by the
    # Logger module.  We use this to hourly save the database if bit 1
    # of the sqlite_tweaks is set and we are using SQLite

    $self->mq_register_( 'TICKD', $self );

    return 1;
}

#----------------------------------------------------------------------------
#
# deliver
#
# Called by the message queue to deliver a message
#
# There is no return value from this method
#
#----------------------------------------------------------------------------
sub deliver
{
    my ( $self, $type, @message ) = @_;

    if ( $type eq 'COMIT' ) {
        $self->classified( $message[0], $message[2] );
    }
    
    if ( $type eq 'RELSE' ) {
        $self->release_session_key_private__( $message[0] );
    }    

    if ( $type eq 'TICKD' ) {
        $self->backup_database__();
    }
}

#----------------------------------------------------------------------------
#
# start
#
# Called to start the Bayes module running
#
#----------------------------------------------------------------------------
sub start
{
    my ( $self ) = @_;

    # In Japanese or Korean mode, explicitly set LC_COLLATE to C.
    #
    # This is to avoid Perl crash on Windows because default
    # LC_COLLATE of Japanese Win is Japanese_Japan.932(Shift_JIS),
    # which is different from the charset POPFile uses for Japanese
    # characters(EUC-JP).

    if ( defined( $self->module_config_( 'html', 'language' ) ) &&
       ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ )) {
        use POSIX qw( locale_h );
        setlocale( LC_COLLATE, 'C' );
    }

    # Pass in the current interface language for language specific parsing

    $self->{parser__}->{lang__}  = $self->module_config_( 'html', 'language' );
    $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );

    if ( !$self->db_connect__() ) {
        return 0;
    }

    # Since Text::Kakasi is not thread-safe, we use it under the
    # control of a Mutex to avoid a crash if we are running on
    # Windows and using the fork.

    if ( ( $self->{parser__}->{lang__} eq 'Nihongo' ) && ( $^O eq 'MSWin32' ) && 
         ( ( ( $self->module_config_( 'pop3', 'enabled' ) ) && 
             ( $self->module_config_( 'pop3', 'force_fork' ) ) ) || 
           ( ( $self->module_config_( 'nntp', 'enabled' ) ) && 
             ( $self->module_config_( 'nntp', 'force_fork' ) ) ) || 
           ( ( $self->module_config_( 'smtp', 'enabled' ) ) && 
             ( $self->module_config_( 'smtp', 'force_fork' ) ) ) ) ) {
        $self->{parser__}->{need_kakasi_mutex__} = 1;

        # Prepare the Mutex.
        require POPFile::Mutex;
        $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' );
        $self->log_( 2, "Create mutex for Kakasi." );
    }

    $self->upgrade_predatabase_data__();

    return 1;
}

#----------------------------------------------------------------------------
#
# stop
#
# Called when POPFile is terminating
#
#----------------------------------------------------------------------------
sub stop
{
    my ( $self ) = @_;

    $self->db_disconnect__();
    delete $self->{parser__};
}

#----------------------------------------------------------------------------
#
# classified
#
# Called to inform the module about a classification event
#
# There is no return value from this method
#
#----------------------------------------------------------------------------
sub classified
{
    my ( $self, $session, $class ) = @_;

    $self->set_bucket_parameter( $session, $class, 'count',             # PROFILE BLOCK START
        $self->get_bucket_parameter( $session, $class, 'count' ) + 1 ); # PROFILE BLOCK STOP
}

#----------------------------------------------------------------------------
#
# backup_database__
#
# Called when the TICKD message is received each hour and if we are using
# the default SQLite database will make a copy with the .backup extension
#
#----------------------------------------------------------------------------
sub backup_database__
{
    my ( $self ) = @_;

    # If database backup is turned on and we are using SQLite then
    # backup the database by copying it

    if ( ( $self->config_( 'sqlite_tweaks' ) & 2 ) && 
         $self->{db_is_sqlite__} ) {
        if ( !copy( $self->{db_name__}, $self->{db_name__} . ".backup" ) ) {
	    $self->log_( 0, "Failed to backup database ".$self->{db_name__} );
        }
    }
}

#----------------------------------------------------------------------------
#
# tweak_sqlite
#
# Called when a module wants is to tweak access to the SQLite database.
#
# $tweak    The tweak to apply (a bit in the sqlite_tweaks mask)
# $state    1 to enable the tweak, 0 to disable
# $db       The db handle to tweak
#
#----------------------------------------------------------------------------
sub tweak_sqlite
{
    my ( $self, $tweak, $state, $db ) = @_;

    if ( $self->{db_is_sqlite__} && 
         ( $self->config_( 'sqlite_tweaks' ) & $tweak ) ) {

        $self->log_( 1, "Performing tweak $tweak to $state" );

        if ( $tweak == 1 ) {
            my $sync = $state?'off':'normal';
            $db->do( "pragma synchronous=$sync;" );
        }    
    }
}

#----------------------------------------------------------------------------
#
# reclassified
#
# Called to inform the module about a reclassification from one bucket
# to another
#
# session            Valid API session
# bucket             The old bucket name
# newbucket          The new bucket name
# undo               1 if this is an undo operation
#
# There is no return value from this method
#
#----------------------------------------------------------------------------
sub reclassified
{
    my ( $self, $session, $bucket, $newbucket, $undo ) = @_;

    $self->log_( 0, "Reclassification from $bucket to $newbucket" );

    my $c = $undo?-1:1;

    if ( $bucket ne $newbucket ) {
        my $count = $self->get_bucket_parameter(
                        $session, $newbucket, 'count' );
        my $newcount = $count + $c;
        $newcount = 0 if ( $newcount < 0 );
        $self->set_bucket_parameter(
            $session, $newbucket, 'count', $newcount );

        $count = $self->get_bucket_parameter(
                     $session, $bucket, 'count' );
        $newcount = $count - $c;
        $newcount = 0 if ( $newcount < 0 );
        $self->set_bucket_parameter(
            $session, $bucket, 'count', $newcount );

        my $fncount = $self->get_bucket_parameter(
                          $session, $newbucket, 'fncount' );
        my $newfncount = $fncount + $c;
        $newfncount = 0 if ( $newfncount < 0 );
        $self->set_bucket_parameter(
            $session, $newbucket, 'fncount', $newfncount );

        my $fpcount = $self->get_bucket_parameter(
                          $session, $bucket, 'fpcount' );
        my $newfpcount = $fpcount + $c;
        $newfpcount = 0 if ( $newfpcount < 0 );
        $self->set_bucket_parameter(
            $session, $bucket, 'fpcount', $newfpcount );
    }
}

#----------------------------------------------------------------------------
#
# get_color
#
# Retrieves the color for a specific word, color is the most likely bucket
#
# $session  Session key returned by get_session_key
# $word     Word to get the color of
#
#----------------------------------------------------------------------------
sub get_color
{
    my ( $self, $session, $word ) = @_;

    my $max   = -10000;
    my $color = 'black';

    for my $bucket ($self->get_buckets( $session )) {
        my $prob = $self->get_value_( $session, $bucket, $word );

        if ( $prob != 0 )  {
            if ( $prob > $max )  {
                $max   = $prob;
                $color = $self->get_bucket_parameter( $session, $bucket,
                             'color' );
            }
        }
    }

    return $color;
}

#----------------------------------------------------------------------------
#
# get_not_likely_
#
# Returns the probability of a word that doesn't appear
#
#----------------------------------------------------------------------------
sub get_not_likely_
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return $self->{not_likely__}{$userid};
}

#----------------------------------------------------------------------------
#
# get_value_
#
# Returns the value for a specific word in a bucket.  The word is
# converted to the log value of the probability before return to get
# the raw value just hit the hash directly or call get_base_value_
#
#----------------------------------------------------------------------------
sub get_value_
{
    my ( $self, $session, $bucket, $word ) = @_;

    my $value = $self->db_get_word_count__( $session, $bucket, $word );

    if ( defined( $value ) && ( $value > 0 ) ) {

        # Profiling notes:
        #
        # I tried caching the log of the total value and then doing
        # log( $value ) - $cached and this turned out to be
        # much slower than this single log with a division in it

        return log( $value /
                    $self->get_bucket_word_count( $session, $bucket ) );
    } else {
        return 0;
    }
}

sub get_base_value_
{
    my ( $self, $session, $bucket, $word ) = @_;

    my $value = $self->db_get_word_count__( $session, $bucket, $word );

    if ( defined( $value ) ) {
        return $value;
    } else {
        return 0;
    }
}

#----------------------------------------------------------------------------
#
# set_value_
#
# Sets the value for a word in a bucket and updates the total word
# counts for the bucket and globally
#
#----------------------------------------------------------------------------
sub set_value_
{
    my ( $self, $session, $bucket, $word, $value ) = @_;

    if ( $self->db_put_word_count__( $session, $bucket,
             $word, $value ) == 1 ) {

        # If we set the word count to zero then clean it up by deleting the
        # entry

        my $userid = $self->valid_session_key__( $session );
        my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
        $self->{db_delete_zero_words__}->execute( $bucketid );

        return 1;
    } else {
        return 0;
    }
}

#----------------------------------------------------------------------------
#
# get_sort_value_ behaves the same as get_value_, except that it
# returns not_likely__ rather than 0 if the word is not found.  This
# makes its result more suitable as a sort key for bucket ranking.
#
#----------------------------------------------------------------------------
sub get_sort_value_
{
    my ( $self, $session, $bucket, $word ) = @_;

    my $v = $self->get_value_( $session, $bucket, $word );

    if ( $v == 0 ) {

        my $userid = $self->valid_session_key__( $session );
        return undef if ( !defined( $userid ) );

        return $self->{not_likely__}{$userid};
    } else {
        return $v;
    }
}

#----------------------------------------------------------------------------
#
# update_constants__
#
# Updates not_likely and bucket_start
#
#----------------------------------------------------------------------------
sub update_constants__
{
    my ( $self, $session ) = @_;

    my $wc = $self->get_word_count( $session );

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    if ( $wc > 0 )  {
        $self->{not_likely__}{$userid} = -log( 10 * $wc );

        foreach my $bucket ($self->get_buckets( $session )) {
            my $total = $self->get_bucket_word_count( $session, $bucket );

            if ( $total != 0 ) {
                $self->{bucket_start__}{$userid}{$bucket} = log( $total /
                                                                 $wc );
            } else {
                $self->{bucket_start__}{$userid}{$bucket} = 0;
            }
        }
    } else {
        $self->{not_likely__}{$userid} = 0;
    }
}

#----------------------------------------------------------------------------
#
# db_connect__
#
# Connects to the POPFile database and returns 1 if successful
#
#----------------------------------------------------------------------------
sub db_connect__
{
    my ( $self ) = @_;

    # Connect to the database, note that the database must exist for
    # this to work, to make this easy for people POPFile we will
    # create the database automatically here using the file
    # 'popfile.sql' which should be located in the same directory the
    # Classifier/Bayes.pm module

    # If we are using SQLite then the dbname is actually the name of a
    # file, and hence we treat it like one, otherwise we leave it
    # alone

    my $dbname;
    my $dbconnect = $self->config_( 'dbconnect' );
    my $dbpresent;
    my $sqlite = ( $dbconnect =~ /sqlite/i );

    if ( $sqlite ) {
        $dbname = $self->get_user_path_( $self->config_( 'database' ) );
        $dbpresent = ( -e $dbname ) || 0;                
    } else {
        $dbname = $self->config_( 'database' );
        $dbpresent = 1;
    }

    # Record whether we are using SQLite or not and the name of the
    # database so that other routines can access it; this is used by
    # the backup_database__ routine to make a backup copy of the
    # database when using SQLite.

    $self->{db_is_sqlite__} = $sqlite;
    $self->{db_name__}      = $dbname;

    # Now perform the connect, note that this is database independent
    # at this point, the actual database that we connect to is defined
    # by the dbconnect parameter.

    $dbconnect =~ s/\$dbname/$dbname/g;

    $self->log_( 0, "Attempting to connect to $dbconnect ($dbpresent)" );

    $self->{db__} = DBI->connect( $dbconnect,                    # PROFILE BLOCK START
                                  $self->config_( 'dbuser' ),
                                  $self->config_( 'dbauth' ) );  # PROFILE BLOCK STOP
                                  
    $self->log_( 0, "Using SQLite library version " . $self->{db__}{sqlite_version});
    
    # We check to make sure we're not using DBD::SQLite 1.05 or greater
    # which uses SQLite V 3 If so, we'll use DBD::SQLite2 and SQLite 2.8,
    # which is still compatible with old databases
    
    if ( $self->{db__}{sqlite_version} gt $self->config_('bad_sqlite_version' ) )  {
        $self->log_( 0, "Substituting DBD::SQLite2 for DBD::SQLite 1.05" );        
        $self->log_( 0, "Please install DBD::SQLite2 and set dbconnect to use DBD::SQLite2" );
        
        $dbconnect =~ s/SQLite:/SQLite2:/;
        
        undef $self->{db__};
#         $self->db_disconnect__();
        
        $self->{db__} = DBI->connect( $dbconnect,                    # PROFILE BLOCK START
                                      $self->config_( 'dbuser' ),
                                      $self->config_( 'dbauth' ) );  # PROFILE BLOCK STOP        
    }

    if ( !defined( $self->{db__} ) ) {
        $self->log_( 0, "Failed to connect to database and got error $DBI::errstr" );
        return 0;
    }

    if ( !$dbpresent ) {
        if ( !$self->insert_schema__( $sqlite ) ) {
            return 0;
        }
    }

    # Now check for a need to upgrade the database because the schema
    # has been changed.  From POPFile v0.22.0 there's a special
    # 'popfile' table inside the database that contains the schema
    # version number.  If the version number doesn't match or is
    # missing then do the upgrade.

    open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' );
    <SCHEMA> =~ /-- POPFILE SCHEMA (\d+)/;
    my $version = $1;
    close SCHEMA;

    my $need_upgrade = 1;

    #
    # retrieve the SQL_IDENTIFIER_QUOTE_CHAR for the database then use it
    # to strip off any sqlquotechars from the table names we retrieve
    #

    my $sqlquotechar = $self->{db__}->get_info(29) || ''; 
    my @tables = map { s/$sqlquotechar//g; $_ } ($self->{db__}->tables());

    foreach my $table (@tables) {
        if ( $table eq 'popfile' ) {
            my @row = $self->{db__}->selectrow_array(
               'select version from popfile;' );

            if ( $#row == 0 ) {
                $need_upgrade = ( $row[0] != $version );
            }
        }
    }

    if ( $need_upgrade ) {

        print "\n\nDatabase schema is outdated, performing automatic upgrade\n";
        # The database needs upgrading, so we are going to dump out
        # all the data in the database as INSERT OR IGNORE statements
        # in a temporary file, then DROP all the tables in the
        # database, then recreate the schema from the new schema and
        # finally rerun the inserts.

        my $i = 0;
        my $ins_file = $self->get_user_path_( 'insert.sql' );
        open INSERT, '>' . $ins_file;

        foreach my $table (@tables) {
            next if ( $table eq 'popfile' );
            if ( $sqlite && ( $table =~ /^sqlite_/ ) ) {
                next;
            }
            if ( $i > 99 ) {
                print "\n";
            }
            print "    Saving table $table\n    ";

            my $t = $self->{db__}->prepare( "select * from $table;" );
            $t->execute;
            $i = 0;
            while ( 1 ) {
                if ( ( ++$i % 100 ) == 0 ) {
                    print "[$i]";
                    flush STDOUT;
                }
                if ( ( $i % 1000 ) == 0 ) {
                    print "\n";
                    flush STDOUT;
                }
                my @rows = $t->fetchrow_array;

                last if ( $#rows == -1 );

                print INSERT "INSERT OR IGNORE INTO $table (";
                for my $i (0..$t->{NUM_OF_FIELDS}-1) {
                    if ( $i != 0 ) {
                        print INSERT ',';
                    }
                    print INSERT $t->{NAME}->[$i];
                }
                print INSERT ') VALUES (';
                for my $i (0..$t->{NUM_OF_FIELDS}-1) {
                    if ( $i != 0 ) {
                        print INSERT ',';
                    }
                    my $val = $rows[$i];
                    if ( $t->{TYPE}->[$i] !~ /^int/i ) {
                        $val = '' if ( !defined( $val ) );
                        $val = $self->{db__}->quote( $val );
                    } else {
                        $val = 'NULL' if ( !defined( $val ) );
                    }
                    print INSERT $val;
                }
                print INSERT ");\n";
            }
        }

        close INSERT;

        if ( $i > 99 ) {
            print "\n";
        }

        foreach my $table (@tables) {
            if ( $sqlite && ( $table =~ /^sqlite_/ ) ) {
                next;
            }
            print "    Dropping old table $table\n";
            $self->{db__}->do( "DROP TABLE $table;" );
        }

        print "    Inserting new database schema\n";
        if ( !$self->insert_schema__( $sqlite ) ) {
            return 0;
        }

        print "    Restoring old data\n    ";

        $self->{db__}->begin_work;
        open INSERT, '<' . $ins_file;
        $i = 0;
        while ( <INSERT> ) {
            if ( ( ++$i % 100 ) == 0 ) {
               print "[$i]";
               flush STDOUT;
            }
            if ( ( $i % 1000 ) == 0 ) {
                print "\n";
                flush STDOUT;
            }
            s/[\r\n]//g;
            $self->{db__}->do( $_ );
        }
        close INSERT;
        $self->{db__}->commit;

        unlink $ins_file;
        print "\nDatabase upgrade complete\n\n";
    }

    # Now prepare common SQL statements for use, as a matter of convention the
    # parameters to each statement always appear in the following order:
    #
    # user
    # bucket
    # word
    # parameter

    $self->{db_get_buckets__} = $self->{db__}->prepare(                                 # PROFILE BLOCK START
             'select name, id, pseudo from buckets
                  where buckets.userid = ?;' );                                         # PROFILE BLOCK STOP

    $self->{db_get_wordid__} = $self->{db__}->prepare(                                  # PROFILE BLOCK START
             'select id from words
                  where words.word = ? limit 1;' );                                     # PROFILE BLOCK STOP

    $self->{db_get_userid__} = $self->{db__}->prepare(                                  # PROFILE BLOCK START
             'select id from users where name = ?
                                     and password = ? limit 1;' );                      # PROFILE BLOCK STOP

    $self->{db_get_word_count__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
             'select matrix.times from matrix
                  where matrix.bucketid = ? and
                        matrix.wordid = ? limit 1;' );                                  # PROFILE BLOCK STOP

    $self->{db_put_word_count__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
           'replace into matrix ( bucketid, wordid, times ) values ( ?, ?, ? );' );     # PROFILE BLOCK STOP

    $self->{db_get_bucket_unique_counts__} = $self->{db__}->prepare(                    # PROFILE BLOCK START
             'select count(matrix.wordid), buckets.name from matrix, buckets
                  where buckets.userid = ?
                    and matrix.bucketid = buckets.id
                  group by buckets.name;' );                                            # PROFILE BLOCK STOP

    $self->{db_get_bucket_word_counts__} = $self->{db__}->prepare(                      # PROFILE BLOCK START
             'select sum(matrix.times), buckets.name from matrix, buckets
                  where matrix.bucketid = buckets.id
                    and buckets.userid = ?
                    group by buckets.name;' );                                          # PROFILE BLOCK STOP

    $self->{db_get_unique_word_count__} = $self->{db__}->prepare(                       # PROFILE BLOCK START
             'select count(matrix.wordid) from matrix, buckets
                  where matrix.bucketid = buckets.id and
                        buckets.userid = ?;' );                                         # PROFILE BLOCK STOP

    $self->{db_get_full_total__} = $self->{db__}->prepare(                              # PROFILE BLOCK START
             'select sum(matrix.times) from matrix, buckets
                  where buckets.userid = ? and
                        matrix.bucketid = buckets.id;' );                               # PROFILE BLOCK STOP

    $self->{db_get_bucket_parameter__} = $self->{db__}->prepare(                        # PROFILE BLOCK START
             'select bucket_params.val from bucket_params
                  where bucket_params.bucketid = ? and
                        bucket_params.btid = ?;' );                                     # PROFILE BLOCK STOP

    $self->{db_set_bucket_parameter__} = $self->{db__}->prepare(                        # PROFILE BLOCK START
           'replace into bucket_params ( bucketid, btid, val ) values ( ?, ?, ? );' );  # PROFILE BLOCK STOP

    $self->{db_get_bucket_parameter_default__} = $self->{db__}->prepare(                # PROFILE BLOCK START
             'select bucket_template.def from bucket_template
                  where bucket_template.id = ?;' );                                     # PROFILE BLOCK STOP

    $self->{db_get_buckets_with_magnets__} = $self->{db__}->prepare(                    # PROFILE BLOCK START
             'select buckets.name from buckets, magnets
                  where buckets.userid = ? and
                        magnets.id != 0 and
                        magnets.bucketid = buckets.id group by buckets.name order by buckets.name;' );
                                                                                        # PROFILE BLOCK STOP
    $self->{db_delete_zero_words__} = $self->{db__}->prepare(                           # PROFILE BLOCK START
             'delete from matrix
                  where matrix.times = 0
                    and matrix.bucketid = ?;' );                                        # PROFILE BLOCK STOP

    # Get the mapping from parameter names to ids into a local hash

    my $h = $self->{db__}->prepare( "select name, id from bucket_template;" );
    $h->execute;
    while ( my $row = $h->fetchrow_arrayref ) {
        $self->{db_parameterid__}{$row->[0]} = $row->[1];
    }
    $h->finish;

    return 1;
}

#----------------------------------------------------------------------------
#
# insert_schema__
#
# Insert the POPFile schema in a database
#
# $sqlite          Set to 1 if this is a SQLite database
#
#----------------------------------------------------------------------------
sub insert_schema__
{
    my ( $self, $sqlite ) = @_;

    if ( -e $self->get_root_path_( 'Classifier/popfile.sql' ) ) {
        my $schema = '';

        $self->log_( 0, "Creating database schema" );

        open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' );
        while ( <SCHEMA> ) {
            next if ( /^--/ );
            next if ( !/[a-z;]/ );
            s/--.*$//;

            # If the line begins 'alter' and we are doing SQLite then ignore
            # the line

            if ( $sqlite && ( /^alter/i ) ) {
                next;
            }

            $schema .= $_;

            if ( ( /end;/ ) || ( /\);/ ) || ( /^alter/i ) ) {
                $self->{db__}->do( $schema );
                $schema = '';
            }
        }
        close SCHEMA;
        return 1;
    } else {
        $self->log_( 0, "Can't find the database schema" );
        return 0;
    }
}

#----------------------------------------------------------------------------
#
# db_disconnect__
#
# Disconnect from the POPFile database
#
#----------------------------------------------------------------------------
sub db_disconnect__
{
    my ( $self ) = @_;

    $self->{db_get_buckets__}->finish;
    $self->{db_get_wordid__}->finish;
    $self->{db_get_userid__}->finish;
    $self->{db_get_word_count__}->finish;
    $self->{db_put_word_count__}->finish;
    $self->{db_get_bucket_unique_counts__}->finish;
    $self->{db_get_bucket_word_counts__}->finish;
    $self->{db_get_unique_word_count__}->finish;
    $self->{db_get_full_total__}->finish;
    $self->{db_get_bucket_parameter__}->finish;
    $self->{db_set_bucket_parameter__}->finish;
    $self->{db_get_bucket_parameter_default__}->finish;
    $self->{db_get_buckets_with_magnets__}->finish;
    $self->{db_delete_zero_words__}->finish;

    if ( defined( $self->{db__} ) ) {
        $self->{db__}->disconnect;
        undef $self->{db__};
    }
}

#----------------------------------------------------------------------------
#
# db_update_cache__
#
# Updates our local cache of user and bucket ids.
#
# $session           Must be a valid session
#
#----------------------------------------------------------------------------
sub db_update_cache__
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    delete $self->{db_bucketid__}{$userid};

    $self->{db_get_buckets__}->execute( $userid );
    while ( my $row = $self->{db_get_buckets__}->fetchrow_arrayref ) {
        $self->{db_bucketid__}{$userid}{$row->[0]}{id} = $row->[1];
        $self->{db_bucketid__}{$userid}{$row->[0]}{pseudo} = $row->[2];
        $self->{db_bucketcount__}{$userid}{$row->[0]} = 0;
    }

    $self->{db_get_bucket_word_counts__}->execute( $userid );

    for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
        $self->{db_bucketcount__}{$userid}{$b} = 0;
        $self->{db_bucketunique__}{$userid}{$b} = 0;
    }

    while ( my $row = $self->{db_get_bucket_word_counts__}->fetchrow_arrayref ) {
        $self->{db_bucketcount__}{$userid}{$row->[1]} = $row->[0];
    }

    $self->{db_get_bucket_unique_counts__}->execute( $userid );

    while ( my $row = $self->{db_get_bucket_unique_counts__}->fetchrow_arrayref ) {
        $self->{db_bucketunique__}{$userid}{$row->[1]} = $row->[0];
    }

    $self->update_constants__( $session );
}

#----------------------------------------------------------------------------
#
# db_get_word_count__
#
# Return the 'count' value for a word in a bucket.  If the word is not
# found in that bucket then returns undef.
#
# $session          Valid session ID from get_session_key
# $bucket           bucket word is in
# $word             word to lookup
#
#----------------------------------------------------------------------------
sub db_get_word_count__
{
    my ( $self, $session, $bucket, $word ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{db_get_wordid__}->execute( $word );
    my $result = $self->{db_get_wordid__}->fetchrow_arrayref;
    if ( !defined( $result ) ) {
        return undef;
    }

    my $wordid = $result->[0];

    $self->{db_get_word_count__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, $wordid );
    $result = $self->{db_get_word_count__}->fetchrow_arrayref;
    if ( defined( $result ) ) {
         return $result->[0];
    } else {
         return undef;
    }
}

#----------------------------------------------------------------------------
#
# db_put_word_count__
#
# Update 'count' value for a word in a bucket, if the update fails
# then returns 0 otherwise is returns 1
#
# $session          Valid session ID from get_session_key
# $bucket           bucket word is in
# $word             word to update
# $count            new count value
#
#----------------------------------------------------------------------------
sub db_put_word_count__
{
    my ( $self, $session, $bucket, $word, $count ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # We need to have two things before we can start, the id of the
    # word in the words table (if there's none then we need to add the
    # word), the bucket id in the buckets table (which must exist)

    $word = $self->{db__}->quote($word);

    my $result = $self->{db__}->selectrow_arrayref(
                     "select words.id from words where words.word = $word limit 1;");

    if ( !defined( $result ) ) {
        $self->{db__}->do( "insert into words ( word ) values ( $word );" );
        $result = $self->{db__}->selectrow_arrayref(
                     "select words.id from words where words.word = $word limit 1;");
    }

    my $wordid = $result->[0];
    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};

    $self->{db_put_word_count__}->execute( $bucketid, $wordid, $count );

    return 1;
}

#----------------------------------------------------------------------------
#
# upgrade_predatabase_data__
#
# Looks for old POPFile data (in flat files or BerkeleyDB tables) and
# upgrades it to the SQL database.  Data upgraded is removed.
#
#----------------------------------------------------------------------------
sub upgrade_predatabase_data__
{
    my ( $self ) = @_;
    my $c      = 0;

    # There's an assumption here that this is the single user version
    # of POPFile and hence what we do is cheat and get a session key
    # assuming that the user name is admin with password ''

    my $session = $self->get_session_key( 'admin', '' );

    if ( !defined( $session ) ) {
        $self->log_( 0, "Tried to get the session key for user admin and failed; cannot upgrade old data" );
        return;
    }

    my @buckets = glob $self->get_user_path_( $self->config_( 'corpus' ) . '/*' );

    foreach my $bucket (@buckets) {

        # A bucket directory must be a directory

        next unless ( -d $bucket );
        next unless ( ( -e "$bucket/table" ) || ( -e "$bucket/table.db" ) );

        return 0 if ( !$self->upgrade_bucket__( $session, $bucket ) );

        my $color = '';

        # See if there's a color file specified
        if ( open COLOR, '<' . "$bucket/color" ) {
            $color = <COLOR>;

            # Someone (who shall remain nameless) went in and manually created
            # empty color files in their corpus directories which would cause
            # $color at this point to be undefined and hence you'd get warnings
            # about undefined variables below.  So this little test is to deal
            # with that user and to make POPFile a little safer which is always
            # a good thing

            if ( !defined( $color ) ) {
                $color = '';
            } else {
                $color =~ s/[\r\n]//g;
            }
            close COLOR;
            unlink "$bucket/color";
        }

        $bucket =~ /([[:alpha:]0-9-_]+)$/;
        $bucket =  $1;

        $self->set_bucket_color( $session, $bucket, ($color eq '')?$self->{possible_colors__}[$c]:$color );

        $c = ($c+1) % ($#{$self->{possible_colors__}}+1);
    }

    $self->release_session_key( $session );

    return 1;
}

#----------------------------------------------------------------------------
#
# upgrade_bucket__
#
# Loads an individual bucket
#
# $session           Valid session key from get_session_key
# $bucket            The bucket name
#
#----------------------------------------------------------------------------
sub upgrade_bucket__
{
    my ( $self, $session, $bucket ) = @_;

    $bucket =~ /([[:alpha:]0-9-_]+)$/;
    $bucket =  $1;

    $self->create_bucket( $session, $bucket );

    if ( open PARAMS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ) ) {
        while ( <PARAMS> )  {
            s/[\r\n]//g;
            if ( /^([[:lower:]]+) ([^\r\n\t ]+)$/ )  {
                $self->set_bucket_parameter( $session, $bucket, $1, $2 );
            }
        }
        close PARAMS;
        unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" );
    }

    # Pre v0.21.0 POPFile had GLOBAL parameters for subject modification,
    # XTC and XPL insertion.  To make the upgrade as clean as possible
    # check these parameters so that if they were OFF we set the equivalent
    # per bucket to off

    foreach my $gl ( 'subject', 'xtc', 'xpl' ) {
        $self->log_( 1, "Checking deprecated parameter GLOBAL_$gl for $bucket\n" );
        my $val = $self->{configuration__}->deprecated_parameter( "GLOBAL_$gl" );
        if ( defined( $val ) && ( $val == 0 ) ) {
            $self->log_( 1, "GLOBAL_$gl is 0 for $bucket, overriding $gl\n" );
            $self->set_bucket_parameter( $session, $bucket, $gl, 0 );
        }
    }

    # See if there are magnets defined
    if ( open MAGNETS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ) ) {
        while ( <MAGNETS> )  {
            s/[\r\n]//g;

            # Because of a bug in v0.17.9 and earlier of POPFile the text of
            # some magnets was getting mangled by certain characters having
            # a \ prepended.  Code here removes the \ in these cases to make
            # an upgrade smooth.

            if ( /^([^ ]+) (.+)$/ )  {
                my $type  = $1;
                my $value = $2;

                # Some people were accidently creating magnets with
                # trailing whitespace which really confused them later
                # when their magnet did not match (see comment in
                # UI::HTML::magnet for more detail)

                $value =~ s/^[ \t]+//g;
                $value =~ s/[ \t]+$//g;

                $value =~ s/\\(\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/$1/g;
                $self->create_magnet( $session, $bucket, $type, $value );
            } else {

                # This branch is used to catch the original magnets in an
                # old version of POPFile that were just there for from
                # addresses only

                if ( /^(.+)$/ ) {
                    my $value = $1;
                    $value =~ s/\\(\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/$1/g;
                    $self->create_magnet( $session, $bucket, 'from', $value );
                }
            }
        }
        close MAGNETS;
        unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" );
    }

    # If there is no existing table but there is a table file (the old style
    # flat file used by POPFile for corpus storage) then create the new
    # database from it thus performing an automatic upgrade.

    if ( -e $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) {
        $self->log_( 0, "Performing automatic upgrade of $bucket corpus from flat file to DBI" );

        $self->{db__}->begin_work;

        if ( open WORDS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) )  {

            my $wc = 1;

            my $first = <WORDS>;
            if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) {
                if ( $1 != $self->{corpus_version__} )  {
                    print STDERR "Incompatible corpus version in $bucket\n";
                    close WORDS;
                    $self->{db__}->rollback;
                    return 0;
                } else {
                    $self->log_( 0, "Upgrading bucket $bucket..." );

                    while ( <WORDS> ) {
                        if ( $wc % 100 == 0 ) {
                            $self->log_( 0, "$wc" );
                        }
                        $wc += 1;
                        s/[\r\n]//g;

                        if ( /^([^\s]+) (\d+)$/ ) {
                            if ( $2 != 0 ) {
                                $self->db_put_word_count__( $session, $bucket, $1, $2 );
                            }
                        } else {
                            $self->log_( 0, "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" );
                        }
                    }
                }

                if ( $wc > 1 ) {
                    $wc -= 1;
                    $self->log_( 0, "(completed $wc words)" );
                }
                close WORDS;
            } else {
                close WORDS;
                $self->{db__}->rollback;
                unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" );
                return 0;
            }

            $self->{db__}->commit;
            unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" );
        }
    }

    # Now check to see if there's a BerkeleyDB-style table

    my $bdb_file = $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table.db" );

    if ( -e $bdb_file ) {
        $self->log_( 0, "Performing automatic upgrade of $bucket corpus from BerkeleyDB to DBI" );

        require BerkeleyDB;

        my %h;
        tie %h, "BerkeleyDB::Hash", -Filename => $bdb_file;

        $self->log_( 0, "Upgrading bucket $bucket..." );
        $self->{db__}->begin_work;

        my $wc = 1;

        for my $word (keys %h) {
            if ( $wc % 100 == 0 ) {
                $self->log_( 0, "$wc" );
            }

            next if ( $word =~ /__POPFILE__(LOG__TOTAL|TOTAL|UNIQUE)__/ );

            $wc += 1;
            if ( $h{$word} != 0 ) {
                $self->db_put_word_count__( $session, $bucket, $word, $h{$word} );
            }
        }

        $wc -= 1;
        $self->log_( 0, "(completed $wc words)" );
        $self->{db__}->commit;
        untie %h;
        unlink $bdb_file;
    }

    return 1;
}

#----------------------------------------------------------------------------
#
# magnet_match_helper__
#
# Helper the determines if a specific string matches a certain magnet
# type in a bucket, used by magnet_match_
#
# $session         Valid session from get_session_key
# $match           The string to match
# $bucket          The bucket to check
# $type            The magnet type to check
#
#----------------------------------------------------------------------------
sub magnet_match_helper__
{
    my ( $self, $session, $match, $bucket, $type ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $match = lc($match);

    # In Japanese and Korean mode, disable locale.  Sorting Japanese
    # and Korean with "use locale" is memory and time consuming, and
    # may cause perl crash.

    my @magnets;

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $h = $self->{db__}->prepare(                                           # PROFILE BLOCK START
        "select magnets.val, magnets.id from magnets, users, buckets, magnet_types
             where buckets.id = $bucketid and
                   magnets.id != 0 and
                   users.id = buckets.userid and
                   magnets.bucketid = buckets.id and
                   magnet_types.mtype = '$type' and
                   magnets.mtid = magnet_types.id order by magnets.val;" );   # PROFILE BLOCK STOP

    $h->execute;
    while ( my $row = $h->fetchrow_arrayref ) {
        push @magnets, [$row->[0], $row->[1]];
    }
    $h->finish;

    foreach my $m (@magnets) {
        my ( $magnet, $id ) = @{$m};
        $magnet = lc($magnet);

        for my $i (0..(length($match)-length($magnet))) {
            if ( substr( $match, $i, length($magnet)) eq $magnet ) {
                $self->{magnet_used__}   = 1;
                $self->{magnet_detail__} = $id;

                return 1;
            }
        }
    }

    return 0;
}

#----------------------------------------------------------------------------
#
# magnet_match__
#
# Helper the determines if a specific string matches a certain magnet
# type in a bucket
#
# $session         Valid session from get_session_key
# $match           The string to match
# $bucket          The bucket to check
# $type            The magnet type to check
#
#----------------------------------------------------------------------------
sub magnet_match__
{
    my ( $self, $session, $match, $bucket, $type ) = @_;

    return $self->magnet_match_helper__( $session, $match, $bucket, $type );
}

#----------------------------------------------------------------------------
#
# write_line__
#
# Writes a line to a file and parses it unless the classification is
# already known
#
# $file         File handle for file to write line to
# $line         The line to write
# $class        (optional) The current classification
#
#----------------------------------------------------------------------------
sub write_line__
{
    my ( $self, $file, $line, $class ) = @_;

    print $file $line if defined( $file );

    if ( $class eq '' ) {
        $self->{parser__}->parse_line( $line );
    }
}

#----------------------------------------------------------------------------
#
# add_words_to_bucket__
#
# Takes words previously parsed by the mail parser and adds/subtracts
# them to/from a bucket, this is a helper used by
# add_messages_to_bucket, remove_message_from_bucket
#
# $session        Valid session from get_session_key
# $bucket         Bucket to add to
# $subtract       Set to -1 means subtract the words, set to 1 means add
#
#----------------------------------------------------------------------------
sub add_words_to_bucket__
{
    my ( $self, $session, $bucket, $subtract ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Map the list of words to a list of counts currently in the database
    # then update those counts and write them back to the database.

    my $words;
    $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) );
    $self->{get_wordids__} = $self->{db__}->prepare(        # PROFILE BLOCK START
             "select id, word
                  from words
                  where word in ( $words );" );             # PROFILE BLOCK STOP
    $self->{get_wordids__}->execute;

    my @id_list;
    my %wordmap;

    while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) {
        push @id_list, ($row->[0]);
        $wordmap{$row->[1]} = $row->[0];
    }

    $self->{get_wordids__}->finish;

    my $ids = join( ',', @id_list );

    $self->{db_getwords__} = $self->{db__}->prepare(                                         # PROFILE BLOCK START
             "select matrix.times, matrix.wordid
                  from matrix
                  where matrix.wordid in ( $ids )
                    and matrix.bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};" );  # PROFILE BLOCK STOP

    $self->{db_getwords__}->execute;

    my %counts;

    while ( my $row = $self->{db_getwords__}->fetchrow_arrayref ) {
        $counts{$row->[1]} = $row->[0];
    }

    $self->{db_getwords__}->finish;

    $self->{db__}->begin_work;
    foreach my $word (keys %{$self->{parser__}->{words__}}) {

        # If there's already a count then it means that the word is
        # already in the database and we have its id in
        # $wordmap{$word} so for speed we execute the
        # db_put_word_count__ query here rather than going through
        # set_value_ which would need to look up the wordid again

        if ( defined( $wordmap{$word} ) && defined( $counts{$wordmap{$word}} ) ) {
            $self->{db_put_word_count__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id},               # PROFILE BLOCK START
                $wordmap{$word}, $counts{$wordmap{$word}} + $subtract * $self->{parser__}->{words__}{$word} ); # PROFILE BLOCK STOP
        } else {

            # If the word is not in the database and we are trying to
            # subtract then we do nothing because negative values are
            # meaningless

            if ( $subtract == 1 ) {
                $self->db_put_word_count__( $session, $bucket, $word, $self->{parser__}->{words__}{$word} );
            }
        }
    }

    # If we were doing a subtract operation it's possible that some of
    # the words in the bucket now have a zero count and should be
    # removed

    if ( $subtract == -1 ) {
        $self->{db_delete_zero_words__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id} );
    }

    $self->{db__}->commit;
}

#----------------------------------------------------------------------------
#
# echo_to_dot_
#
# $mail The stream (created with IO::) to send the message to (the
# remote mail server)
# $client (optional) The local mail client (created with IO::) that
# needs the response
# $file (optional) A file to print the response to, caller specifies
# open style
# $before (optional) String to send to client before the dot is sent
#
# echo all information from the $mail server until a single line with
# a . is seen
#
# NOTE Also echoes the line with . to $client but not to $file
#
# Returns 1 if there was a . or 0 if reached EOF before we hit the .
#
#----------------------------------------------------------------------------
sub echo_to_dot_
{
    my ( $self, $mail, $client, $file, $before ) = @_;

    my $hit_dot = 0;

    my $isopen = open FILE, "$file" if ( defined( $file ) );
    binmode FILE if ($isopen);

    while ( my $line = $self->slurp_( $mail ) ) {

        # Check for an abort

        last if ( $self->{alive_} == 0 );

        # The termination has to be a single line with exactly a dot
        # on it and nothing else other than line termination
        # characters.  This is vital so that we do not mistake a line
        # beginning with . as the end of the block

        if ( $line =~ /^\.(\r\n|\r|\n)$/ ) {
            $hit_dot = 1;

            if ( defined( $before ) && ( $before ne '' ) ) {
                print $client $before if ( defined( $client ) );
                print FILE    $before if ( defined( $isopen ) );
            }

            # Note that there is no print FILE here.  This is correct
            # because we do no want the network terminator . to appear
            # in the file version of any message

            print $client $line if ( defined( $client ) );
            last;
        }

        print $client $line if ( defined( $client ) );
        print FILE    $line if ( defined( $isopen ) );

    }

    close FILE if ( $isopen );

    return $hit_dot;
}

#----------------------------------------------------------------------------
#
# substr_euc__
#
# "substr" function which supports EUC Japanese charset
#
# $pos      Start position
# $len      Word length
#
#----------------------------------------------------------------------------
sub substr_euc__
{
    my ( $str, $pos, $len ) = @_;
    my $result_str;
    my $char;
    my $count = 0;
    if ( !$pos ) {
        $pos = 0;
    }
    if ( !$len ) {
        $len = length( $str );
    }

    for ( $pos = 0; $count < $len; $pos++ ) {
        $char = substr( $str, $pos, 1 );
        if ( $char =~ /[\x80-\xff]/ ) {
            $char = substr( $str, $pos++, 2 );
        }
        $result_str .= $char;
        $count++;
    }

    return $result_str;
}

#----------------------------------------------------------------------------
#
# generate_unique_session_key__
#
# Returns a unique string based session key that can be used as a key
# in the api_sessions__
#
#----------------------------------------------------------------------------
sub generate_unique_session_key__
{
    my ( $self ) = @_;

    my @chars = ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',   # PROFILE BLOCK START
                  'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'Y',
                  'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A' ); # PROFILE BLOCK STOP

    my $session;

    do {
        $session = '';
        my $length = int( 16 + rand(4) );

        for my $i (0 .. $length) {
            my $random = $chars[int( rand(36) )];

            # Just to add spice to things we sometimes lowercase the value

            if ( rand(1) < rand(1) ) {
                $random = lc($random);
            }

            $session .= $random;
        }
    } while ( defined( $self->{api_sessions__}{$session} ) );

    return $session;
}

#----------------------------------------------------------------------------
#
# release_session_key_private__
#
# $session        A session key previously returned by get_session_key
#
# Releases and invalidates the session key. Worker function that does the work
# of release_session_key. 
#                   ****DO NOT CALL DIRECTLY****
# unless you want your session key released immediately, possibly preventing
# asynchronous tasks from completing
#
#----------------------------------------------------------------------------
sub release_session_key_private__
{
    my ( $self, $session ) = @_;
    
    if ( defined( $self->{api_sessions__}{$session} ) ) {
        $self->log_( 1, "release_session_key releasing key $session for user $self->{api_sessions__}{$session}" );
        delete $self->{api_sessions__}{$session};
    }
}

#----------------------------------------------------------------------------
#
# valid_session_key__
#
# $session                Session key returned by call to get_session_key
#
# Returns undef is the session key is not valid, or returns the user
# ID associated with the session key which can be used in database
# accesses
#
#----------------------------------------------------------------------------
sub valid_session_key__
{
    my ( $self, $session ) = @_;

    # This provides protection against someone using the XML-RPC
    # interface and calling this API directly to fish for session
    # keys, this must be called from within this module

    return undef if ( caller ne 'Classifier::Bayes' );

    # If the session key is invalid then wait 1 second.  This is done
    # to prevent people from calling a POPFile API such as
    # get_bucket_count with random session keys fishing for a valid
    # key.  The XML-RPC API is single threaded and hence this will
    # delay all use of that API by one second.  Of course in normal
    # use when the user knows the username/password or session key
    # then there is no delay

    if ( !defined( $self->{api_sessions__}{$session} ) ) {
        my ( $package, $filename, $line, $subroutine ) = caller;
        $self->log_( 0, "Invalid session key $session provided in $package @ $line" );
        select( undef, undef, undef, 1 );
    }

    return $self->{api_sessions__}{$session};
}

#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# _____   _____   _____  _______ _____        _______   _______  _____  _____
#|_____] |     | |_____] |______   |   |      |______   |_____| |_____]   |
#|       |_____| |       |       __|__ |_____ |______   |     | |       __|__
#
# The method below are public and may be accessed by other modules.
# All of them may be accessed remotely through the XMLRPC.pm module
# using the XML-RPC protocol
#
# Note that every API function expects to be passed a $session which
# is obtained by first calling get_session_key with a valid username
# and password.  Once done call the method release_session_key.
#
# See POPFile::API for more details
#
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------

#----------------------------------------------------------------------------
#
# get_session_key
#
# $user           The name of an existing user
# $pwd            The user's password
#
# Returns a string based session key if the username and password
# match, or undef if not
#
#----------------------------------------------------------------------------
sub get_session_key
{
    my ( $self, $user, $pwd ) = @_;

    # The password is stored in the database as an MD5 hash of the
    # username and password concatenated and separated by the string
    # __popfile__, so compute the hash here

    my $hash = md5_hex( $user . '__popfile__' . $pwd );

    $self->{db_get_userid__}->execute( $user, $hash );
    my $result = $self->{db_get_userid__}->fetchrow_arrayref;
    if ( !defined( $result ) ) {

        # The delay of one second here is to prevent people from trying out
        # username/password combinations at high speed to determine the
        # credentials of a valid user

        $self->log_( 0, "Attempt to login with incorrect credentials for user $user" );
        select( undef, undef, undef, 1 );
        return undef;
    }

    my $session = $self->generate_unique_session_key__();

    $self->{api_sessions__}{$session} = $result->[0];

    $self->db_update_cache__( $session );

    $self->log_( 1, "get_session_key returning key $session for user $self->{api_sessions__}{$session}" );

    return $session;
}

#----------------------------------------------------------------------------
#
# release_session_key
#
# $session        A session key previously returned by get_session_key
#
# Releases and invalidates the session key
#
#----------------------------------------------------------------------------
sub release_session_key
{
    my ( $self, $session ) = @_;
    
    $self->mq_post_( "RELSE", $session );
}


#----------------------------------------------------------------------------
#
# get_top_bucket__
#
# Helper function used by classify to get the bucket with the highest
# score from data stored in a matrix of information (see definition of
# %matrix in classify for details) and a list of potential buckets
#
# $userid         User ID for database access
# $id             ID of a word in $matrix
# $matrix         Reference to the %matrix hash in classify
# $buckets        Reference to a list of buckets
#
# Returns the bucket in $buckets with the highest score
#
#----------------------------------------------------------------------------
sub get_top_bucket__
{
    my ( $self, $userid, $id, $matrix, $buckets ) = @_;

    my $best_probability = 0;
    my $top_bucket       = 'unclassified';

    for my $bucket (@$buckets) {
        my $probability = 0;
        if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
            $probability = $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket};
        }

        if ( $probability > $best_probability ) {
            $best_probability = $probability;
            $top_bucket       = $bucket;
        }
    }

    return $top_bucket;
}

#----------------------------------------------------------------------------
#
# classify
#
# $session   A valid session key returned by a call to get_session_key
# $file The name of the file containing the text to classify (or undef
# to use the data already in the parser)
# $templ     Reference to the UI template used for word score display
# $matrix (optional) Reference to a hash that will be filled with the
# word matrix used in classification
# $idmap (optional) Reference to a hash that will map word ids in the
# $matrix to actual words
#
# Splits the mail message into valid words, then runs the Bayes
# algorithm to figure out which bucket it belongs in.  Returns the
# bucket name
#
#----------------------------------------------------------------------------
sub classify
{
    my ( $self, $session, $file, $templ, $matrix, $idmap ) = @_;
    my $msg_total = 0;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );

    $self->{magnet_used__}   = 0;
    $self->{magnet_detail__} = 0;

    if ( defined( $file ) ) {
        $self->{parser__}->parse_file( $file,                                           # PROFILE BLOCK START
                                       $self->global_config_( 'message_cutoff'   ) );   # PROFILE BLOCK STOP
    }

    # Check to see if this email should be classified based on a magnet
    # Get the list of buckets

    my @buckets = $self->get_buckets( $session );

    for my $bucket ($self->get_buckets_with_magnets( $session ))  {
        for my $type ($self->get_magnet_types_in_bucket( $session, $bucket )) {
            if ( $self->magnet_match__( $session, $self->{parser__}->get_header($type), $bucket, $type ) ) {
                return $bucket;
            }
        }
    }

    # If the user has not defined any buckets then we escape here
    # return unclassified

    return "unclassified" if ( $#buckets == -1 );

    # The score hash will contain the likelihood that the given
    # message is in each bucket, the buckets are the keys for score

    # Set up the initial score as P(bucket)

    my %score;
    my %matchcount;

    # Build up a list of the buckets that are OK to use for
    # classification (i.e.  that have at least one word in them).

    my @ok_buckets;

    for my $bucket (@buckets) {
        if ( $self->{bucket_start__}{$userid}{$bucket} != 0 ) {
            $score{$bucket} = $self->{bucket_start__}{$userid}{$bucket};
            $matchcount{$bucket} = 0;
            push @ok_buckets, ( $bucket );
        }
    }

    @buckets = @ok_buckets;

    # For each word go through the buckets and calculate
    # P(word|bucket) and then calculate P(word|bucket) ^ word count
    # and multiply to the score

    my $word_count = 0;

    # The correction value is used to generate score displays variable
    # which are consistent with the word scores shown by the GUI's
    # word lookup feature.  It is computed to make the contribution of
    # a word which is unrepresented in a bucket zero.  This correction
    # affects only the values displayed in the display; it has no
    # effect on the classification process.

    my $correction = 0;

    # Classification against the database works in a sequence of steps
    # to get the fastest time possible.  The steps are as follows:
    #
    # 1. Convert the list of words returned by the parser into a list
    #    of unique word ids that can be used in the database.  This
    #    requires a select against the database to get the word ids
    #    (and associated words) which is then converted into two
    #    things: @id_list which is just the sorted list of word ids
    #    and %idmap which maps a word to its id.
    #
    # 2. Then run a second select that get the triplet (count, id,
    #    bucket) for each word id and each bucket.  The triplet
    #    contains the word count from the database for each bucket and
    #    each id, where there is an entry. That data gets loaded into
    #    the sparse matrix %matrix.
    #
    # 3. Do the normal classification loop as before running against
    # the @id_list for the words and for each bucket.  If there's an
    # entry in %matrix for the id/bucket combination then calculate
    # the probability, otherwise use the not_likely probability.
    #
    # NOTE.  Since there is a single not_likely probability we do not
    # worry about the fact that the select in 1 might return a shorter
    # list of words than was found in the message (because some words
    # are not in the database) since the missing words will be the
    # same for all buckets and hence constitute a fixed scaling factor
    # on all the buckets which is irrelevant in deciding which the
    # winning bucket is.

    my $words;
    $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) );
    $self->{get_wordids__} = $self->{db__}->prepare(  # PROFILE BLOCK START
             "select id, word
                  from words
                  where word in ( $words )
                  order by id;" );                    # PROFILE BLOCK STOP
    $self->{get_wordids__}->execute;

    my @id_list;
    my %temp_idmap;

    if ( !defined( $idmap ) ) {
        $idmap = \%temp_idmap;
    }

    while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) {
        push @id_list, ($row->[0]);
        $$idmap{$row->[0]} = $row->[1];
    }

    $self->{get_wordids__}->finish;

    my $ids = join( ',', @id_list );

    $self->{db_classify__} = $self->{db__}->prepare(            # PROFILE BLOCK START
             "select matrix.times, matrix.wordid, buckets.name
                  from matrix, buckets
                  where matrix.wordid in ( $ids )
                    and matrix.bucketid = buckets.id
                    and buckets.userid = $userid;" );           # PROFILE BLOCK STOP

    $self->{db_classify__}->execute;

    # %matrix maps wordids and bucket names to counts
    # $matrix{$wordid}{$bucket} == $count

    my %temp_matrix;

    if ( !defined( $matrix ) ) {
        $matrix = \%temp_matrix;
    }

    while ( my $row = $self->{db_classify__}->fetchrow_arrayref ) {
        $$matrix{$row->[1]}{$row->[2]} = $row->[0];
    }

    $self->{db_classify__}->finish;

    foreach my $id (@id_list) {
        $word_count += 2;
        my $wmax = -10000;

        foreach my $bucket (@buckets) {
            my $probability = 0;

            if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
                $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} );
            }

            $matchcount{$bucket} += $self->{parser__}{words__}{$$idmap{$id}} if ($probability != 0);
            $probability = $self->{not_likely__}{$userid} if ( $probability == 0 );
            $wmax = $probability if ( $wmax < $probability );
            $score{$bucket} += ( $probability * $self->{parser__}{words__}{$$idmap{$id}} );
        }

        if ($wmax > $self->{not_likely__}{$userid}) {
            $correction += $self->{not_likely__}{$userid} * $self->{parser__}{words__}{$$idmap{$id}};
        } else {
            $correction += $wmax * $self->{parser__}{words__}{$$idmap{$id}};
        }
    }

    # Now sort the scores to find the highest and return that bucket
    # as the classification

    my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;

    my %raw_score;
    my $base_score = $score{$ranking[0]};
    my $total = 0;

    # If the first and second bucket are too close in their
    # probabilities, call the message unclassified.  Also if there are
    # fewer than 2 buckets.

    my $class = 'unclassified';

    if ( @buckets > 1 && $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
        $class = $ranking[0];
    }

    # Compute the total of all the scores to generate the normalized
    # scores and probability estimate.  $total is always 1 after the
    # first loop iteration, so any additional term less than 2 ** -54
    # is insignificant, and need not be computed.

    my $ln2p_54 = -54 * log(2);

    foreach my $b (@ranking) {
        $raw_score{$b} = $score{$b};
        $score{$b} -= $base_score;

        $total += exp($score{$b}) if ($score{$b} > $ln2p_54 );
    }

    if ($self->{wordscores__} && defined($templ) ) {
        my %qm = %{$self->{parser__}->quickmagnets()};
        my $mlen = scalar(keys %{$self->{parser__}->quickmagnets()});

        if ( $mlen >= 0 ) {
            $templ->param( 'View_QuickMagnets_If' => 1 );
            $templ->param( 'View_QuickMagnets_Count' => ($mlen + 1) );
            my @buckets = $self->get_buckets( $session );
            my $i = 0;
            my %types = $self->get_magnet_types( $session );

            my @bucket_data;
            foreach my $bucket (@buckets) {
                my %row_data;
                $row_data{View_QuickMagnets_Bucket} = $bucket;
                $row_data{View_QuickMagnets_Bucket_Color} = $self->get_bucket_color( $session, $bucket );
                push ( @bucket_data, \%row_data );
            }

            my @qm_data;
            foreach my $type (sort keys %types) {
                my %row_data;

                if (defined $qm{$type}) {
                    $i++;

                    $row_data{View_QuickMagnets_Type} = $type;
                    $row_data{View_QuickMagnets_I} = $i;
                    $row_data{View_QuickMagnets_Loop_Buckets} = \@bucket_data;

                    my @magnet_data;
                    foreach my $magnet ( @{$qm{$type}} ) {
                        my %row_magnet;
                        $row_magnet{View_QuickMagnets_Magnet} = $magnet;
                        push ( @magnet_data, \%row_magnet );
                    }
                    $row_data{View_QuickMagnets_Loop_Magnets} = \@magnet_data;

                    push ( @qm_data, \%row_data );
                }
            }
            $templ->param( 'View_QuickMagnets_Loop' => \@qm_data );
        }

        $templ->param( 'View_Score_If_Score' => $self->{wmformat__} eq 'score' );
        my $log10 = log(10.0);

        my @score_data;
        foreach my $b (@ranking) {
             my %row_data;
             my $prob = exp($score{$b})/$total;
             my $probstr;
             my $rawstr;

             # If the computed probability would display as 1, display
             # it as .999999 instead.  We don't want to give the
             # impression that POPFile is ever completely sure of its
             # classification.

             if ($prob >= .999999) {
                 $probstr = sprintf("%12.6f", 0.999999);
             } else {
                 if ($prob >= 0.1 || $prob == 0.0) {
                     $probstr = sprintf("%12.6f", $prob);
                 } else {
                    $probstr = sprintf("%17.6e", $prob);
                 }
             }

             my $color = $self->get_bucket_color( $session, $b );

             $row_data{View_Score_Bucket} = $b;
             $row_data{View_Score_Bucket_Color} = $color;
             $row_data{View_Score_MatchCount} = $matchcount{$b};
             $row_data{View_Score_ProbStr} = $probstr;

             if ($self->{wmformat__} eq 'score') {
                 $row_data{View_Score_If_Score} = 1;
                 $rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10);
                 $row_data{View_Score_RawStr} = $rawstr;
             }
             push ( @score_data, \%row_data );
        }
        $templ->param( 'View_Score_Loop_Scores' => \@score_data );

        if ( $self->{wmformat__} ne '' ) {
            $templ->param( 'View_Score_If_Table' => 1 );

            my @header_data;
            foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
                my %row_data;
                my $bucket = $ranking[$ix];
                my $bucketcolor  = $self->get_bucket_color( $session, $bucket );
                $row_data{View_Score_Bucket} = $bucket;
                $row_data{View_Score_Bucket_Color} = $bucketcolor;
                push ( @header_data, \%row_data );
            }
            $templ->param( 'View_Score_Loop_Bucket_Header' => \@header_data );

            my %wordprobs;

            # If the word matrix is supposed to show probabilities,
            # compute them, saving the results in %wordprobs.

            if ( $self->{wmformat__} eq 'prob') {
                foreach my $id (@id_list) {
                    my $sumfreq = 0;
                    my %wval;
                    foreach my $bucket (@ranking) {
                        $wval{$bucket} = $$matrix{$id}{$bucket} || 0;
                        $sumfreq += $wval{$bucket};
                    }

                    # If $sumfreq is still zero then this word didn't
                    # appear in any buckets so we shouldn't create
                    # wordprobs entries for it

                    if ( $sumfreq != 0 ) {
                        foreach my $bucket (@ranking) {
                            $wordprobs{$bucket,$id} = $wval{$bucket} / $sumfreq;
                        }
                    }
                }
            }

            my @ranked_ids;
            if ($self->{wmformat__} eq 'prob') {
                @ranked_ids = sort {($wordprobs{$ranking[0],$b}||0) <=> ($wordprobs{$ranking[0],$a}||0)} @id_list;
            } else {
                @ranked_ids = sort {($$matrix{$b}{$ranking[0]}||0) <=> ($$matrix{$a}{$ranking[0]}||0)} @id_list;
            }

            my @word_data;
            my %chart;
            foreach my $id (@ranked_ids) {
                my %row_data;
                my $known = 0;

                foreach my $bucket (@ranking) {
                    if ( defined( $$matrix{$id}{$bucket} ) ) {
                        $known = 1;
                        last;
                    }
                }

                if ( $known == 1 ) {
                    my $wordcolor = $self->get_bucket_color( $session, $self->get_top_bucket__( $userid, $id, $matrix, \@ranking ) );
                    my $count = $self->{parser__}->{words__}{$$idmap{$id}};

                    $row_data{View_Score_Word} = $$idmap{$id};
                    $row_data{View_Score_Word_Color} = $wordcolor;
                    $row_data{View_Score_Word_Count} = $count;

                    my $base_probability = 0;
                    if ( defined($$matrix{$id}{$ranking[0]}) && ( $$matrix{$id}{$ranking[0]} > 0 ) ) {
                        $base_probability = log( $$matrix{$id}{$ranking[0]} / $self->{db_bucketcount__}{$userid}{$ranking[0]} );
                    }

                    my @per_bucket;
                    my @score;
                    foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
                        my %bucket_row;
                        my $bucket = $ranking[$ix];
                        my $probability = 0;
                        if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) {
                            $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} );
                        }
                        my $color = 'black';

                        if ( $probability >= $base_probability || $base_probability == 0 ) {
                            $color = $self->get_bucket_color( $session, $bucket );
                        }

                        $bucket_row{View_Score_If_Probability} = ( $probability != 0 );
                        $bucket_row{View_Score_Word_Color} = $color;
                        if ( $probability != 0 ) {
                            my $wordprobstr;
                            if ($self->{wmformat__} eq 'score') {
                                $wordprobstr  = sprintf("%12.4f", ($probability - $self->{not_likely__}{$userid})/$log10 );
                                push ( @score, $wordprobstr );
                            } else {
                                if ($self->{wmformat__} eq 'prob') {
                                    $wordprobstr  = sprintf("%12.4f", $wordprobs{$bucket,$id});
                                } else {
                                    $wordprobstr  = sprintf("%13.5f", exp($probability) );
                                }
                            }
                            $bucket_row{View_Score_Probability} = $wordprobstr;
                        }
                        else {
                            # Scores eq 0 must also be remembered.
                            push @score, 0;
                        }
                        push ( @per_bucket, \%bucket_row );
                    }
                    $row_data{View_Score_Loop_Per_Bucket} = \@per_bucket;

                    # If we are doing the word scores then we build up
                    # a hash that maps the name of a word to a value
                    # which is the difference between the word scores
                    # for the top two buckets.  We later use this to
                    # draw a chart

                    if ( $self->{wmformat__} eq 'score' ) {
                        $chart{$$idmap{$id}} = ( $score[0] || 0 ) - ( $score[1] || 0 );
                    }

                    push ( @word_data, \%row_data );
                }
            }
            $templ->param( 'View_Score_Loop_Words' => \@word_data );

            if ( $self->{wmformat__} eq 'score' ) {
                # Draw a chart that shows how the decision between the top
                # two buckets was made.

                my @words = sort { $chart{$b} <=> $chart{$a} } keys %chart;

                my @chart_data;
                my $max_chart = $chart{$words[0]};
                my $min_chart = $chart{$words[$#words]};
                my $scale = ( $max_chart > $min_chart ) ? 400 / ( $max_chart - $min_chart ) : 0;

                my $color_1 = $self->get_bucket_color( $session, $ranking[0] );
                my $color_2 = $self->get_bucket_color( $session, $ranking[1] );

                $templ->param( 'Bucket_1' => $ranking[0] );
                $templ->param( 'Bucket_2' => $ranking[1] );

                $templ->param( 'Color_Bucket_1' => $color_1 );
                $templ->param( 'Color_Bucket_2' => $color_2 );

                $templ->param( 'Score_Bucket_1' => sprintf("%.3f", ($raw_score{$ranking[0]} - $correction)/$log10) );
                $templ->param( 'Score_Bucket_2' => sprintf("%.3f", ($raw_score{$ranking[1]} - $correction)/$log10) );

                for ( my $i=0; $i <= $#words; $i++ ) {
                    my $word_1 = $words[$i];
                    my $word_2 = $words[$#words - $i];

                    my $width_1 = int( $chart{$word_1} * $scale + .5 );
                    my $width_2 = int( $chart{$word_2} * $scale - .5 ) * -1;

                    last if ( $width_1 <=0 && $width_2 <= 0 );
                    
                    my %row_data;

                    $row_data{View_Chart_Word_1} = $word_1;
                    if ( $width_1 > 0 ) {
                        $row_data{View_If_Bar_1} = 1;
                        $row_data{View_Width_1}  = $width_1;
                        $row_data{View_Color_1}  = $color_1;
                        $row_data{Score_Word_1}  = sprintf "%.3f", $chart{$word_1};
                    }
                    else {
                        $row_data{View_If_Bar_1} = 0;
                    }

                    $row_data{View_Chart_Word_2} = $word_2;
                    if ( $width_2 > 0 ) {
                        $row_data{View_If_Bar_2} = 1;
                        $row_data{View_Width_2}  = $width_2;
                        $row_data{View_Color_2}  = $color_2;
                        $row_data{Score_Word_2}  = sprintf "%.3f", $chart{$word_2};
                    }
                    else {
                        $row_data{View_If_Bar_2} = 0;
                    }

                    push ( @chart_data, \%row_data );
                }
                $templ->param( 'View_Loop_Chart' => \@chart_data );
                $templ->param( 'If_chart' => 1 );
            }
            else {
                $templ->param( 'If_chart' => 0 );
            }
        }
    }

    return $class;
}

#----------------------------------------------------------------------------
#
# classify_and_modify
#
# This method reads an email terminated by . on a line by itself (or
# the end of stream) from a handle and creates an entry in the
# history, outputting the same email on another handle with the
# appropriate header modifications and insertions
#
# $session  - A valid session key returned by a call to get_session_key
# $mail     - an open stream to read the email from
# $client   - an open stream to write the modified email to
# $nosave   - set to 1 indicates that this should not save to history
# $class    - if we already know the classification
# $slot     - Must be defined if $class is set
# $echo     - 1 to echo to the client, 0 to supress, defaults to 1
# $crlf     - The sequence to use at the end of a line in the output,
#   normally this is left undefined and this method uses $eol (the
#   normal network end of line), but if this method is being used with
#   real files you may wish to pass in \n instead
#
# Returns a classification if it worked and the slot ID of the history
# item related to this classification
#
# IMPORTANT NOTE: $mail and $client should be binmode
#
#----------------------------------------------------------------------------
sub classify_and_modify
{
    my ( $self, $session, $mail, $client, $nosave, $class, $slot, $echo, $crlf ) = @_;

    $echo = 1    unless (defined $echo);
    $crlf = $eol unless (defined $crlf);

    my $msg_subject;              # The message subject
    my $msg_head_before = '';     # Store the message headers that
                                  # come before Subject here
    my $msg_head_after = '';      # Store the message headers that
                                  # come after Subject here
    my $msg_head_q      = '';     # Store questionable header lines here
    my $msg_body        = '';     # Store the message body here
    my $in_subject_header = 0;    # 1 if in Subject header

    # These two variables are used to control the insertion of the
    # X-POPFile-TimeoutPrevention header when downloading long or slow
    # emails

    my $last_timeout   = time;
    my $timeout_count  = 0;

    # Indicates whether the first time through the receive loop we got
    # the full body, this will happen on small emails

    my $got_full_body  = 0;

    # The size of the message downloaded so far.

    my $message_size   = 0;

    # The classification for this message

    my $classification = '';

    # Whether we are currently reading the mail headers or not

    my $getting_headers = 1;

    my $msg_file;

    # If we don't yet know the classification then start the parser

    $class = '' if ( !defined( $class ) );
    if ( $class eq '' ) {
        $self->{parser__}->start_parse();
        ( $slot, $msg_file ) = $self->{history__}->reserve_slot();
    } else {
        $msg_file = $self->{history__}->get_slot_file( $slot );
    }

    # We append .TMP to the filename for the MSG file so that if we are in
    # middle of downloading a message and we refresh the history we do not
    # get class file errors

    open MSG, ">$msg_file" unless $nosave;

    while ( my $line = $self->slurp_( $mail ) ) {
        my $fileline;

        # This is done so that we remove the network style end of line
        # CR LF and allow Perl to decide on the local system EOL which
        # it will expand out of \n when this gets written to the temp
        # file

        $fileline = $line;
        $fileline =~ s/[\r\n]//g;
        $fileline .= "\n";

        # Check for an abort

        last if ( $self->{alive_} == 0 );

        # The termination of a message is a line consisting of exactly
        # .CRLF so we detect that here exactly

        if ( $line =~ /^\.(\r\n|\r|\n)$/ ) {
            $got_full_body = 1;
            last;
        }

        if ( $getting_headers )  {

            # Kill header lines containing only whitespace (Exim does this)

            next if ( $line =~ /^[ \t]+(\r\n|\r|\n)$/i );

            if ( !( $line =~ /^(\r\n|\r|\n)$/i ) )  {
                $message_size += length $line;
                $self->write_line__( $nosave?undef:\*MSG, $fileline, $class );

                # If there is no echoing occuring, it doesn't matter
                # what we do to these

                if ( $echo ) {
                    if ( $line =~ /^Subject:(.*)/i )  {
                        $msg_subject = $1;
                        $msg_subject =~ s/(\012|\015)//g;
                        $in_subject_header = 1;
                        next;
                    } elsif ( $line !~ /^[ \t]/ ) {
                        $in_subject_header = 0;
                    }

                    # Strip out the X-Text-Classification header that
                    # is in an incoming message

                    next if ( $line =~ /^X-Text-Classification:/i );
                    next if ( $line =~ /^X-POPFile-Link:/i );

                    # Store any lines that appear as though they may
                    # be non-header content Lines that are headers
                    # begin with whitespace or Alphanumerics and "-"
                    # followed by a colon.
                    #
                    # This prevents weird things like HTML before the
                    # headers terminate from causing the XPL and XTC
                    # headers to be inserted in places some clients
                    # can't detect

                    if ( ( $line =~ /^[ \t]/ ) && $in_subject_header ) {
                        $line =~ s/(\012|\015)//g;
                        $msg_subject .= $crlf . $line;
                        next;
                    }

                    if ( $line =~ /^([ \t]|([A-Z0-9\-_]+:))/i ) {
                        if ( !defined($msg_subject) )  {
                            $msg_head_before .= $msg_head_q . $line;
                        } else {
                            $msg_head_after  .= $msg_head_q . $line;
                        }
                        $msg_head_q = '';
                    } else {

                        # Gather up any header lines that are questionable

                        $self->log_( 1, "Found odd email header: $line" );
                        $msg_head_q .= $line;
                    }
                }
            } else {
                $self->write_line__( $nosave?undef:\*MSG, "\n", $class );
                $message_size += length $crlf;
                $getting_headers = 0;
            }
        } else {
            $message_size += length $line;
            $msg_body     .= $line;
            $self->write_line__( $nosave?undef:\*MSG, $fileline, $class );
        }

        # Check to see if too much time has passed and we need to keep
        # the mail client happy

        if ( time > ( $last_timeout + 2 ) ) {
            print $client "X-POPFile-TimeoutPrevention: $timeout_count$crlf" if ( $echo );
            $timeout_count += 1;
            $last_timeout = time;
        }

        last if ( ( $message_size > $self->global_config_( 'message_cutoff' ) ) && ( $getting_headers == 0 ) );
    }

    close MSG unless $nosave;

    # If we don't yet know the classification then stop the parser
    if ( $class eq '' ) {
        $self->{parser__}->stop_parse();
    }

    # Do the text classification and update the counter for that
    # bucket that we just downloaded an email of that type

    $classification = ($class ne '')?$class:$self->classify( $session, undef);

    my $subject_modification = $self->get_bucket_parameter( $session, $classification, 'subject'    );
    my $xtc_insertion        = $self->get_bucket_parameter( $session, $classification, 'xtc'        );
    my $xpl_insertion        = $self->get_bucket_parameter( $session, $classification, 'xpl'        );
    my $quarantine           = $self->get_bucket_parameter( $session, $classification, 'quarantine' );

    my $modification = $self->config_( 'subject_mod_left' ) . $classification . $self->config_( 'subject_mod_right' );

    # Add the Subject line modification or the original line back again
    # Don't add the classification unless it is not present

    if (  ( defined( $msg_subject ) && ( $msg_subject !~ /\Q$modification\E/ ) ) && # PROFILE BLOCK START
          ( $subject_modification == 1 ) &&
          ( $quarantine == 0 ) )  {                                                 # PROFILE BLOCK STOP
         $msg_subject = " $modification$msg_subject";
    }

    if ( !defined( $msg_subject )       &&                                         # PROFILE BLOCK START
         ( $subject_modification == 1 ) &&
         ( $quarantine == 0 ) )  {                                                 # PROFILE BLOCK STOP
         $msg_subject = " $modification";
    }

    $msg_subject = '' if ( !defined( $msg_subject ) );

    $msg_head_before .= 'Subject:' . $msg_subject;
    $msg_head_before .= $crlf;

    # Add the XTC header
    $msg_head_after .= "X-Text-Classification: $classification$crlf" if ( ( $xtc_insertion   ) && # PROFILE BLOCK START
                                                                          ( $quarantine == 0 ) ); # PROFILE BLOCK STOP

    # Add the XPL header

    my $xpl = $self->config_( 'xpl_angle' )?'<':'';

    my $xpl_localhost = ($self->config_( 'localhostname' ) eq '')?"127.0.0.1":$self->config_( 'localhostname' );

    $xpl .= "http://";

    $xpl .= $self->module_config_( 'html', 'local' )?$xpl_localhost:$self->config_( 'hostname' );

    $xpl .= ":" . $self->module_config_( 'html', 'port' ) . "/jump_to_message?view=$slot";

    if ( $self->config_( 'xpl_angle' ) ) {
        $xpl .= '>';
    }

    $xpl .= "$crlf";

    if ( $xpl_insertion && ( $quarantine == 0 ) ) {
        $msg_head_after .= 'X-POPFile-Link: ' . $xpl;
    }

    $msg_head_after .= $msg_head_q . "$crlf";

    # Echo the text of the message to the client

    if ( $echo ) {

        # If the bucket is quarantined then we'll treat it specially
        # by changing the message header to contain information from
        # POPFile and wrapping the original message in a MIME encoding

       if ( $quarantine == 1 ) {
           my ( $orig_from, $orig_to, $orig_subject ) = ( $self->{parser__}->get_header('from'), $self->{parser__}->get_header('to'), $self->{parser__}->get_header('subject') );
           my ( $encoded_from, $encoded_to ) = ( $orig_from, $orig_to );
           if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
               require Encode;

               Encode::from_to( $orig_from, 'euc-jp', 'iso-2022-jp');
               Encode::from_to( $orig_to, 'euc-jp', 'iso-2022-jp');
               Encode::from_to( $orig_subject, 'euc-jp', 'iso-2022-jp');

               $encoded_from = $orig_from;
               $encoded_to = $orig_to;
               $encoded_from =~ s/(\x1B\x24\x42.+\x1B\x28\x42)/"=?ISO-2022-JP?B?" . encode_base64($1,'') . "?="/eg;
               $encoded_to =~ s/(\x1B\x24\x42.+\x1B\x28\x42)/"=?ISO-2022-JP?B?" . encode_base64($1,'') . "?="/eg;
           }

           print $client "From: $encoded_from$crlf";
           print $client "To: $encoded_to$crlf";
           print $client "Date: " . $self->{parser__}->get_header( 'date' ) . "$crlf";
           # Don't add the classification unless it is not present
           if ( ( defined( $msg_subject ) && ( $msg_subject !~ /\[\Q$classification\E\]/ ) ) && # PROFILE BLOCK START
                 ( $subject_modification == 1 ) ) {                                             # PROFILE BLOCK STOP
               $msg_subject = " $modification$msg_subject";
           }
           print $client "Subject:$msg_subject$crlf";
           print $client "X-Text-Classification: $classification$crlf" if ( $xtc_insertion );
           print $client 'X-POPFile-Link: ' . $xpl if ( $xpl_insertion );
           print $client "MIME-Version: 1.0$crlf";
           print $client "Content-Type: multipart/report; boundary=\"$slot\"$crlf$crlf--$slot$crlf";
           print $client "Content-Type: text/plain";
           print $client "; charset=iso-2022-jp" if ( $self->{parser__}->{lang__} eq 'Nihongo' );
           print $client "$crlf$crlf";
           print $client "POPFile has quarantined a message.  It is attached to this email.$crlf$crlf";
           print $client "Quarantined Message Detail$crlf$crlf";

           print $client "Original From: $orig_from$crlf";
           print $client "Original To: $orig_to$crlf";
           print $client "Original Subject: $orig_subject$crlf";

           print $client "To examine the email open the attachment. ";
           print $client "To change this mail's classification go to $xpl";
           print $client "$crlf";
           print $client "The first 20 words found in the email are:$crlf$crlf";

           my $first20 = $self->{parser__}->first20();
           if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
               require Encode;

               Encode::from_to( $first20, 'euc-jp', 'iso-2022-jp');
           }

           print $client $first20;
           print $client "$crlf--$slot$crlf";
           print $client "Content-Type: message/rfc822$crlf$crlf";
        }

        print $client $msg_head_before;
        print $client $msg_head_after;
        print $client $msg_body;
    }

    my $before_dot = '';

    if ( $quarantine && $echo ) {
        $before_dot = "$crlf--$slot--$crlf";
    }

    my $need_dot = 0;

    if ( $got_full_body ) {
        $need_dot = 1;
    } else {
        $need_dot = !$self->echo_to_dot_( $mail, $echo?$client:undef, $nosave?undef:'>>' . $msg_file, $before_dot ) && !$nosave;
    }

    if ( $need_dot ) {
        print $client $before_dot if ( $before_dot ne '' );
        print $client ".$crlf"    if ( $echo );
    }

    # In some cases it's possible (and totally illegal) to get a . in
    # the middle of the message, to cope with the we call flush_extra_
    # here to remove any extra stuff the POP3 server is sending Make
    # sure to supress output if we are not echoing, and to save to
    # file if not echoing and saving

    if ( !($nosave || $echo) ) {

        # if we're saving (not nosave) and not echoing, we can safely
        # unload this into the temp file

        if (open FLUSH, ">$msg_file.flush") {
            binmode FLUSH;

            # TODO: Do this in a faster way (without flushing to one
            # file then copying to another) (perhaps a select on $mail
            # to predict if there is flushable data)

            $self->flush_extra_( $mail, \*FLUSH, 0);
            close FLUSH;

            # append any data we got to the actual temp file

            if ( ( (-s "$msg_file.flush") > 0 ) && ( open FLUSH, "<$msg_file.flush" ) ) {
                binmode FLUSH;
                if ( open TEMP, ">>$msg_file" ) {
                    binmode TEMP;

                    # The only time we get data here is if it is after
                    # a CRLF.CRLF We have to re-create it to avoid
                    # data-loss

                    print TEMP ".$crlf";

                    print TEMP $_ while (<FLUSH>);

                    # NOTE: The last line flushed MAY be a CRLF.CRLF,
                    # which isn't actually part of the message body

                    close TEMP;
                }
                close FLUSH;
            }
            unlink("$msg_file.flush");
        }
    } else {

        # if we are echoing, the client can make sure we have no data
        # loss otherwise, the data can be discarded (not saved and not
        # echoed)

        $self->flush_extra_( $mail, $client, $echo?0:1);
    }

    if ( $class eq '' ) {
        if ( $nosave ) {
            $self->{history__}->release_slot( $slot );
        } else {
            $self->{history__}->commit_slot( $session, $slot, $classification, $self->{magnet_detail__} );
        }
    }

    return ( $classification, $slot, $self->{magnet_used__} );
}

#----------------------------------------------------------------------------
#
# get_buckets
#
# Returns a list containing all the real bucket names sorted into
# alphabetic order
#
# $session   A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_buckets
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Note that get_buckets does not return pseudo buckets

    my @buckets;

    for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
        if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 0 ) {
            push @buckets, ($b);
        }
    }

    return @buckets;
}

#----------------------------------------------------------------------------
#
# get_bucket_id
#
# Returns the internal ID for a bucket for database calls
#
# $session   A valid session key returned by a call to get_session_key
# $bucket    The bucket name
#
#----------------------------------------------------------------------------
sub get_bucket_id
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return $self->{db_bucketid__}{$userid}{$bucket}{id};
}

#----------------------------------------------------------------------------
#
# get_bucket_name
#
# Returns the name of a bucket from an internal ID
#
# $session   A valid session key returned by a call to get_session_key
# $id        The bucket id
#
#----------------------------------------------------------------------------
sub get_bucket_name
{
    my ( $self, $session, $id ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    foreach $b (keys %{$self->{db_bucketid__}{$userid}}) {
        if ( $id == $self->{db_bucketid__}{$userid}{$b}{id} ) {
            return $b;
        }
    }

    return '';
}

#----------------------------------------------------------------------------
#
# get_pseudo_buckets
#
# Returns a list containing all the pseudo bucket names sorted into
# alphabetic order
#
# $session   A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_pseudo_buckets
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my @buckets;

    for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
        if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 1 ) {
            push @buckets, ($b);
        }
    }

    return @buckets;
}

#----------------------------------------------------------------------------
#
# get_all_buckets
#
# Returns a list containing all the bucket names sorted into
# alphabetic order
#
# $session   A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_all_buckets
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my @buckets;

    for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) {
         push @buckets, ($b);
    }

    return @buckets;
}

#----------------------------------------------------------------------------
#
# is_pseudo_bucket
#
# Returns 1 if the named bucket is pseudo
#
# $session   A valid session key returned by a call to get_session_key
# $bucket    The bucket to check
#
#----------------------------------------------------------------------------
sub is_pseudo_bucket
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return ( defined($self->{db_bucketid__}{$userid}{$bucket})   # PROFILE BLOCK START
          && $self->{db_bucketid__}{$userid}{$bucket}{pseudo} ); # PROFILE BLOCK STOP
}

#----------------------------------------------------------------------------
#
# is_bucket
#
# Returns 1 if the named bucket is a bucket
#
# $session   A valid session key returned by a call to get_session_key
# $bucket    The bucket to check
#
#----------------------------------------------------------------------------
sub is_bucket
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return ( ( defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) &&  # PROFILE BLOCK START
             ( !$self->{db_bucketid__}{$userid}{$bucket}{pseudo} ) );    # PROFILE BLOCK STOP
}

#----------------------------------------------------------------------------
#
# get_bucket_word_count
#
# Returns the total word count (including duplicates) for the passed in bucket
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket for which the word count is desired
#
#----------------------------------------------------------------------------
sub get_bucket_word_count
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $c = $self->{db_bucketcount__}{$userid}{$bucket};

    return defined($c)?$c:0;
}

#----------------------------------------------------------------------------
#
# get_bucket_word_list
#
# Returns a list of words all with the same first character
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket for which the word count is desired
# $prefix      The first character of the words
#
#----------------------------------------------------------------------------
sub get_bucket_word_list
{
    my ( $self, $session, $bucket, $prefix ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $result = $self->{db__}->selectcol_arrayref(  # PROFILE BLOCK START
        "select words.word from matrix, words
         where matrix.wordid  = words.id and
               matrix.bucketid = $bucketid and
               words.word like '$prefix%';");        # PROFILE BLOCK STOP

    return @{$result};
}

#----------------------------------------------------------------------------
#
# get_bucket_word_prefixes
#
# Returns a list of all the initial letters of words in a bucket
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket for which the word count is desired
#
#----------------------------------------------------------------------------
sub get_bucket_word_prefixes
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $prev = '';

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $result = $self->{db__}->selectcol_arrayref(   # PROFILE BLOCK START
        "select words.word from matrix, words
         where matrix.wordid  = words.id and
               matrix.bucketid = $bucketid;");        # PROFILE BLOCK STOP

    # In Japanese mode, disable locale and use substr_euc, the substr
    # function which supports EUC Japanese charset.  Sorting Japanese
    # with "use locale" is memory and time consuming, and may cause
    # perl crash.

    if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
        return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc__($_,0,1)} @{$result};
    } else {
        if  ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
            return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} @{$result};
        } else {
            return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)}  @{$result};
        }
    }
}

#----------------------------------------------------------------------------
#
# get_word_count
#
# Returns the total word count (including duplicates)
#
# $session   A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_word_count
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{db_get_full_total__}->execute( $userid );
    return $self->{db_get_full_total__}->fetchrow_arrayref->[0];
}

#----------------------------------------------------------------------------
#
# get_count_for_word
#
# Returns the number of times the word occurs in a bucket
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          The bucket we are asking about
# $word            The word we are asking about
#
#----------------------------------------------------------------------------
sub get_count_for_word
{
    my ( $self, $session, $bucket, $word ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return $self->get_base_value_( $session, $bucket, $word );
}

#----------------------------------------------------------------------------
#
# get_bucket_unique_count
#
# Returns the unique word count (excluding duplicates) for the passed
# in bucket
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket for which the word count is desired
#
#----------------------------------------------------------------------------
sub get_bucket_unique_count
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $c = $self->{db_bucketunique__}{$userid}{$bucket};

    return defined($c)?$c:0;
}

#----------------------------------------------------------------------------
#
# get_unique_word_count
#
# Returns the unique word count (excluding duplicates) for all buckets
#
# $session   A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_unique_word_count
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{db_get_unique_word_count__}->execute( $userid );
    return $self->{db_get_unique_word_count__}->fetchrow_arrayref->[0];
}

#----------------------------------------------------------------------------
#
# get_bucket_color
#
# Returns the color associated with a bucket
#
# $session   A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket for which the color is requested
#
# NOTE  This API is DEPRECATED in favor of calling get_bucket_parameter for
#       the parameter named 'color'
#----------------------------------------------------------------------------
sub get_bucket_color
{
    my ( $self, $session, $bucket ) = @_;

    return $self->get_bucket_parameter( $session, $bucket, 'color' );
}

#----------------------------------------------------------------------------
#
# set_bucket_color
#
# Returns the color associated with a bucket
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket for which the color is requested
# $color       The new color
#
# NOTE  This API is DEPRECATED in favor of calling set_bucket_parameter for
#       the parameter named 'color'
#----------------------------------------------------------------------------
sub set_bucket_color
{
    my ( $self, $session, $bucket, $color ) = @_;

    return $self->set_bucket_parameter( $session, $bucket, 'color', $color );
}

#----------------------------------------------------------------------------
#
# get_bucket_parameter
#
# Returns the value of a per bucket parameter
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket
# $parameter   The name of the parameter
#
#----------------------------------------------------------------------------
sub get_bucket_parameter
{
    my ( $self, $session, $bucket, $parameter ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # See if there's a cached value

    if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) {
        return $self->{db_parameters__}{$userid}{$bucket}{$parameter};
    }

    # Make sure that the bucket passed in actually exists

    if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) {
        return undef;
    }

    # If there is a non-default value for this parameter then return it.

    $self->{db_get_bucket_parameter__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, $self->{db_parameterid__}{$parameter} );
    my $result = $self->{db_get_bucket_parameter__}->fetchrow_arrayref;

    # If this parameter has not been defined for this specific bucket then
    # get the default value

    if ( !defined( $result ) ) {
        $self->{db_get_bucket_parameter_default__}->execute(  # PROFILE BLOCK START
            $self->{db_parameterid__}{$parameter} );          # PROFILE BLOCK STOP
        $result = $self->{db_get_bucket_parameter_default__}->fetchrow_arrayref;
    }

    if ( defined( $result ) ) {
        $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $result->[0];
        return $result->[0];
    } else {
        return undef;
    }
}

#----------------------------------------------------------------------------
#
# set_bucket_parameter
#
# Sets the value associated with a bucket specific parameter
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The name of the bucket
# $parameter   The name of the parameter
# $value       The new value
#
#----------------------------------------------------------------------------
sub set_bucket_parameter
{
    my ( $self, $session, $bucket, $parameter, $value ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Make sure that the bucket passed in actually exists

    if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) {
        return undef;
    }

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $btid     = $self->{db_parameterid__}{$parameter};

    # Exactly one row should be affected by this statement

    $self->{db_set_bucket_parameter__}->execute( $bucketid, $btid, $value );

    if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) {
        $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $value;
    }

    return 1;
}

#----------------------------------------------------------------------------
#
# get_html_colored_message
#
# Parser a mail message stored in a file and returns HTML representing
# the message with coloring of the words
#
# $session        A valid session key returned by a call to get_session_key
# $file           The file to parse
#
#----------------------------------------------------------------------------
sub get_html_colored_message
{
    my ( $self, $session, $file ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{parser__}->{color__} = $session;
    $self->{parser__}->{color_matrix__} = undef;
    $self->{parser__}->{color_idmap__}  = undef;
    $self->{parser__}->{color_userid__} = undef;
    $self->{parser__}->{bayes__} = bless $self;
    
    my $result = $self->{parser__}->parse_file( $file,   # PROFILE BLOCK START
           $self->global_config_( 'message_cutoff'   ) ); # PROFILE BLOCK STOP

    $self->{parser__}->{color__} = '';

    return $result;
}

#----------------------------------------------------------------------------
#
# fast_get_html_colored_message
#
# Parser a mail message stored in a file and returns HTML representing the message
# with coloring of the words
#
# $session        A valid session key returned by a call to get_session_key
# $file           The file to colorize
# $matrix         Reference to the matrix hash from a call to classify
# $idmap          Reference to the idmap hash from a call to classify
#
#----------------------------------------------------------------------------
sub fast_get_html_colored_message
{
    my ( $self, $session, $file, $matrix, $idmap ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{parser__}->{color__}        = $session;
    $self->{parser__}->{color_matrix__} = $matrix;
    $self->{parser__}->{color_idmap__}  = $idmap;
    $self->{parser__}->{color_userid__} = $userid;
    $self->{parser__}->{bayes__}        = bless $self;

    my $result = $self->{parser__}->parse_file( $file,
                                                $self->global_config_( 'message_cutoff'   ) );

    $self->{parser__}->{color__} = '';

    return $result;
}

#----------------------------------------------------------------------------
#
# create_bucket
#
# Creates a new bucket, returns 1 if the creation succeeded
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      Name for the new bucket
#
#----------------------------------------------------------------------------
sub create_bucket
{
    my ( $self, $session, $bucket ) = @_;

    if ( $self->is_bucket( $session, $bucket ) ||           # PROFILE BLOCK START
         $self->is_pseudo_bucket( $session, $bucket ) ) {   # PROFILE BLOCK STOP
        return 0;
    }

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $bucket = $self->{db__}->quote( $bucket );

    $self->{db__}->do(                                                                    # PROFILE BLOCK START
        "insert into buckets ( name, pseudo, userid ) values ( $bucket, 0, $userid );" ); # PROFILE BLOCK STOP
    $self->db_update_cache__( $session );

    return 1;
}

#----------------------------------------------------------------------------
#
# delete_bucket
#
# Deletes a bucket, returns 1 if the delete succeeded
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      Name of the bucket to delete
#
#----------------------------------------------------------------------------
sub delete_bucket
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Make sure that the bucket passed in actually exists

    if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) {
        return 0;
    }

    $self->{db__}->do(                                                                        # PROFILE BLOCK START
        "delete from buckets where buckets.userid = $userid and buckets.name = '$bucket';" ); # PROFILE BLOCK STOP
    $self->db_update_cache__( $session );

    return 1;
}

#----------------------------------------------------------------------------
#
# rename_bucket
#
# Renames a bucket, returns 1 if the rename succeeded
#
# $session             A valid session key returned by a call to get_session_key
# $old_bucket          The old name of the bucket
# $new_bucket          The new name of the bucket
#
#----------------------------------------------------------------------------
sub rename_bucket
{
    my ( $self, $session, $old_bucket, $new_bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Make sure that the bucket passed in actually exists

    if ( !defined( $self->{db_bucketid__}{$userid}{$old_bucket} ) ) {
        $self->log_( 0, "Bad bucket name $old_bucket to rename_bucket" );
        return 0;
    }

    my $id = $self->{db__}->quote( $self->{db_bucketid__}{$userid}{$old_bucket}{id} );
    $new_bucket = $self->{db__}->quote( $new_bucket );

    $self->log_( 1, "Rename bucket $old_bucket to $new_bucket" );

    my $result = $self->{db__}->do( "update buckets set name = $new_bucket where id = $id;" );

    if ( !defined( $result ) || ( $result == -1 ) ) {
        return 0;
    } else {
        $self->db_update_cache__( $session );
        return 1;
    }
}

#----------------------------------------------------------------------------
#
# add_messages_to_bucket
#
# Parses mail messages and updates the statistics in the specified bucket
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          Name of the bucket to be updated
# @files           List of file names to parse
#
#----------------------------------------------------------------------------
sub add_messages_to_bucket
{
    my ( $self, $session, $bucket, @files ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    if ( !defined( $self->{db_bucketid__}{$userid}{$bucket}{id} ) ) {
        return 0;
    }

    # This is done to clear out the word list because in the loop
    # below we are going to not reset the word list on each parse

    $self->{parser__}->start_parse();
    $self->{parser__}->stop_parse();

    foreach my $file (@files) {
        $self->{parser__}->parse_file( $file,  # PROFILE BLOCK START
            $self->global_config_( 'message_cutoff'   ),
            0 );  # PROFILE BLOCK STOP (Do not reset word list)
    }

    $self->add_words_to_bucket__( $session, $bucket, 1 );
    $self->db_update_cache__( $session );

    return 1;
}

#----------------------------------------------------------------------------
#
# add_message_to_bucket
#
# Parses a mail message and updates the statistics in the specified bucket
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          Name of the bucket to be updated
# $file            Name of file containing mail message to parse
#
#----------------------------------------------------------------------------
sub add_message_to_bucket
{
    my ( $self, $session, $bucket, $file ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return $self->add_messages_to_bucket( $session, $bucket, $file );
}

#----------------------------------------------------------------------------
#
# remove_message_from_bucket
#
# Parses a mail message and updates the statistics in the specified bucket
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          Name of the bucket to be updated
# $file            Name of file containing mail message to parse
#
#----------------------------------------------------------------------------
sub remove_message_from_bucket
{
    my ( $self, $session, $bucket, $file ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    $self->{parser__}->parse_file( $file,               # PROFILE BLOCK START
         $self->global_config_( 'message_cutoff'   ) ); # PROFILE BLOCK STOP
    $self->add_words_to_bucket__( $session, $bucket, -1 );

    $self->db_update_cache__( $session );

    return 1;
}

#----------------------------------------------------------------------------
#
# get_buckets_with_magnets
#
# Returns the names of the buckets for which magnets are defined
#
# $session     A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_buckets_with_magnets
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my @result;

    $self->{db_get_buckets_with_magnets__}->execute( $userid );
    while ( my $row = $self->{db_get_buckets_with_magnets__}->fetchrow_arrayref ) {
        push @result, ($row->[0]);
    }

    return @result;
}

#----------------------------------------------------------------------------
#
# get_magnet_types_in_bucket
#
# Returns the types of the magnets in a specific bucket
#
# $session     A valid session key returned by a call to get_session_key
# $bucket      The bucket to search for magnets
#
#----------------------------------------------------------------------------
sub get_magnet_types_in_bucket
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my @result;

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $h = $self->{db__}->prepare( "select magnet_types.mtype from magnet_types, magnets, buckets
        where magnet_types.id = magnets.mtid and
              magnets.bucketid = buckets.id and
              buckets.id = $bucketid
              group by magnet_types.mtype
              order by magnet_types.mtype;" );

    $h->execute;
    while ( my $row = $h->fetchrow_arrayref ) {
        push @result, ($row->[0]);
    }
    $h->finish;

    return @result;
}

#----------------------------------------------------------------------------
#
# clear_bucket
#
# Removes all words from a bucket
#
# $session        A valid session key returned by a call to get_session_key
# $bucket         The bucket to clear
#
#----------------------------------------------------------------------------
sub clear_bucket
{
    my ( $self, $session, $bucket ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};

    $self->{db__}->do( "delete from matrix where matrix.bucketid = $bucketid;" );
    $self->db_update_cache__( $session );
}

#----------------------------------------------------------------------------
#
# clear_magnets
#
# Removes every magnet currently defined
#
# $session     A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub clear_magnets
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    for my $bucket (keys %{$self->{db_bucketid__}{$userid}}) {
        my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
        $self->{db__}->do( "delete from magnets where magnets.bucketid = $bucketid" );
    }
}

#----------------------------------------------------------------------------
#
# get_magnets
#
# Returns the magnets of a certain type in a bucket
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          The bucket to search for magnets
# $type            The magnet type (e.g. from, to or subject)
#
#----------------------------------------------------------------------------
sub get_magnets
{
    my ( $self, $session, $bucket, $type ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my @result;

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $h = $self->{db__}->prepare( "select magnets.val from magnets, magnet_types
        where magnets.bucketid = $bucketid and
              magnets.id != 0 and
              magnet_types.id = magnets.mtid and
              magnet_types.mtype = '$type' order by magnets.val;" );

    $h->execute;
    while ( my $row = $h->fetchrow_arrayref ) {
        push @result, ($row->[0]);
    }
    $h->finish;

    return @result;
}

#----------------------------------------------------------------------------
#
# create_magnet
#
# Make a new magnet
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          The bucket the magnet belongs in
# $type            The magnet type (e.g. from, to or subject)
# $text            The text of the magnet
#
#----------------------------------------------------------------------------
sub create_magnet
{
    my ( $self, $session, $bucket, $type, $text ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types
                                                        where magnet_types.mtype = '$type';" );

    my $mtid = $result->[0];

    $text = $self->{db__}->quote( $text );

    $self->{db__}->do( "insert into magnets ( bucketid, mtid, val )
                                     values ( $bucketid, $mtid, $text );" );
}

#----------------------------------------------------------------------------
#
# get_magnet_types
#
# Get a hash mapping magnet types (e.g. from) to magnet names (e.g. From);
#
# $session     A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_magnet_types
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my %result;

    my $h = $self->{db__}->prepare( "select magnet_types.mtype, magnet_types.header from magnet_types order by mtype;" );

    $h->execute;
    while ( my $row = $h->fetchrow_arrayref ) {
        $result{$row->[0]} = $row->[1];
    }
    $h->finish;

    return %result;
}

#----------------------------------------------------------------------------
#
# delete_magnet
#
# Remove a new magnet
#
# $session         A valid session key returned by a call to get_session_key
# $bucket          The bucket the magnet belongs in
# $type            The magnet type (e.g. from, to or subject)
# $text            The text of the magnet
#
#----------------------------------------------------------------------------
sub delete_magnet
{
    my ( $self, $session, $bucket, $type, $text ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};
    my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types
                                                        where magnet_types.mtype = '$type';" );

    my $mtid = $result->[0];

    $self->{db__}->do( "delete from magnets
                            where magnets.bucketid = $bucketid and
                                  magnets.mtid = $mtid and
                                  magnets.val  = '$text';" );
}

#----------------------------------------------------------------------------
#
# get_stopword_list
#
# Gets the complete list of stop words
#
# $session     A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub get_stopword_list
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    return $self->{parser__}->{mangle__}->stopwords();
}

#----------------------------------------------------------------------------
#
# magnet_count
#
# Gets the number of magnets that are defined
#
# $session     A valid session key returned by a call to get_session_key
#
#----------------------------------------------------------------------------
sub magnet_count
{
    my ( $self, $session ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    my $result = $self->{db__}->selectrow_arrayref( "select count(*) from magnets, buckets
        where buckets.userid = $userid and
              magnets.id != 0 and
              magnets.bucketid = buckets.id;" );

    if ( defined( $result ) ) {
        return $result->[0];
    } else {
        return 0;
    }
}

#----------------------------------------------------------------------------
#
# add_stopword, remove_stopword
#
# Adds or removes a stop word
#
# $session     A valid session key returned by a call to get_session_key
# $stopword    The word to add or remove
#
# Return 0 for a bad stop word, and 1 otherwise
#
#----------------------------------------------------------------------------
sub add_stopword
{
    my ( $self, $session, $stopword ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Pass language parameter to add_stopword()

    return $self->{parser__}->{mangle__}->add_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
}

sub remove_stopword
{
    my ( $self, $session, $stopword ) = @_;

    my $userid = $self->valid_session_key__( $session );
    return undef if ( !defined( $userid ) );

    # Pass language parameter to remove_stopword()

    return $self->{parser__}->{mangle__}->remove_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
}

#----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# _____   _____   _____  _______ _____        _______   _______  _____  _____
#|_____] |     | |_____] |______   |   |      |______   |_____| |_____]   |
#|       |_____| |       |       __|__ |_____ |______   |     | |       __|__
#
#----------------------------------------------------------------------------
#----------------------------------------------------------------------------

# GETTERS/SETTERS

sub wordscores
{
    my ( $self, $value ) = @_;

    $self->{wordscores__} = $value if (defined $value);
    return $self->{wordscores__};
}

sub wmformat
{
    my ( $self, $value ) = @_;

    $self->{wmformat__} = $value if (defined $value);
    return $self->{wmformat__};
}

sub db
{
    my ( $self ) = @_;

    return $self->{db__};
}

sub history
{
    my ( $self, $history ) = @_;

    $self->{history__} = $history;
}

1;



syntax highlighted by Code2HTML, v. 0.9.1