/* $Id: bmf.c,v 1.20 2002/10/20 18:19:17 tommy Exp $ */

/*
 * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
 *
 * This program is free software.  It may be distributed under the terms
 * in the file LICENSE, found in the top level of the distribution.
 *
 * bmf.c: top level Bayesian mail filter app.
 */

#include "config.h"
#include "dbg.h"
#include "str.h"
#include "lex.h"
#include "vec.h"
#include "dbh.h"
#include "filt.h"

/*
 * Operating modes.  They are mutually exclusive: a run is governed by
 * exactly one of them.  The numeric values are spelled out explicitly and
 * match the implicit numbering of the declaration order.
 */
typedef enum
{
    mode_test   = 0,    /* test the message and produce a report */
    mode_normal = 1,    /* test the message and register the result */
    mode_reg_s  = 2,    /* register the message as spam */
    mode_reg_n  = 3,    /* register the message as non-spam */
    mode_n_to_s = 4,    /* undo a non-spam registration, register as spam */
    mode_s_to_n = 5     /* undo a spam registration, register as non-spam */
} runmode_t;

/*
 * Write the command-line help text to stdout and terminate the process
 * with exit status 2.  This function does not return.
 */
static void usage( void )
{
    /* synopsis line */
    printf( "\n"
            "Usage: " PACKAGE " [mode] [options]\n"
            "\n" );

    /* mode switches */
    printf( "Modes of operation (mutually exclusive; the last one specified is used):\n"
            "\t\tRegister message using historical data if no mode is specified.\n"
            "\t-n\tRegister message as non-spam.\n"
            "\t-s\tRegister message as spam.\n"
            "\t-N\tRegister message as non-spam and undo prior registration as spam.\n"
            "\t-S\tRegister message as spam and undo prior registration as non-spam.\n"
            "\t-t\tTest mode, print report and do not save results.\n"
            "\n" );

    /* everything else */
    printf( "Other options:\n"
            "\t-f fmt\tSpecify database format (text|db|mysql).\n"
            "\t-d db\tSpecify database or directory name.\n"
            "\t-i file\tSpecify file to read instead of stdin.\n"
            "\t-k n\tSpecify count of extrema to use (keepers), default is 15.\n"
            "\t-m type\t[DEPRECATED] Specify mail storage format (mbox|maildir)\n"
            "\t-p\tPassthrough mode, like SpamAssassin.\n"
            "\t-v\tIncrease verbosity level.\n"
            "\t-V\tShow version information and exit.\n"
            "\t-h\tShow this message and exit.\n"
            "\n" );

    exit( 2 );
}

static void version( void )
{
    printf( "\n"
            PACKAGE " version " VERSION " - a Bayesian mail filter\n"
            "Copyright (c) 2002 Tom Marshall\n"
            "\n"
            PACKAGE " comes with ABSOLUTELY NO WARRANTY.\n"
            "This is free software.  You are welcome to redistribute it under the terms\n"
            "of the GNU General Public License.  See the file LICENSE in the source\n"
            "distribution, or visit http://www.gnu.org/licenses/gpl.html\n"
            "\n" );
    exit( 2 );
}

/*
 * Program entry point.
 *
 * Parses the command line, opens the input (stdin, or the file given with
 * -i) and the word-list database, then handles every message found in the
 * input.  For each message the "spamlist" and "goodlist" tables are
 * opened, the message is loaded into a word vector, and the selected mode
 * decides whether the message is scored with bayesfilt() and/or merged
 * into (or unmerged from) the tables.
 *
 * Exit status: 0 when passthrough mode is on or the last message handled
 * was classified/registered as spam, 1 otherwise.  All error paths exit
 * with status 2.
 */
int main( int argc, char** argv )
{
    int         ch;
    dbfmt_t     dbfmt = db_db;
    char*       dbname = NULL;
    bool_t      rdonly;

    runmode_t   mode = mode_normal;
    mbox_t      mboxtype = detect;
    bool_t      do_passthru = false;

    dbh_t*      pdb;
    dbt_t*      pblist;
    dbt_t*      pglist;
    dbt_t*      ptable;
    vec_t       mlist;
    stats_t     stats;
    lex_t       lex;
    tok_t       tok;
    bool_t      is_spam = false;    /* always assigned in the message loop;
                                       initialized so the final return never
                                       reads an indeterminate value */

    int fd = STDIN_FILENO;
    char* infile = NULL;

    char*       endp = NULL;        /* strtol() end pointer for -k */
    long        nkeep = 0;          /* parsed -k argument */

    srand(time(NULL));
    atexit( dump_alloc_heap );

    /* default database format depends on whether libdb was configured in */
#ifdef HAVE_LIBDB
    dbfmt = db_db;
#else
    dbfmt = db_text;
#endif

    stats.keepers = DEF_KEEPERS;
    /* POSIX getopt() returns -1 (not necessarily EOF) when options run out */
    while( (ch = getopt( argc, argv, "NSVd:f:i:hk:m:npstv" )) != -1 )
    {
        switch( ch )
        {
        case 'N':
            mode = mode_s_to_n;
            break;
        case 'S':
            mode = mode_n_to_s;
            break;
        case 'V':
            version();
            break;  /* notreached */
        case 'd':
            /* a repeated -d replaces the earlier value */
            free( dbname );
            dbname = strdup( optarg );
            if( dbname == NULL )
            {
                fprintf( stderr, "%s: out of memory\n", argv[0] );
                exit( 2 );
            }
            break;
        case 'f':
            if( strcasecmp( optarg, "text" ) == 0 )
            {
                dbfmt = db_text;
            }
            else if( strcasecmp( optarg, "db" ) == 0 )
            {
                dbfmt = db_db;
            }
            else if( strcasecmp( optarg, "mysql" ) == 0 )
            {
                dbfmt = db_mysql;
            }
            else
            {
                usage();
            }
            break;
        case 'h':
            usage();
            break;  /* notreached */
        case 'i':
            /* a repeated -i replaces the earlier value */
            free( infile );
            infile = strdup( optarg );
            if( infile == NULL )
            {
                /* without this check a failed copy would silently fall
                   back to reading stdin */
                fprintf( stderr, "%s: out of memory\n", argv[0] );
                exit( 2 );
            }
            break;
        case 'k':
            /* atoi() cannot report errors, so garbage, zero and negative
               counts used to slip through into the allocation below;
               parse strictly and insist on a positive count */
            errno = 0;
            nkeep = strtol( optarg, &endp, 10 );
            if( errno != 0 || endp == optarg || *endp != '\0' || nkeep < 1 )
            {
                fprintf( stderr, "%s: invalid keeper count '%s'\n",
                         argv[0], optarg );
                exit( 2 );
            }
            stats.keepers = nkeep;
            /* reject values that do not survive the store into keepers */
            if( (long)stats.keepers != nkeep )
            {
                fprintf( stderr, "%s: keeper count '%s' is too large\n",
                         argv[0], optarg );
                exit( 2 );
            }
            break;
        case 'm':
            if( strcasecmp( optarg, "mbox" ) == 0 )
            {
                mboxtype = mbox;
            }
            else if( strcasecmp( optarg, "maildir" ) == 0 )
            {
                mboxtype = maildir;
            }
            else
            {
                usage();
            }
            break;
        case 'n':
            mode = mode_reg_n;
            break;
        case 'p':
            do_passthru = true;
            break;
        case 's':
            mode = mode_reg_s;
            break;
        case 't':
            mode = mode_test;
            break;
        case 'v':
            g_verbose++;
            verbose( 1, "Verbose level now %u\n", g_verbose );
            break;
        default:
            usage();
        }
    }

    /* guard the size computation against wrap-around before allocating */
    if( (size_t)stats.keepers > ((size_t)-1) / sizeof(discrim_t) )
    {
        fprintf( stderr, "%s: keeper count is too large\n", argv[0] );
        exit( 2 );
    }
    stats.extrema = (discrim_t*)malloc( stats.keepers*sizeof(discrim_t) );
    if( stats.extrema == NULL )
    {
        fprintf( stderr, "%s: out of memory\n", argv[0] );
        exit( 2 );
    }

    if( infile != NULL )
    {
        fd = open( infile, O_RDONLY );
        if( fd == -1 )
        {
            fprintf( stderr, "%s: cannot open input file '%s': %s\n",
                     argv[0], infile, strerror(errno) );
            exit( 2 );
        }
    }

    pdb = dbh_open( dbfmt, "localhost", dbname, DB_USER, DB_PASS );
    if( pdb == NULL )
    {
        fprintf( stderr, "%s: cannot open database\n", argv[0] );
        exit( 2 );
    }

    lex_create( &lex, mboxtype );
    if( !lex_load( &lex, fd ) )
    {
        fprintf( stderr, "%s: cannot read input\n", argv[0] );
        exit( 2 );
    }
    lex_nexttoken( &lex, &tok );
    if( tok.tt == eof )
    {
        fprintf( stderr, "%s: no input available\n", argv[0] );
        exit( 2 );
    }

    /* one iteration per message; the check above guarantees at least one */
    while( tok.tt != eof )
    {
        if( mboxtype == mbox && tok.tt != from )
        {
            fprintf( stderr, "%s: input does not look like an mbox message\n", argv[0] );
            exit( 2 );
        }

        /* the spam list is opened read-only in the modes that never
           modify it */
        rdonly = (mode == mode_test || mode == mode_reg_n);
        pblist = pdb->opentable( pdb, "spamlist", rdonly );
        if( pblist == NULL )
        {
            fprintf( stderr, "%s: cannot open spamlist\n", argv[0] );
            exit( 2 );
        }

        /* likewise for the good list */
        rdonly = (mode == mode_test || mode == mode_reg_s);
        pglist = pdb->opentable( pdb, "goodlist", rdonly );
        if( pglist == NULL )
        {
            fprintf( stderr, "%s: cannot open goodlist\n", argv[0] );
            exit( 2 );
        }

        vec_create( &mlist );
        bvec_loadmsg( &mlist, &lex, &tok );

        switch( mode )
        {
        case mode_test:
            /* score only; nothing is written back */
            bayesfilt( pglist, pblist, &mlist, &stats );
            is_spam = (stats.spamicity > SPAM_CUTOFF);
            break;
        case mode_normal:
            /* score, then merge the message into whichever list it
               was classified as */
            bayesfilt( pglist, pblist, &mlist, &stats );
            is_spam = (stats.spamicity > SPAM_CUTOFF);
            ptable = (is_spam ? pblist : pglist);
            svec_sort( &mlist );
            if( !ptable->mergeclose( ptable, &mlist ) )
            {
                fprintf( stderr, "%s: cannot merge/save list\n", argv[0] );
                exit( 2 );
            }
            break;
        case mode_reg_s:
            stats.spamicity = 1.0;
            is_spam = true;
            svec_sort( &mlist );
            if( !pblist->mergeclose( pblist, &mlist ) )
            {
                fprintf( stderr, "%s: cannot merge/save list\n", argv[0] );
                exit( 2 );
            }
            break;
        case mode_reg_n:
            stats.spamicity = 0.0;
            is_spam = false;
            svec_sort( &mlist );
            if( !pglist->mergeclose( pglist, &mlist ) )
            {
                fprintf( stderr, "%s: cannot merge/save list\n", argv[0] );
                exit( 2 );
            }
            break;
        case mode_n_to_s:
            /* add to the spam list, remove from the good list */
            stats.spamicity = 1.0;
            is_spam = true;
            svec_sort( &mlist );
            if( !pblist->mergeclose( pblist, &mlist ) ||
                !pglist->unmergeclose( pglist, &mlist ) )
            {
                fprintf( stderr, "%s: cannot merge/save list\n", argv[0] );
                exit( 2 );
            }
            break;
        case mode_s_to_n:
            /* remove from the spam list, add to the good list */
            stats.spamicity = 0.0;
            is_spam = false;
            svec_sort( &mlist );
            if( !pblist->unmergeclose( pblist, &mlist ) ||
                !pglist->mergeclose( pglist, &mlist ) )
            {
                fprintf( stderr, "%s: cannot merge/save list\n", argv[0] );
                exit( 2 );
            }
            break;
        default:
            usage();
        }

        if( mode == mode_test )
        {
            statdump( &stats, STDOUT_FILENO );
        }

        if( do_passthru )
        {
            lex_passthru( &lex, is_spam, stats.spamicity );
        }

        vec_destroy( &mlist );

        pglist->close( pglist );
        free( pglist );
        pblist->close( pblist );
        free( pblist );
    }

    lex_destroy( &lex );

    pdb->close( pdb );
    free( pdb );

    /* NOTE(review): dbname is not released here; whether dbh_open() keeps
       or copies the pointer is not visible from this file -- confirm
       before adding a free() */
    if( infile != NULL )
    {
        free( infile );
        close( fd );
    }
    free( stats.extrema );

    return ( (do_passthru || is_spam) ? 0 : 1 );
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */