# POPFILE LOADABLE MODULE
package Classifier::WordMangle;

use POPFile::Module;
@ISA = ("POPFile::Module");

# ----------------------------------------------------------------------------
#
# WordMangle.pm --- Mangle words for better classification
#
# Copyright (c) 2001-2006 John Graham-Cumming
#
#   This file is part of POPFile
#
#   POPFile is free software; you can redistribute it and/or modify it
#   under the terms of version 2 of the GNU General Public License as
#   published by the Free Software Foundation.
#
#   POPFile is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with POPFile; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# ----------------------------------------------------------------------------

use strict;
use warnings;
use locale;

# These are used for Japanese support

my $ascii = '[\x00-\x7F]'; # ASCII chars
my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars

#----------------------------------------------------------------------------
# new
#
#   Class new() function
#----------------------------------------------------------------------------

sub new
{
    my $type = shift;
    my $self = POPFile::Module->new();

    $self->{stop__} = {};

    bless $self, $type;

    $self->name( 'wordmangle' );

    return $self;
}

sub start
{
    my ( $self ) = @_;

    $self->load_stopwords();

    return 1;
}

# ----------------------------------------------------------------------------
#
# load_stopwords, save_stopwords - load and save the stop word list in the stopwords file
#
# ----------------------------------------------------------------------------
sub load_stopwords
{
    my ($self) = @_;

    if ( open STOPS, '<' . $self->get_user_path_( 'stopwords' ) ) {
        delete $self->{stop__};
        while ( <STOPS> ) {
            s/[\r\n]//g;
            $self->{stop__}{$_} = 1;
        }

        close STOPS;
    } else { 
        $self->log_( 0, "Failed to open stopwords file" );
    }
}

sub save_stopwords
{
    my ($self) = @_;

    if ( open STOPS, '>' . $self->get_user_path_( 'stopwords' ) ) {
        for my $word (keys %{$self->{stop__}}) {
            print STOPS "$word\n";
        }

        close STOPS;
    }
}

# ----------------------------------------------------------------------------
#
# mangle
#
# Mangles a word into either the empty string to indicate that the word should be ignored
# or the canonical form
#
# $word         The word to either mangle into a nice form, or return empty string if this word
#               is to be ignored
# $allow_colon  Set to any value allows : inside a word, this is used when mangle is used
#               while loading the corpus in Bayes.pm but is not used anywhere else, the colon
#               is used as a separator to indicate special words found in certain lines
#               of the mail header
#
# $ignore_stops If defined ignores the stop word list
#
# ----------------------------------------------------------------------------
sub mangle
{
    my ($self, $word, $allow_colon, $ignore_stops) = @_;

    # All words are treated as lowercase

    my $lcword = lc($word);

    return '' unless $lcword;

    # Stop words are ignored

    return '' if ( ( ( $self->{stop__}{$lcword} ) ||   # PROFILE BLOCK START
                     ( $self->{stop__}{$word} ) ) &&
                   ( !defined( $ignore_stops ) ) );    # PROFILE BLOCK STOP

    # Remove characters that would mess up a Perl regexp and replace with .

    $lcword =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.|\\)/\./g;

    # Long words are ignored also

    return '' if ( length($lcword) > 45 );

    # Ditch long hex numbers

    return '' if ( $lcword =~ /^[A-F0-9]{8,}$/i );

    # Colons are forbidden inside words, we should never get passed a word
    # with a colon in it here, but if we do then we strip the colon.  The colon
    # is used as a separator between a special identifier and a word, see MailParse.pm
    # for more details

    $lcword =~ s/://g if ( !defined( $allow_colon ) );

    return ($lcword =~ /:/ )?$word:$lcword;
}

# ----------------------------------------------------------------------------
#
# add_stopword, remove_stopword
#
# Adds or removes a stop word
#
# $stopword    The word to add or remove
# $lang        The current language
#
# Returns 1 if successful, or 0 for a bad stop word
# ----------------------------------------------------------------------------

sub add_stopword
{
    my ( $self, $stopword, $lang ) = @_;

    # In Japanese mode, reject non EUC Japanese characters.

    if ( $lang eq 'Nihongo') {
        if ( $stopword !~ /$euc_jp/i ) {
            return 0;
        }
    } else {
        if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) {
            return 0;
        }
    }

    $stopword = $self->mangle( $stopword, 1, 1 );

    if ( $stopword ne '' ) {
        $self->{stop__}{$stopword} = 1;
        $self->save_stopwords();

       return 1;
    }

    return 0;
}

sub remove_stopword
{
    my ( $self, $stopword, $lang ) = @_;

    # In Japanese mode, reject non EUC Japanese characters.

    if ( $lang eq 'Nihongo') {
        if ( $stopword !~ /$euc_jp/i ) {
            return 0;
        }
    } else {
        if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) {
            return 0;
        }
    }

    $stopword = $self->mangle( $stopword, 1, 1 );

    if ( $stopword ne '' ) {
        delete $self->{stop__}{$stopword};
        $self->save_stopwords();

        return 1;
    }

    return 0;
}

# GETTER/SETTERS

sub stopwords
{
    my ( $self, $value ) = @_;

    if ( defined( $value ) ) {
        %{$self->{stop__}} = %{$value};
    }

    return keys %{$self->{stop__}};
}

1;


syntax highlighted by Code2HTML, v. 0.9.1