# POPFILE LOADABLE MODULE package Classifier::WordMangle; use POPFile::Module; @ISA = ("POPFile::Module"); # ---------------------------------------------------------------------------- # # WordMangle.pm --- Mangle words for better classification # # Copyright (c) 2001-2006 John Graham-Cumming # # This file is part of POPFile # # POPFile is free software; you can redistribute it and/or modify it # under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # POPFile is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with POPFile; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # ---------------------------------------------------------------------------- use strict; use warnings; use locale; # These are used for Japanese support my $ascii = '[\x00-\x7F]'; # ASCII chars my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars #---------------------------------------------------------------------------- # new # # Class new() function #---------------------------------------------------------------------------- sub new { my $type = shift; my $self = POPFile::Module->new(); $self->{stop__} = {}; bless $self, $type; $self->name( 'wordmangle' ); return $self; } sub start { my ( $self ) = @_; $self->load_stopwords(); return 1; } # ---------------------------------------------------------------------------- # # load_stopwords, save_stopwords - load and save the stop word list in the stopwords file # # ---------------------------------------------------------------------------- sub load_stopwords { my ($self) = @_; if ( open STOPS, '<' . $self->get_user_path_( 'stopwords' ) ) { delete $self->{stop__}; while ( ) { s/[\r\n]//g; $self->{stop__}{$_} = 1; } close STOPS; } else { $self->log_( 0, "Failed to open stopwords file" ); } } sub save_stopwords { my ($self) = @_; if ( open STOPS, '>' . $self->get_user_path_( 'stopwords' ) ) { for my $word (keys %{$self->{stop__}}) { print STOPS "$word\n"; } close STOPS; } } # ---------------------------------------------------------------------------- # # mangle # # Mangles a word into either the empty string to indicate that the word should be ignored # or the canonical form # # $word The word to either mangle into a nice form, or return empty string if this word # is to be ignored # $allow_colon Set to any value allows : inside a word, this is used when mangle is used # while loading the corpus in Bayes.pm but is not used anywhere else, the colon # is used as a separator to indicate special words found in certain lines # of the mail header # # $ignore_stops If defined ignores the stop word list # # ---------------------------------------------------------------------------- sub mangle { my ($self, $word, $allow_colon, $ignore_stops) = @_; # All words are treated as lowercase my $lcword = lc($word); return '' unless $lcword; # Stop words are ignored return '' if ( ( ( $self->{stop__}{$lcword} ) || # PROFILE BLOCK START ( $self->{stop__}{$word} ) ) && ( !defined( $ignore_stops ) ) ); # PROFILE BLOCK STOP # Remove characters that would mess up a Perl regexp and replace with . $lcword =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.|\\)/\./g; # Long words are ignored also return '' if ( length($lcword) > 45 ); # Ditch long hex numbers return '' if ( $lcword =~ /^[A-F0-9]{8,}$/i ); # Colons are forbidden inside words, we should never get passed a word # with a colon in it here, but if we do then we strip the colon. The colon # is used as a separator between a special identifier and a word, see MailParse.pm # for more details $lcword =~ s/://g if ( !defined( $allow_colon ) ); return ($lcword =~ /:/ )?$word:$lcword; } # ---------------------------------------------------------------------------- # # add_stopword, remove_stopword # # Adds or removes a stop word # # $stopword The word to add or remove # $lang The current language # # Returns 1 if successful, or 0 for a bad stop word # ---------------------------------------------------------------------------- sub add_stopword { my ( $self, $stopword, $lang ) = @_; # In Japanese mode, reject non EUC Japanese characters. if ( $lang eq 'Nihongo') { if ( $stopword !~ /$euc_jp/i ) { return 0; } } else { if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) { return 0; } } $stopword = $self->mangle( $stopword, 1, 1 ); if ( $stopword ne '' ) { $self->{stop__}{$stopword} = 1; $self->save_stopwords(); return 1; } return 0; } sub remove_stopword { my ( $self, $stopword, $lang ) = @_; # In Japanese mode, reject non EUC Japanese characters. if ( $lang eq 'Nihongo') { if ( $stopword !~ /$euc_jp/i ) { return 0; } } else { if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) { return 0; } } $stopword = $self->mangle( $stopword, 1, 1 ); if ( $stopword ne '' ) { delete $self->{stop__}{$stopword}; $self->save_stopwords(); return 1; } return 0; } # GETTER/SETTERS sub stopwords { my ( $self, $value ) = @_; if ( defined( $value ) ) { %{$self->{stop__}} = %{$value}; } return keys %{$self->{stop__}}; } 1;