# POPFILE LOADABLE MODULE
package Classifier::WordMangle;
use POPFile::Module;
@ISA = ("POPFile::Module");
# ----------------------------------------------------------------------------
#
# WordMangle.pm --- Mangle words for better classification
#
# Copyright (c) 2001-2006 John Graham-Cumming
#
# This file is part of POPFile
#
# POPFile is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# POPFile is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with POPFile; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# ----------------------------------------------------------------------------
use strict;
use warnings;
use locale;
# These are used for Japanese support
my $ascii = '[\x00-\x7F]'; # ASCII chars
my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars
#----------------------------------------------------------------------------
# new
#
# Class new() function
#----------------------------------------------------------------------------
sub new
{
my $type = shift;
my $self = POPFile::Module->new();
$self->{stop__} = {};
bless $self, $type;
$self->name( 'wordmangle' );
return $self;
}
sub start
{
my ( $self ) = @_;
$self->load_stopwords();
return 1;
}
# ----------------------------------------------------------------------------
#
# load_stopwords, save_stopwords - load and save the stop word list in the stopwords file
#
# ----------------------------------------------------------------------------
sub load_stopwords
{
my ($self) = @_;
if ( open STOPS, '<' . $self->get_user_path_( 'stopwords' ) ) {
delete $self->{stop__};
while ( <STOPS> ) {
s/[\r\n]//g;
$self->{stop__}{$_} = 1;
}
close STOPS;
} else {
$self->log_( 0, "Failed to open stopwords file" );
}
}
sub save_stopwords
{
my ($self) = @_;
if ( open STOPS, '>' . $self->get_user_path_( 'stopwords' ) ) {
for my $word (keys %{$self->{stop__}}) {
print STOPS "$word\n";
}
close STOPS;
}
}
# ----------------------------------------------------------------------------
#
# mangle
#
# Mangles a word into either the empty string to indicate that the word should be ignored
# or the canonical form
#
# $word The word to either mangle into a nice form, or return empty string if this word
# is to be ignored
# $allow_colon Set to any value allows : inside a word, this is used when mangle is used
# while loading the corpus in Bayes.pm but is not used anywhere else, the colon
# is used as a separator to indicate special words found in certain lines
# of the mail header
#
# $ignore_stops If defined ignores the stop word list
#
# ----------------------------------------------------------------------------
sub mangle
{
my ($self, $word, $allow_colon, $ignore_stops) = @_;
# All words are treated as lowercase
my $lcword = lc($word);
return '' unless $lcword;
# Stop words are ignored
return '' if ( ( ( $self->{stop__}{$lcword} ) || # PROFILE BLOCK START
( $self->{stop__}{$word} ) ) &&
( !defined( $ignore_stops ) ) ); # PROFILE BLOCK STOP
# Remove characters that would mess up a Perl regexp and replace with .
$lcword =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.|\\)/\./g;
# Long words are ignored also
return '' if ( length($lcword) > 45 );
# Ditch long hex numbers
return '' if ( $lcword =~ /^[A-F0-9]{8,}$/i );
# Colons are forbidden inside words, we should never get passed a word
# with a colon in it here, but if we do then we strip the colon. The colon
# is used as a separator between a special identifier and a word, see MailParse.pm
# for more details
$lcword =~ s/://g if ( !defined( $allow_colon ) );
return ($lcword =~ /:/ )?$word:$lcword;
}
# ----------------------------------------------------------------------------
#
# add_stopword, remove_stopword
#
# Adds or removes a stop word
#
# $stopword The word to add or remove
# $lang The current language
#
# Returns 1 if successful, or 0 for a bad stop word
# ----------------------------------------------------------------------------
sub add_stopword
{
my ( $self, $stopword, $lang ) = @_;
# In Japanese mode, reject non EUC Japanese characters.
if ( $lang eq 'Nihongo') {
if ( $stopword !~ /$euc_jp/i ) {
return 0;
}
} else {
if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) {
return 0;
}
}
$stopword = $self->mangle( $stopword, 1, 1 );
if ( $stopword ne '' ) {
$self->{stop__}{$stopword} = 1;
$self->save_stopwords();
return 1;
}
return 0;
}
sub remove_stopword
{
my ( $self, $stopword, $lang ) = @_;
# In Japanese mode, reject non EUC Japanese characters.
if ( $lang eq 'Nihongo') {
if ( $stopword !~ /$euc_jp/i ) {
return 0;
}
} else {
if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:alpha:]\-_\.\@0-9]/i ) ) {
return 0;
}
}
$stopword = $self->mangle( $stopword, 1, 1 );
if ( $stopword ne '' ) {
delete $self->{stop__}{$stopword};
$self->save_stopwords();
return 1;
}
return 0;
}
# GETTER/SETTERS
sub stopwords
{
my ( $self, $value ) = @_;
if ( defined( $value ) ) {
%{$self->{stop__}} = %{$value};
}
return keys %{$self->{stop__}};
}
1;
syntax highlighted by Code2HTML, v. 0.9.1