.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sh \" Subsection heading
.br
.if t .Sp
.ne 5
.PP
\fB\\$1\fR
.PP
..
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  | will give a
.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
.\" expand to `' in nroff, nothing in troff, for use with C<>.
.tr \(*W-|\(bv\*(Tr
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.if \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.\"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.hy 0
.if n .na
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "AI::Categorizer::Document 3"
.TH AI::Categorizer::Document 3 "2008-01-14" "perl v5.8.8" "User Contributed Perl Documentation"
.SH "NAME"
AI::Categorizer::Document \- Embodies a document
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 1
\& use AI::Categorizer::Document;
.Ve
.PP
.Vb 3
\& # Simplest way to create a document:
\& my $d = AI::Categorizer::Document->new(name => $string,
\&                                        content => $string);
.Ve
.PP
.Vb 12
\& # Other parameters are accepted:
\& my $d = AI::Categorizer::Document->new(name => $string,
\&                                        categories => \e@category_objects,
\&                                        content => { subject => $string,
\&                                                     body => $string2, ... },
\&                                        content_weights => { subject => 3,
\&                                                             body => 1, ... },
\&                                        stopwords => \e%skip_these_words,
\&                                        stemming => $string,
\&                                        front_bias => $float,
\&                                        use_features => $feature_vector,
\&                                       );
.Ve
.PP
.Vb 3
\& # Specify explicit feature vector:
\& my $d = AI::Categorizer::Document->new(name => $string);
\& $d->features( $feature_vector );
.Ve
.PP
.Vb 3
\& # Now pass the document to a categorization algorithm:
\& my $learner = AI::Categorizer::Learner::NaiveBayes->restore_state($path);
\& my $hypothesis = $learner->categorize($d);
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The Document class embodies the data in a single document, and
contains methods for turning this data into a FeatureVector.  Usually
documents are plain text, but subclasses of the Document class may
handle any kind of data.
.SH "METHODS"
.IX Header "METHODS"
.IP "new(%parameters)" 4
.IX Item "new(%parameters)"
Creates a new Document object.  Document objects are used during
training (for the training documents), testing (for the test
documents), and when categorizing new unseen documents in an
application (for the unseen documents).  However, you'll typically
only call \f(CW\*(C`new()\*(C'\fR in the latter case, since the KnowledgeSet or
Collection classes will create Document objects for you in the former
cases.
.Sp
The \f(CW\*(C`new()\*(C'\fR method accepts the following parameters:
.RS 4
.IP "name" 4
.IX Item "name"
A string that identifies this document.  Required.
.IP "content" 4
.IX Item "content"
The raw content of this document.  May be specified as either a string
or as a hash reference, allowing structured document types.
.IP "content_weights" 4
.IX Item "content_weights"
A hash reference indicating the weights that should be assigned to
features in different sections of a structured document when creating
its feature vector.  The weight is a multiplier of the feature vector
values.  For instance, if a \f(CW\*(C`subject\*(C'\fR section has a weight of 3 and a
\&\f(CW\*(C`body\*(C'\fR section has a weight of 1, and word counts are used as feature
vector values, then it will be as if all words appearing in the
\&\f(CW\*(C`subject\*(C'\fR appeared 3 times.
.Sp
If no weights are specified, all weights are set to 1.
.IP "front_bias" 4
.IX Item "front_bias"
Allows smooth bias of the weights of words in a document according to
their position.  The value should be a number between \-1 and 1.
Positive numbers indicate that words toward the beginning of the
document should have higher weight than words toward the end of the
document.  Negative numbers indicate the opposite.  A bias of 0
indicates that no biasing should be done.
.IP "categories" 4
.IX Item "categories"
A reference to an array of Category objects that this document belongs
to.  Optional.
.IP "stopwords" 4
.IX Item "stopwords"
A list/hash of features (words) that should be ignored when parsing
document content.  A hash reference is preferred, with the features as
the keys.  If you pass an array reference containing the features, it
will be converted to a hash reference internally.
.IP "use_features" 4
.IX Item "use_features"
A Feature Vector specifying the only features that should be
considered when parsing this document.  This is an alternative to
using \f(CW\*(C`stopwords\*(C'\fR.
.IP "stemming" 4
.IX Item "stemming"
Indicates the linguistic procedure that should be used to convert
tokens in the document to features.  Possible values are \f(CW\*(C`none\*(C'\fR,
which indicates that the tokens should be used without change, or
\&\f(CW\*(C`porter\*(C'\fR, indicating that the Porter stemming algorithm should be
applied to each token.  This requires the \f(CW\*(C`Lingua::Stem\*(C'\fR module from
\&\s-1CPAN\s0.
.IP "stopword_behavior" 4
.IX Item "stopword_behavior"
There are a few ways you might want the stopword list (specified with
the \f(CW\*(C`stopwords\*(C'\fR parameter) to interact with the stemming algorithm
(specified with the \f(CW\*(C`stemming\*(C'\fR parameter).  These options can be
controlled with the \f(CW\*(C`stopword_behavior\*(C'\fR parameter, which can take the
following values:
.RS 4
.IP "no_stem" 4
.IX Item "no_stem"
Match stopwords against non-stemmed document words.  
.IP "stem" 4
.IX Item "stem"
Stem stopwords according to the \f(CW\*(C`stemming\*(C'\fR parameter, then match them
against stemmed document words.
.IP "pre_stemmed" 4
.IX Item "pre_stemmed"
Stopwords are already stemmed; match them against stemmed document
words.
.RE
.RS 4
.Sp
The default value is \f(CW\*(C`stem\*(C'\fR, which seems to produce the best results
in most cases I've tried.  I'm not aware of any studies comparing the
\&\f(CW\*(C`no_stem\*(C'\fR behavior to the \f(CW\*(C`stem\*(C'\fR behavior in the general case.
.Sp
This parameter has no effect if there are no stopwords being used, or
if stemming is not being used.  In the latter case, the list of
stopwords will always be matched as-is against the document words.
.Sp
Note that if the \f(CW\*(C`stem\*(C'\fR option is used, the data structure passed as
the \f(CW\*(C`stopwords\*(C'\fR parameter will be modified in-place to contain the
stemmed versions of the stopwords supplied.
.RE
.RE
.RS 4
.RE
.ie n .IP "read( path => $path )" 4
.el .IP "read( path => \f(CW$path\fR )" 4
.IX Item "read( path => $path )"
An alternative constructor method which reads a file on disk and
returns a document with that file's contents.
.ie n .IP "parse( content => $content )" 4
.el .IP "parse( content => \f(CW$content\fR )" 4
.IX Item "parse( content => $content )"
.PD 0
.IP "\fIname()\fR" 4
.IX Item "name()"
.PD
Returns this document's \f(CW\*(C`name\*(C'\fR property as specified when the
document was created.
.IP "\fIfeatures()\fR" 4
.IX Item "features()"
Returns the Feature Vector associated with this document.
.IP "\fIcategories()\fR" 4
.IX Item "categories()"
In a list context, returns a list of Category objects to which this
document belongs.  In a scalar context, returns the number of such
categories.
.IP "\fIcreate_feature_vector()\fR" 4
.IX Item "create_feature_vector()"
Creates this document's Feature Vector by parsing its content.  You
won't call this method directly; it's called by \f(CW\*(C`new()\*(C'\fR.
.SH "AUTHOR"
.IX Header "AUTHOR"
Ken Williams <ken@mathforum.org>
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
This distribution is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.  These terms apply to
every file in the distribution \- if you have questions, please contact
the author.