.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 .\" .\" Standard preamble: .\" ======================================================================== .de Sh \" Subsection heading .br .if t .Sp .ne 5 .PP \fB\\$1\fR .PP .. .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. | will give a .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' .\" expand to `' in nroff, nothing in troff, for use with C<>. .tr \(*W-|\(bv\*(Tr .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .if \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .\" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .hy 0 .if n .na .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . 
ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . 
ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "AI::Categorizer::Collection 3" .TH AI::Categorizer::Collection 3 "2008-01-14" "perl v5.8.8" "User Contributed Perl Documentation" .SH "NAME" AI::Categorizer::Collection \- Access stored documents .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 8 \& my $c = new AI::Categorizer::Collection::Files \& (path => '/tmp/docs/training', \& category_file => '/tmp/docs/cats.txt'); \& print "Total number of docs: ", $c->count_documents, "\en"; \& while (my $document = $c->next) { \& ... \& } \& $c->rewind; # For further operations .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" This abstract class implements an iterator for accessing documents in their natively stored format. You cannot directly create an instance of the Collection class, because it is abstract \- see the documentation for the \f(CW\*(C`Files\*(C'\fR, \f(CW\*(C`SingleFile\*(C'\fR, or \f(CW\*(C`InMemory\*(C'\fR subclasses for a concrete interface. .SH "METHODS" .IX Header "METHODS" .IP "\fInew()\fR" 4 .IX Item "new()" Creates a new Collection object and returns it. Accepts the following parameters: .RS 4 .IP "category_hash" 4 .IX Item "category_hash" Indicates a reference to a hash which maps document names to category names. The keys of the hash are the document names; each value should be a reference to an array containing the names of the categories to which that document belongs. .IP "category_file" 4 .IX Item "category_file" Indicates a file which should be read in order to create the \&\f(CW\*(C`category_hash\*(C'\fR. Each line of the file should list a document's name, followed by a list of category names, all separated by whitespace. .IP "stopword_file" 4 .IX Item "stopword_file" Specifies a file containing a list of \*(L"stopwords\*(R", which are words that should automatically be disregarded when scanning/reading documents. The file should contain one word per line.
The file will be parsed and then fed as the \f(CW\*(C`stopwords\*(C'\fR parameter to the Document \f(CW\*(C`new()\*(C'\fR method. .IP "verbose" 4 .IX Item "verbose" If true, some status/debugging information will be printed to \&\f(CW\*(C`STDOUT\*(C'\fR during operation. .IP "document_class" 4 .IX Item "document_class" The class indicating what type of Document object should be created. This generally specifies the format that the documents are stored in. The default is \f(CW\*(C`AI::Categorizer::Document::Text\*(C'\fR. .RE .RS 4 .RE .IP "\fInext()\fR" 4 .IX Item "next()" Returns the next Document object in the Collection. .IP "\fIrewind()\fR" 4 .IX Item "rewind()" Resets the iterator for further calls to \f(CW\*(C`next()\*(C'\fR. .IP "\fIcount_documents()\fR" 4 .IX Item "count_documents()" Returns the total number of documents in the Collection. Note that this usually resets the iterator. This is because it may not be possible to resume iterating where we left off. .SH "AUTHOR" .IX Header "AUTHOR" Ken Williams, ken@mathforum.org .SH "COPYRIGHT" .IX Header "COPYRIGHT" Copyright 2002\-2003 Ken Williams. All rights reserved. .PP This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. .SH "SEE ALSO" .IX Header "SEE ALSO" \&\fIAI::Categorizer\fR\|(3), \fIStorable\fR\|(3)