.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 .\" .\" Standard preamble: .\" ======================================================================== .de Sh \" Subsection heading .br .if t .Sp .ne 5 .PP \fB\\$1\fR .PP .. .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. | will give a .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' .\" expand to `' in nroff, nothing in troff, for use with C<>. .tr \(*W-|\(bv\*(Tr .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .if \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .\" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .hy 0 .if n .na .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . 
ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "KinoSearch::Docs::Tutorial 3" .TH KinoSearch::Docs::Tutorial 3 "2008-01-14" "perl v5.8.8" "User Contributed Perl Documentation" .SH "NAME" KinoSearch::Docs::Tutorial \- sample indexing and search applications .SH "DESCRIPTION" .IX Header "DESCRIPTION" The following sample code for invindexer.plx and search.cgi can be used to create a simple search engine. 
It requires the HTML presentation of the \s-1US\s0 Constitution included in the distribution for KinoSearch, under \&\f(CW\*(C`t/us_constitution\*(C'\fR. .PP Note that a proper indexer for HTML documents would not rely on quick-n-dirty regular expressions for stripping tags, as this one does for the sake of brevity \*(-- it would use a dedicated parsing module such as HTML::Parser. .Sh "invindexer.plx" .IX Subsection "invindexer.plx" .Vb 3 \& #!/usr/bin/perl \& use strict; \& use warnings; .Ve .PP .Vb 3 \& use File::Spec; \& use KinoSearch::InvIndexer; \& use KinoSearch::Analysis::PolyAnalyzer; .Ve .PP .Vb 13 \& ### In order for invindexer.plx to work correctly, you must modify \& ### $source_dir, $path_to_invindex, and possibly $base_url. \& ### \& ### $source_dir must lead to the directory containing the US \& ### Constitution HTML files. \& ### \& ### $path_to_invindex is the future location of the invindex. \& ### \& ### $base_url should reflect the location of the us_constitution directory \& ### when accessed via a web browser. \& my $source_dir = ''; \& my $path_to_invindex = ''; \& my $base_url = '/us_constitution'; .Ve .PP .Vb 4 \& opendir( my $source_dh, $source_dir ) \& or die "Couldn't opendir '$source_dir': $!"; \& my @filenames = grep {/\e.html/} readdir $source_dh; \& closedir $source_dh or die "Couldn't closedir '$source_dir': $!"; .Ve .PP .Vb 4 \& ### STEP 1: Choose an Analyzer. \& my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( \& language => 'en', \& ); .Ve .PP .Vb 6 \& ### STEP 2: Create an InvIndexer object. \& my $invindexer = KinoSearch::InvIndexer->new( \& analyzer => $analyzer, \& invindex => $path_to_invindex, \& create => 1, \& ); .Ve .PP .Vb 10 \& ### STEP 3: Define fields. 
\& $invindexer->spec_field( name => 'title' ); \& $invindexer->spec_field( \& name => 'bodytext', \& vectorized => 1, \& ); \& $invindexer->spec_field( \& name => 'url', \& indexed => 0, \& ); .Ve .PP .Vb 6 \& foreach my $filename (@filenames) { \& next if $filename eq 'index.html'; \& my $filepath = File::Spec->catfile( $source_dir, $filename ); \& open( my $fh, '<', $filepath ) \& or die "couldn't open file '$filepath': $!"; \& my $content = do { local $/; <$fh> }; .Ve .PP .Vb 2 \& ### STEP 4: Start a new document. \& my $doc = $invindexer->new_doc; .Ve .PP .Vb 7 \& $content =~ m#
\& $hit->{title}
\& $score
\&
\& $hit->{excerpt}
\&
\& $hit->{url}
\&
No matches for $q
|; \& } \& else { \& # calculate the nums for the first and last hit to display \& my $last_result = min( ( $offset + $hits_per_page ), $total_hits ); \& my $first_result = min( ( $offset + 1 ), $last_result ); .Ve .PP .Vb 9 \& # display the result nums, start paging info \& $num_hits_info = qq| \&\& Results $first_result-$last_result \& of $total_hits for $q. \&
\&\& Results Page: \& |; .Ve .PP .Vb 5 \& # calculate first and last hits pages to display / link to \& my $current_page = int( $first_result / $hits_per_page ) + 1; \& my $last_page = ceil( $total_hits / $hits_per_page ); \& my $first_page = max( 1, ( $current_page - 9 ) ); \& $last_page = min( $last_page, ( $current_page + 10 ) ); .Ve .PP .Vb 3 \& # create a url for use in paging links \& my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string; \& $href .= ";offset=0" unless $href =~ /offset=/; .Ve .PP .Vb 6 \& # generate the "Prev" link; \& if ( $current_page > 1 ) { \& my $new_offset = ( $current_page - 2 ) * $hits_per_page; \& $href =~ s/(?<=offset=)\ed+/$new_offset/; \& $num_hits_info .= qq|<= Prev\en|; \& } .Ve .PP .Vb 11 \& # generate paging links \& for my $page_num ( $first_page .. $last_page ) { \& if ( $page_num == $current_page ) { \& $num_hits_info .= qq|$page_num \en|; \& } \& else { \& my $new_offset = ( $page_num - 1 ) * $hits_per_page; \& $href =~ s/(?<=offset=)\ed+/$new_offset/; \& $num_hits_info .= qq|$page_num\en|; \& } \& } .Ve .PP .Vb 6 \& # generate the "Next" link \& if ( $current_page != $last_page ) { \& my $new_offset = $current_page * $hits_per_page; \& $href =~ s/(?<=offset=)\ed+/$new_offset/; \& $num_hits_info .= qq|Next =>\en|; \& } .Ve .PP .Vb 3 \& # finish paging links \& $num_hits_info .= "
\en"; \& } .Ve .PP .Vb 12 \& # blast it all out \& print "Content-type: text/html\en\en"; \& print <\& Powered by \& \& KinoSearch \& \& \&
\&