.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
.\"
.\" Standard preamble:
.\" ========================================================================
.\" Sh: subsection heading; $1 is the heading text.
.\" Forces a line break, adds Sp spacing under troff only, asks for 5 lines
.\" of room (.ne 5), then prints $1 in bold between two .PP paragraph breaks.
.de Sh \" Subsection heading
.br
.if t .Sp
.ne 5
.PP
\fB\\$1\fR
.PP
..
.\" Sp: vertical spacing helper.  Emits half a line (.5v) under troff and a
.\" plain .sp under nroff.
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.\" Vb: open a verbatim block; $1 is the number of lines the caller passes.
.\" Switches to the CW font, turns filling off (.nf), and requests room for
.\" $1 lines via .ne.
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.\" Ve: close a verbatim block opened with Vb.  Selects the R font
.\" explicitly (it does not restore whatever font was active before Vb) and
.\" turns filling back on (.fi).
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  | will give a
.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
.\" expand to `' in nroff, nothing in troff, for use with C<>.
.tr \(*W-|\(bv\*(Tr
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\" IX is defined only when register F is non-zero.  Each call uses .tm to
.\" print "Index:" followed by $1, a tab, the current page number register
.\" (%), a tab, and $2 in double quotes.  After defining the macro, the page
.\" number register is set to 0 and register F is removed.
.if \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.\"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.hy 0
.if n .na
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "KinoSearch::Docs::Tutorial 3"
.TH KinoSearch::Docs::Tutorial 3 "2008-01-14" "perl v5.8.8" "User Contributed Perl Documentation"
.SH "NAME"
KinoSearch::Docs::Tutorial \- sample indexing and search applications
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The following sample code for invindexer.plx and search.cgi can be used to
create a simple search engine. It requires the html presentation of the \s-1US\s0
Constitution included in the distribution for KinoSearch, under
\&\f(CW\*(C`t/us_constitution\*(C'\fR.
.PP
Note that a proper indexer for html documents would not rely on quick-n-dirty
regular expressions for stripping tags, as this one does for the sake of
brevity \*(-- it would use a dedicated parsing module such as
HTML::Parser.
.Sh "invindexer.plx"
.IX Subsection "invindexer.plx"
.Vb 3
\&    #!/usr/bin/perl
\&    use strict;
\&    use warnings;
.Ve
.PP
.Vb 3
\&    use File::Spec;
\&    use KinoSearch::InvIndexer;
\&    use KinoSearch::Analysis::PolyAnalyzer;
.Ve
.PP
.Vb 13
\&    ### In order for invindexer.plx to work correctly, you must modify 
\&    ### $source_dir, $path_to_invindex, and possibly $base_url.
\&    ###
\&    ### $source_dir must lead to the directory containing the US
\&    ### Constitution html files.
\&    ###
\&    ### $path_to_invindex is the future location of the invindex.
\&    ###
\&    ### $base_url should reflect the location of the us_constitution directory
\&    ### when accessed via a web browser.
\&    my $source_dir       = '';
\&    my $path_to_invindex = '';
\&    my $base_url         = '/us_constitution';
.Ve
.PP
.Vb 6
\&    # Collect only files whose names end in ".html"; anchoring the pattern
\&    # keeps out strays such as "foo.html.bak".
\&    opendir( my $source_dh, $source_dir )
\&        or die "Couldn't opendir '$source_dir': $!";
\&    my @filenames = grep {/\e.html$/} readdir $source_dh;
\&    closedir $source_dh or die "Couldn't closedir '$source_dir': $!";
.Ve
.PP
.Vb 4
\&    ### STEP 1: Choose an Analyzer.
\&    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( 
\&        language => 'en',
\&    );
.Ve
.PP
.Vb 6
\&    ### STEP 2: Create an InvIndexer object.
\&    my $invindexer = KinoSearch::InvIndexer->new(
\&        analyzer => $analyzer,
\&        invindex => $path_to_invindex,
\&        create   => 1,
\&    );
.Ve
.PP
.Vb 10
\&    ### STEP 3: Define fields.
\&    $invindexer->spec_field( name => 'title' );
\&    $invindexer->spec_field( 
\&        name       => 'bodytext',
\&        vectorized => 1,
\&    );
\&    $invindexer->spec_field(
\&        name    => 'url',
\&        indexed => 0,
\&    );
.Ve
.PP
.Vb 6
\&    foreach my $filename (@filenames) {
\&        next if $filename eq 'index.html';
\&        my $filepath = File::Spec->catfile( $source_dir, $filename );
\&        open( my $fh, '<', $filepath )
\&            or die "couldn't open file '$filepath': $!";
\&        my $content = do { local $/; <$fh> };
.Ve
.PP
.Vb 2
\&        ### STEP 4: Start a new document.
\&        my $doc = $invindexer->new_doc;
.Ve
.PP
.Vb 7
\&        $content =~ m#<title>(.*?)</title>#s
\&            or die "couldn't isolate title in '$filepath'";
\&        my $title = $1;
\&        $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
\&            or die "couldn't isolate bodytext in '$filepath'";
\&        my $bodytext = $1;
\&        $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping
.Ve
.PP
.Vb 4
\&        ### STEP 5: Set the value for each field.
\&        $doc->set_value( url      => "$base_url/$filename" );
\&        $doc->set_value( title    => $title );
\&        $doc->set_value( bodytext => $bodytext );
.Ve
.PP
.Vb 2
\&        ### STEP 6: Add the document to the invindex.
\&        $invindexer->add_doc($doc);
.Ve
.PP
.Vb 2
\&        ### STEP 7: Repeat steps 4-6 for each document in the collection.
\&    }
.Ve
.PP
.Vb 2
\&    ### STEP 8: Finalize the invindex.
\&    $invindexer->finish;
.Ve
.Sh "search.cgi"
.IX Subsection "search.cgi"
.Vb 3
\&    #!/usr/bin/perl -T
\&    use strict;
\&    use warnings;
.Ve
.PP
.Vb 6
\&    use CGI;
\&    use List::Util qw( max min );
\&    use POSIX qw( ceil );
\&    use KinoSearch::Searcher;
\&    use KinoSearch::Analysis::PolyAnalyzer;
\&    use KinoSearch::Highlight::Highlighter;
.Ve
.PP
.Vb 8
\&    my $cgi           = CGI->new;
\&    my $q             = $cgi->param('q');
\&    my $offset        = $cgi->param('offset');
\&    my $hits_per_page = 10;
\&    $q = '' unless defined $q;
\&    # offset comes straight from the browser; fall back to 0 unless it is a
\&    # plain non-negative integer so the paging arithmetic stays numeric.
\&    $offset = 0 unless defined $offset && $offset =~ /\eA\ed+\ez/;
.Ve
.PP
.Vb 6
\&    ### In order for search.cgi to work, $path_to_invindex must be modified so
\&    ### that it points to the invindex created by invindexer.plx, and
\&    ### $base_url may have to change to reflect where a web-browser should
\&    ### look for the us_constitution directory.
\&    my $path_to_invindex = '';
\&    my $base_url         = '/us_constitution';
.Ve
.PP
.Vb 4
\&    ### STEP 1: Specify the same Analyzer used to create the invindex.
\&    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( 
\&        language => 'en', 
\&    );
.Ve
.PP
.Vb 5
\&    ### STEP 2: Create a Searcher object.
\&    my $searcher = KinoSearch::Searcher->new(
\&        invindex => $path_to_invindex,
\&        analyzer => $analyzer,
\&    );
.Ve
.PP
.Vb 2
\&    ### STEP 3: Feed a query to the Searcher object.
\&    my $hits = $searcher->search($q);
.Ve
.PP
.Vb 4
\&    ### STEP 4: Arrange for highlighted excerpts to be created.
\&    my $highlighter = KinoSearch::Highlight::Highlighter->new( 
\&        excerpt_field => 'bodytext' );
\&    $hits->create_excerpts( highlighter => $highlighter );
.Ve
.PP
.Vb 2
\&    ### STEP 5: Process the search.
\&    $hits->seek( $offset, $hits_per_page );
.Ve
.PP
.Vb 1
\&    ### STEP 6: Format the results however you like.
.Ve
.PP
.Vb 15
\&    # create result list
\&    my $report = '';
\&    while ( my $hit = $hits->fetch_hit_hashref ) {
\&        my $score = sprintf( "%0.3f", $hit->{score} );
\&        $report .= qq|
\&            <p>
\&                <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
\&                <em>$score</em>
\&                <br>
\&                $hit->{excerpt}
\&                <br>
\&                <span class="excerptURL">$hit->{url}</span>
\&            </p>
\&            |;
\&    }
.Ve
.PP
.Vb 3
\&    # The query is echoed back into the page below, so escape every HTML
\&    # metacharacter (&, <, >, ") rather than only the double quote.
\&    $q = $cgi->escapeHTML($q);
.Ve
.PP
.Vb 15
\&    # display info about the number of hits, paging links
\&    my $total_hits = $hits->total_hits;
\&    my $num_hits_info;
\&    if ( !length $q ) {
\&        # no query, no display
\&        $num_hits_info = '';
\&    }
\&    elsif ( $total_hits == 0 ) {
\&        # alert the user that their search failed
\&        $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
\&    }
\&    else {
\&        # calculate the nums for the first and last hit to display
\&        my $last_result = min( ( $offset + $hits_per_page ), $total_hits );
\&        my $first_result = min( ( $offset + 1 ), $last_result );
.Ve
.PP
.Vb 9
\&        # display the result nums, start paging info
\&        $num_hits_info = qq|
\&            <p>
\&                Results <strong>$first_result-$last_result</strong> 
\&                of <strong>$total_hits</strong> for <strong>$q</strong>.
\&            </p>
\&            <p>
\&                Results Page:
\&            |;
.Ve
.PP
.Vb 5
\&        # calculate first and last hits pages to display / link to
\&        my $current_page = int( $first_result / $hits_per_page ) + 1;
\&        my $last_page    = ceil( $total_hits / $hits_per_page );
\&        my $first_page   = max( 1, ( $current_page - 9 ) );
\&        $last_page = min( $last_page, ( $current_page + 10 ) );
.Ve
.PP
.Vb 3
\&        # create a url for use in paging links
\&        my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
\&        $href .= ";offset=0" unless $href =~ /offset=/;
.Ve
.PP
.Vb 6
\&        # generate the "Prev" link;
\&        if ( $current_page > 1 ) {
\&            my $new_offset = ( $current_page - 2 ) * $hits_per_page;
\&            $href =~ s/(?<=offset=)\ed+/$new_offset/;
\&            $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\en|;
\&        }
.Ve
.PP
.Vb 11
\&        # generate paging links
\&        for my $page_num ( $first_page .. $last_page ) {
\&            if ( $page_num == $current_page ) {
\&                $num_hits_info .= qq|$page_num \en|;
\&            }
\&            else {
\&                my $new_offset = ( $page_num - 1 ) * $hits_per_page;
\&                $href =~ s/(?<=offset=)\ed+/$new_offset/;
\&                $num_hits_info .= qq|<a href="$href">$page_num</a>\en|;
\&            }
\&        }
.Ve
.PP
.Vb 6
\&        # generate the "Next" link
\&        if ( $current_page != $last_page ) {
\&            my $new_offset = $current_page * $hits_per_page;
\&            $href =~ s/(?<=offset=)\ed+/$new_offset/;
\&            $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\en|;
\&        }
.Ve
.PP
.Vb 3
\&        # finish paging links
\&        $num_hits_info .= "</p>\en";
\&    }
.Ve
.PP
.Vb 12
\&    # blast it all out
\&    print "Content-type: text/html\en\en";
\&    print <<END_HTML;
\&    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
\&        "http://www.w3.org/TR/html4/loose.dtd">
\&    <html>
\&    <head>
\&        <meta http-equiv="Content-type" 
\&            content="text/html;charset=ISO-8859-1">
\&        <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
\&        <title>KinoSearch: $q</title>
\&    </head>
.Ve
.PP
.Vb 1
\&    <body>
.Ve
.PP
.Vb 10
\&        <div id="navigation">
\&            <form id="usconSearch" action="">
\&                <strong>
\&                Search the <a href="$base_url/index.html">US Constitution</a>:
\&                </strong>
\&                <input type="text" name="q" id="q" value="$q">
\&                <input type="submit" value="=&gt;">
\&                <input type="hidden" name="offset" value="0">
\&            </form>
\&        </div><!--navigation-->
.Ve
.PP
.Vb 1
\&        <div id="bodytext">
.Ve
.PP
.Vb 1
\&        $report
.Ve
.PP
.Vb 1
\&        $num_hits_info
.Ve
.PP
.Vb 8
\&        <p style="font-size: smaller; color: #666">
\&            <em>Powered by 
\&                <a href="http://www.rectangular.com/kinosearch/">
\&                    KinoSearch
\&                </a>
\&            </em>
\&        </p>
\&        </div><!--bodytext-->
.Ve
.PP
.Vb 1
\&    </body>
.Ve
.PP
.Vb 2
\&    </html>
\&    END_HTML
.Ve
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
Copyright 2005\-2006 Marvin Humphrey
.SH "LICENSE, DISCLAIMER, BUGS, etc."
.IX Header "LICENSE, DISCLAIMER, BUGS, etc."
See KinoSearch version 0.15.