#!/usr/bin/perl -w
# Copyright (C) 2002-2003 Nadav Har'El and Dan Kenigsberg
#
# Packs a sorted, binarized wordlist into, into one dictionary with
# or'ed prefix hints, a description file, and a stem file.
# Usage: cat sorted_bin |
# pack-desc.pl -p prefixesout -d descout -s stemsout > wordsout
#
use Carp;
use Getopt::Std;
my %opts;
# -p - output prefix file.
# -d - output description file.
# -s - output stems file.
# -l - output dictionary sizes that are relevant to linginfo
if(!getopts('p:d:s:l:', \%opts)){
exit(1);
}
my $out_prefixes=$opts{p};
my $out_desc=$opts{d};
my $out_stems=$opts{s};
my $out_lingsizes=$opts{l};
my ($lastword,$lastspecifiers, $flatlen);
my ($stems,$dmasks,$specifiers,$c,%pointers);
my @words;
$c=0;
while(<>){
# I want to print this message only *after* the script started reading
# its input - but only once.
print STDERR "reading input sorted flat file...\n"
if !defined($lastword);
chomp;
next unless m/^(.*)\t(.*)\t(.*)\t(.*)$/;
my ($word,$specifier,$dmask,$stem) = ($1,$2,$3,$4);
# found a new word. output the packed one.
if($lastword && $lastword ne $word) {
$pointers{$lastword} = $c;
$stems =~ s/:$//o;
push @words, "$lastword\t$lastspecifiers\t$dmasks\t$stems";
# $flatlen += $#lastword+2+($#dmasks+1)/2*5+2;
$stems="";$dmasks="";$specifiers=0;$c++;
# it takes ages. let's notify the package builder.
print STDERR "#" if !($c%1000) ;
}
$stems .= "$stem:";
$dmasks .= $dmask;
$specifiers |= $specifier;
$lastword = $word;
$lastspecifiers = $specifiers;
}
# reached EOF. output the final word. TODO: don't do this ugly copy-paste of
# code. call a proper function!
$pointers{$lastword} = $c;
$stems =~ s/:$//o;
push @words, "$lastword\t$lastspecifiers\t$dmasks\t$stems";
$stems="";$dmasks="";$specifiers=0;$c++;
print STDERR "\nwriting output files...";
if ($out_prefixes) {
open(PREFIXES,">$out_prefixes")
or croak "Couldn't write -p parameter '$out_prefixes'";
}
if ($out_desc) {
open(DESCS,">$out_desc")
or croak "Couldn't write -d parameter '$out_desc'";
}
if ($out_stems) {
open(STEMS,">$out_stems")
or croak "Couldn't write -s parameter '$out_stems'";
}
foreach (@words) {
m/^(.*)\t(.*)\t(.*)\t(.*)$/o;
my ($word,$specifier,$dcodes,$stems) =($1,$2,$3,$4);
print $word,"\n";
print PREFIXES chr($specifier) if $out_prefixes;
print DESCS $dcodes,"\n" if $out_desc;
$flatlen += length($word)+1;
if ($out_stems) {
foreach (split(':',$stems)) {
my $i = $pointers{$_};
my $c1 = $i%94;
my $c2 = ($i-$c1)/94%94;
my $c3 = ($i-$c1-$c2)/94/94;
print STEMS chr(33+$c1).chr(33+$c2).chr(33+$c3);
$flatlen += 5;
}
print STEMS "\n";
$flatlen += 2;
}
}
close PREFIXES if $out_prefixes;
close DESCS if $out_desc;
close STEMS if $out_stems;
print STDERR "creating $out_lingsizes...\n";
open(DESC_SIZES,">$out_lingsizes") or die "cannot write $out_lingsizes\n";
#print DESC_SIZES "#define FLATSIZE ".$flatlen."\n";
#print DESC_SIZES "#define LOOKUPLEN ".($#words+1)."\n";
print DESC_SIZES $flatlen." ".($#words+1)."\n";
close DESC_SIZES;
syntax highlighted by Code2HTML, v. 0.9.1