#!/usr/bin/perl -w
# Copyright (C) 2002-2003 Nadav Har'El and Dan Kenigsberg
#
# Packs a sorted, binarized wordlist into, into one dictionary with
# or'ed prefix hints, a description file, and a stem file.
# Usage: cat sorted_bin | 
#	pack-desc.pl -p prefixesout -d descout -s stemsout > wordsout
#

use Carp;


use Getopt::Std;
my %opts;
# -p - output prefix file.
# -d - output description file.
# -s - output stems file.
# -l - output dictionary sizes that are relevant to linginfo
if(!getopts('p:d:s:l:', \%opts)){
	exit(1);
}

my $out_prefixes=$opts{p};
my $out_desc=$opts{d};
my $out_stems=$opts{s};
my $out_lingsizes=$opts{l};

my ($lastword,$lastspecifiers, $flatlen);
my ($stems,$dmasks,$specifiers,$c,%pointers);
my @words;
$c=0;
while(<>){
	# I want to print this message only *after* the script started reading
	# its input - but only once.
	print STDERR "reading input sorted flat file...\n"
		 if !defined($lastword);
	chomp;
	next unless m/^(.*)\t(.*)\t(.*)\t(.*)$/;
	my ($word,$specifier,$dmask,$stem) = ($1,$2,$3,$4);
	# found a new word. output the packed one.
	if($lastword && $lastword ne $word) {
		$pointers{$lastword} = $c;
		$stems =~ s/:$//o;
		push @words, "$lastword\t$lastspecifiers\t$dmasks\t$stems";
#		$flatlen += $#lastword+2+($#dmasks+1)/2*5+2;
		$stems="";$dmasks="";$specifiers=0;$c++;
		# it takes ages. let's notify the package builder.
		print STDERR "#" if !($c%1000) ;
	}
	$stems .= "$stem:";
	$dmasks .= $dmask;
	$specifiers |= $specifier;

	$lastword = $word;
	$lastspecifiers = $specifiers;
}
# reached EOF. output the final word. TODO: don't do this ugly copy-paste of
# code. call a proper function!
		$pointers{$lastword} = $c;
		$stems =~ s/:$//o;
		push @words, "$lastword\t$lastspecifiers\t$dmasks\t$stems";
		$stems="";$dmasks="";$specifiers=0;$c++;

print STDERR "\nwriting output files...";
if ($out_prefixes) {
	open(PREFIXES,">$out_prefixes") 
		or croak "Couldn't write -p parameter '$out_prefixes'";
}
if ($out_desc) {
	open(DESCS,">$out_desc")
		or croak "Couldn't write -d parameter '$out_desc'";
}
if ($out_stems) {
	open(STEMS,">$out_stems")
		or croak "Couldn't write -s parameter '$out_stems'";
}

foreach (@words) {
	m/^(.*)\t(.*)\t(.*)\t(.*)$/o;
	my ($word,$specifier,$dcodes,$stems) =($1,$2,$3,$4);
	print $word,"\n";
	print PREFIXES chr($specifier) if $out_prefixes;
	print DESCS $dcodes,"\n" if $out_desc;
	$flatlen += length($word)+1;
	if ($out_stems) {
		foreach (split(':',$stems)) {
			my $i = $pointers{$_};
			my $c1 = $i%94;
			my $c2 = ($i-$c1)/94%94;
			my $c3 = ($i-$c1-$c2)/94/94;
			print STEMS chr(33+$c1).chr(33+$c2).chr(33+$c3);
			$flatlen += 5;
		}
		print STEMS "\n";
		$flatlen += 2;
	}
}

close PREFIXES if $out_prefixes;
close DESCS if $out_desc;
close STEMS if $out_stems;

print STDERR "creating $out_lingsizes...\n";

open(DESC_SIZES,">$out_lingsizes") or die "cannot write $out_lingsizes\n";
#print DESC_SIZES "#define FLATSIZE ".$flatlen."\n";
#print DESC_SIZES "#define LOOKUPLEN ".($#words+1)."\n";
print DESC_SIZES $flatlen." ".($#words+1)."\n";
close DESC_SIZES;



syntax highlighted by Code2HTML, v. 0.9.1