#!/usr/bin/perl -w
# Copyright (C) 2002-2004 Nadav Har'El and Dan Kenigsberg
#
# converts the textual linguistic information in wolig-d-dictionaries,
# into binary data.
# Usage: cat dict1 dict2 ... | binarize-desc.pl > dict_bin_out
#
use Carp;
require "PrefixBits.pl";
# "perl -w" warns about variables only used once (it assumes they are a
# typo). This ugliness gets rid of this warning. Is there a more sensible way?
($PS_L,$PS_B,$PS_VERB,$PS_NONDEF,$PS_IMPER,$PS_MISC)=
($PS_L,$PS_B,$PS_VERB,$PS_NONDEF,$PS_IMPER,$PS_MISC);
my $specifier;
############ description handlng
my $D_NOUN=1;
my $D_VERB=2*$D_NOUN;
my $D_ADJ=3*$D_NOUN;
my $D_TYPEMASK=3*$D_NOUN;
my $D_GENDERBASE=4*$D_NOUN;
my $D_MASCULINE=1*$D_GENDERBASE;
my $D_FEMININE=2*$D_MASCULINE;
my $D_GENDERMASK=3*$D_GENDERBASE;
my $D_GUFBASE=4*$D_GENDERBASE;
my $D_FIRST=$D_GUFBASE*1;
my $D_SECOND=$D_GUFBASE*2;
my $D_THIRD=$D_GUFBASE*3;
my $D_GUFMASK=$D_GUFBASE*3;
my $D_NUMBASE=4*$D_GUFBASE;
my $D_SINGULAR=1*$D_NUMBASE;
my $D_DOUBLE=2*$D_NUMBASE;
my $D_PLURAL=3*$D_NUMBASE;
my $D_NUMMASK=3*$D_NUMBASE;
my $D_TENSEBASE=4*$D_NUMBASE;
my $D_INFINITIVE=1*$D_TENSEBASE;
my $D_BINFINITIVE=6*$D_TENSEBASE;
my $D_PAST=2*$D_TENSEBASE;
my $D_PRESENT=3*$D_TENSEBASE;
my $D_FUTURE=4*$D_TENSEBASE;
my $D_IMPERATIVE=5*$D_TENSEBASE;
my $D_TENSEMASK=7*$D_TENSEBASE;
my $D_OGENDERBASE=8*$D_TENSEBASE;
my $D_OMASCULINE=1*$D_OGENDERBASE;
my $D_OFEMININE=2*$D_OMASCULINE;
my $D_OGENDERMASK=3*$D_OGENDERBASE;
my $D_OGUFBASE=4*$D_OGENDERBASE;
my $D_OFIRST=$D_OGUFBASE*1;
my $D_OSECOND=$D_OGUFBASE*2;
my $D_OTHIRD=$D_OGUFBASE*3;
my $D_OGUFMASK=3*$D_OGUFBASE;
my $D_ONUMBASE=4*$D_OGUFBASE;
my $D_OSINGULAR=1*$D_ONUMBASE;
my $D_ODOUBLE=2*$D_ONUMBASE;
my $D_OPLURAL=3*$D_ONUMBASE;
my $D_ONUMMASK=3*$D_ONUMBASE;
my $D_OMASK=3*$D_ONUMBASE+3*$D_OGUFBASE+3*$D_OMASCULINE;
my $D_OSMICHUT=4*$D_ONUMBASE;
my $D_SPECNOUN=2*$D_OSMICHUT;
my $D_STARTBIT=2*$D_SPECNOUN;
#print STDERR "finalbit $D_STARTBIT\n";
sub text2mask {
my $dmask = 0;
my $desc = shift;
return 0 if !$desc;
if($desc=~m/^([^א-ת]|^)פ([^א-ת]|$)/o) {$dmask |= $D_VERB}
elsif($desc=~m/([^א-ת]|^)ע([^א-ת]|$)/o) {$dmask |= $D_NOUN}
elsif($desc=~m/([^א-ת]|^)ת([^א-ת]|$)/o) {$dmask |= $D_ADJ}
if($desc=~m/,עבר([^א-ת]|$)/o) {$dmask |= $D_PAST}
elsif($desc=~m/,הווה([^א-ת]|$)/o) {$dmask |= $D_PRESENT}
elsif($desc=~m/,עתיד([^א-ת]|$)/o) {$dmask |= $D_FUTURE}
elsif($desc=~m/,ציווי([^א-ת]|$)/o) {$dmask |= $D_IMPERATIVE}
elsif($desc=~m/,מקור([^א-ת]|$)/o) {$dmask |= $D_INFINITIVE}
if(($dmask & $D_TYPEMASK) == $D_VERB) {
if($desc=~m/,יחיד([^א-ת]|$)/o) {$dmask |= $D_MASCULINE | $D_SINGULAR}
elsif($desc=~m/,יחידה([^א-ת]|$)/o) {$dmask |= $D_FEMININE | $D_SINGULAR}
elsif($desc=~m/,רבים([^א-ת]|$)/o) {$dmask |= $D_MASCULINE | $D_PLURAL}
elsif($desc=~m/,רבות([^א-ת]|$)/o) {$dmask |= $D_FEMININE | $D_PLURAL}
}
# currently, wolig and woo have confusing -d output with regards
# to gender.
if((($dmask & $D_TYPEMASK) == $D_NOUN) ||
(($dmask & $D_TYPEMASK) == $D_ADJ)) {
if($desc=~m/,יחיד([^א-ת]|$)/o) {$dmask |= $D_SINGULAR}
elsif($desc=~m/,רבים([^א-ת]|$)/o) {$dmask |= $D_PLURAL}
if($desc=~m/([^א-ת]|^)ז([^א-ת]|$)/o) {$dmask |= $D_MASCULINE};
if($desc=~m/([^א-ת]|^)נ([^א-ת]|$)/o) {$dmask |= $D_FEMININE};
}
if($desc=~m/,אני/o) {$dmask |= $D_FIRST | $D_SINGULAR}
elsif($desc=~m/,אתה/o) {$dmask |= $D_SECOND | $D_SINGULAR | $D_MASCULINE}
elsif($desc=~m/,את([^א-ת]|$)/o) {$dmask |= $D_SECOND | $D_SINGULAR | $D_FEMININE}
elsif($desc=~m/,הוא([^א-ת]|$)/o) {$dmask |= $D_THIRD | $D_SINGULAR | $D_MASCULINE}
elsif($desc=~m/,היא([^א-ת]|$)/o) {$dmask |= $D_THIRD | $D_SINGULAR | $D_FEMININE}
elsif($desc=~m/,אנו/o) {$dmask |= $D_FIRST | $D_PLURAL}
elsif($desc=~m/,אתם/o) {$dmask |= $D_SECOND | $D_PLURAL | $D_MASCULINE}
elsif($desc=~m/,אתן([^א-ת]|$)/o) {$dmask |= $D_SECOND | $D_PLURAL | $D_FEMININE}
elsif($desc=~m/,הם([^א-ת]|$)/o) {$dmask |= $D_THIRD | $D_PLURAL | $D_MASCULINE}
elsif($desc=~m/,הן([^א-ת]|$)/o) {$dmask |= $D_THIRD | $D_PLURAL | $D_FEMININE}
if($desc=~m!/אני!o) {$dmask |= $D_OFIRST | $D_OSINGULAR}
elsif($desc=~m!/אתה!o) {$dmask |= $D_OSECOND | $D_OSINGULAR | $D_OMASCULINE}
elsif($desc=~m!/את([^א-ת]|$)!o) {$dmask |= $D_OSECOND | $D_OSINGULAR | $D_OFEMININE}
elsif($desc=~m!/הוא([^א-ת]|$)!o) {$dmask |= $D_OTHIRD | $D_OSINGULAR | $D_OMASCULINE}
elsif($desc=~m!/היא([^א-ת]|$)!o) {$dmask |= $D_OTHIRD | $D_OSINGULAR | $D_OFEMININE}
elsif($desc=~m!/אנו!o) {$dmask |= $D_OFIRST | $D_OPLURAL}
elsif($desc=~m!/אנחנו!o) {$dmask |= $D_OFIRST | $D_OPLURAL}
elsif($desc=~m!/אתם!o) {$dmask |= $D_OSECOND | $D_OPLURAL | $D_OMASCULINE}
elsif($desc=~m!/אתן([^א-ת]|$)!o) {$dmask |= $D_OSECOND | $D_OPLURAL | $D_OFEMININE}
elsif($desc=~m!/הם([^א-ת]|$)!o) {$dmask |= $D_OTHIRD | $D_OPLURAL | $D_OMASCULINE}
elsif($desc=~m!/הן([^א-ת]|$)!o) {$dmask |= $D_OTHIRD | $D_OPLURAL | $D_OFEMININE}
if($desc=~m!סמיכות!o) {$dmask |= $D_OSMICHUT}
if($desc=~m!פרטי!o) {$dmask |= $D_SPECNOUN}
return $dmask;
}
sub mask2text {
my $dmask = shift;
my $s;
return "" if !$dmask;
$s = ${ {$D_NOUN=>'ע', $D_VERB=>'פ', $D_ADJ=>'ת', 0=>'' } }{ ($dmask & $D_TYPEMASK) };
$s .= ${ {$D_MASCULINE=>',ז', $D_FEMININE=>',נ', 0=>'' } }
{ ($dmask & $D_GENDERMASK) };
$s .= ${ {$D_FIRST=>',1', $D_SECOND=>',2', $D_THIRD=>',3', 0=>'' } }{ ($dmask & $D_GUFMASK) };
$s .= ${ {$D_SINGULAR=>',יחיד', $D_DOUBLE=>',זוגי', $D_PLURAL=>',רבים', 0=>'' } }{ ($dmask & $D_NUMMASK) };
$s .= ${ {$D_PAST=>',עבר', $D_PRESENT=>',הווה', $D_FUTURE=>',עתיד', $D_IMPERATIVE=>',ציווי', $D_INFINITIVE=>',מקור', $D_BINFINITIVE=>',מקור,ב,', 0=>'' } }{ ($dmask & $D_TENSEMASK) };
$s .= ",פרטי" if ($dmask & $D_SPECNOUN);
$s .= ",סמיכות" if ($dmask & $D_OSMICHUT);
if ($dmask & $D_OMASK) {
$s .= ",כינוי/".${ {$D_OMASCULINE=>',ז', $D_OFEMININE=>',נ', 0=>'' } }{ ($dmask & $D_OGENDERMASK) };
$s .= ${ {$D_OFIRST=>',1', $D_OSECOND=>',2', $D_OTHIRD=>',3', 0=>'' } }{ ($dmask & $D_OGUFMASK) };
$s .= ${ {$D_OSINGULAR=>',יחיד', $D_ODOUBLE=>',זוגי', $D_OPLURAL=>',רבים', 0=>'' } }{ ($dmask & $D_ONUMMASK) };
}
return $s;
}
my (%pack_desc_hash,@dmasks,$stem);
############
print STDERR "reading input dictionaries...\n";
my $c=0;
while(<>){
chomp;
#next if /---/; # TODO: this isn't needed. remove it.
#s/-$//o; # TODO: dan added this. remove it.
s/\+ / /o; # The Makefile was supposed to remove those, but still...
if(/^L/o){
$specifier = $PS_L;
s/^L//o;
} elsif(/^B/o){
$specifier = $PS_B;
s/^B//o;
} elsif(!/^[א-ת]/o){
$stem = undef if m/---/;
next; # not a word
} elsif(/-$/o){
# In wolig.pl's simple output (without -d), this specified smichut,
# and we shouldn't allow prefixes with he hayedia. This case is
# useful for smichut words in extrawords.
$specifier = $PS_NONDEF;
s/-$//o;
} elsif(/ פ,/o) {
if(/ .*ציווי/o) {
$specifier = $PS_IMPER;
} elsif(!/ .*הווה/o) {
$specifier = $PS_VERB;
} elsif(/ .*סמיכות/) {
$specifier = $PS_NONDEF;
} else {
$specifier = $PS_ALL;
}
} elsif(/[ ,][עת],/) {
if (/ .*סמיכות/o || m:,של/:o || / .*פרטי/o) {
$specifier = $PS_NONDEF;
} else {
$specifier = $PS_ALL;
}
} else {
$specifier = $PS_ALL;
}
s/ (.*)$//; # remove all the "-d" explanations after the word
$stem = $_ if !defined($stem);
# $specifiers{$_} |= $specifier;
my $dmask = defined($1) ? &text2mask($1) : 0;
$dmask = $dmask & ~$D_INFINITIVE | $D_BINFINITIVE if $specifier==$PS_B;
my $dcode = $pack_desc_hash{$dmask};
if (!$dcode) {
my $i = 0+keys(%pack_desc_hash);
$dcode = chr(ord('A')+$i%26).chr(ord('A')+($i - $i%26)/26);
$pack_desc_hash{$dmask} = $dcode;
push @dmasks, $dmask;
}
print "$_\t$specifier\t$dcode\t$stem\n";
$c++;
print STDERR "#" if !($c%1000);
}
print STDERR "\ncreate dmask.c...\n";
open(DESC_C,">dmask.c") or die "cannot create dmask.c\n";
print DESC_C "/* This file is automatically generated by binarize-desc.pl.\n".
" DO NOT EDIT THIS FILE DIRECTLY!\n*/\n";
print DESC_C "int dmasks[] = {\n";
print DESC_C join(",\n", @dmasks),"\n};\n";
# the following segment was generate from this very perl code using
# grep '^my $D_' pmerge-bin | perl -pe 's/^my \$(D_.*)=.*$/#define $1 \$${1}/;'
print DESC_C << "EOF"
#define D_NOUN $D_NOUN
#define D_VERB $D_VERB
#define D_ADJ $D_ADJ
#define D_TYPEMASK $D_TYPEMASK
#define D_GENDERBASE $D_GENDERBASE
#define D_MASCULINE $D_MASCULINE
#define D_FEMININE $D_FEMININE
#define D_GENDERMASK $D_GENDERMASK
#define D_GUFBASE $D_GUFBASE
#define D_FIRST $D_FIRST
#define D_SECOND $D_SECOND
#define D_THIRD $D_THIRD
#define D_GUFMASK $D_GUFMASK
#define D_NUMBASE $D_NUMBASE
#define D_SINGULAR $D_SINGULAR
#define D_DOUBLE $D_DOUBLE
#define D_PLURAL $D_PLURAL
#define D_NUMMASK $D_NUMMASK
#define D_TENSEBASE $D_TENSEBASE
#define D_INFINITIVE $D_INFINITIVE
#define D_BINFINITIVE $D_BINFINITIVE
#define D_PAST $D_PAST
#define D_PRESENT $D_PRESENT
#define D_FUTURE $D_FUTURE
#define D_IMPERATIVE $D_IMPERATIVE
#define D_TENSEMASK $D_TENSEMASK
#define D_OGENDERBASE $D_OGENDERBASE
#define D_OMASCULINE $D_OMASCULINE
#define D_OFEMININE $D_OFEMININE
#define D_OGENDERMASK $D_OGENDERMASK
#define D_OGUFBASE $D_OGUFBASE
#define D_OFIRST $D_OFIRST
#define D_OSECOND $D_OSECOND
#define D_OTHIRD $D_OTHIRD
#define D_OGUFMASK $D_OGUFMASK
#define D_ONUMBASE $D_ONUMBASE
#define D_OSINGULAR $D_OSINGULAR
#define D_ODOUBLE $D_ODOUBLE
#define D_OPLURAL $D_OPLURAL
#define D_ONUMMASK $D_ONUMMASK
#define D_OMASK $D_OMASK
#define D_OSMICHUT $D_OSMICHUT
#define D_SPECNOUN $D_SPECNOUN
#define D_STARTBIT $D_STARTBIT
#define PS_ALL $PS_ALL
#define PS_B $PS_B
#define PS_L $PS_L
#define PS_VERB $PS_VERB
#define PS_NONDEF $PS_NONDEF
#define PS_IMPER $PS_IMPER
#define PS_MISC $PS_MISC
EOF
;
close DESC_C;
syntax highlighted by Code2HTML, v. 0.9.1