/* Copyright (C) 2003-2004 Nadav Har'El and Dan Kenigsberg */
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dict_radix.h"
#include "hspell.h"
#include "linginfo.h"
/* Debug switch: while this is non-zero, the functions in this file print
 * diagnostic messages to stderr. It is initialised to 0 (quiet).
 * TODO: compile out debug code in production version... */
int hspell_debug=0;
/* Path of the dictionary file handed to read_dict(); can be overridden at
 * build time by defining DICTIONARY_BASE. */
#ifndef DICTIONARY_BASE
#define DICTIONARY_BASE "./hebrew.wgz"
#endif

/*
 * load_data() creates a new dictionary with new_dict_radix(), stores it in
 * *dictp and fills it from DICTIONARY_BASE with read_dict().
 *
 * Returns 0 on success, -1 if the dictionary could not be allocated or read.
 * On failure the partially set up dictionary is released again and *dictp is
 * set to NULL, so the caller is never left holding a pointer to an unusable
 * (or leaked) dictionary.
 *
 * When hspell_debug is set, the time spent loading is reported on stderr.
 */
static int
load_data(struct dict_radix **dictp)
{
	/* t1 is initialised so that it is never read uninitialised, even if
	 * hspell_debug is changed while the dictionary is being loaded. */
	clock_t t1 = 0, t2;

	if(hspell_debug){
		fprintf(stderr,"Loading data files... ");
		t1=clock();
	}

	*dictp = new_dict_radix();
	if(!*dictp)
		return -1;

	if(!read_dict(*dictp, DICTIONARY_BASE)){
		/* don't leak the dictionary allocated above */
		delete_dict_radix(*dictp);
		*dictp = NULL;
		return -1;
	}

	if(hspell_debug){
		t2=clock();
		/* Scale in floating point: CLOCKS_PER_SEC/1000 would be zero
		 * (and the division would fault) on a platform whose clock
		 * ticks fewer than 1000 times per second. */
		fprintf(stderr,"done (%d ms).\n",
			(int)((double)(t2-t1)*1000.0/CLOCKS_PER_SEC));
	}
	return 0;
}
/*
* The prefix tree "prefix_tree" is built by build_prefix_tree, from a list of
* known combinations of prefixes. Each prefix also has a mask that determines
* to what kind of words it can be applied.
*
* The list of known prefixes and masks were defined in the prefixes[] and
* masks[] arrays in prefixes.c. That file is automatically generated by the
* genprefixes.pl program.
*/
#include "prefixes.c"
/* One node of the prefix tree. The path of letters leading from the root to
 * a node spells a prefix; the root itself stands for the empty prefix. */
struct prefix_node {
/* if a prefix has a certain 'mask', and lookup on a word returns
* 'val' (a bitmask of prefixes allowed for it), our prefix is
* allowed on this word if and only if (mask & val)!=0.
*
* This means that 'mask' defines the bits that this prefix "supplies"
* and the 'val' defined for a word is the bits this word insists on
* getting at least one of (i.e., val is the list of types of
* prefixes that are allowed for this word).
*/
int mask;
/* One child slot per letter in the range 'à'..'ú' (the range this file
* treats as Hebrew letters), indexed by letter-'à'. A slot stays zero when
* no known prefix continues with that letter. */
struct prefix_node *next['ú'-'à'+1];
};
/* Root of the prefix tree. It is 0 until build_prefix_tree() has run and is
 * reset to 0 by hspell_uninit(). */
static struct prefix_node *prefix_tree = 0;
/*
 * build_prefix_tree() inserts every known prefix into the global prefix_tree
 * and records the prefix's mask in the node that the prefix leads to.
 *
 * allow_he_hasheela: when non-zero the prefixes_H[]/masks_H[] tables are
 *     used, otherwise prefixes_noH[]/masks_noH[]. Both tables are
 *     terminated by a null prefix pointer.
 *
 * Nodes are allocated zero-filled, so a node that lies on the path of a
 * longer prefix but is not a listed prefix itself keeps mask 0.
 *
 * If a node cannot be allocated the function stops and returns, leaving the
 * prefixes inserted so far in place, instead of dereferencing a null
 * pointer. The function returns void, so the failure cannot be reported to
 * the caller; it is only mentioned on stderr when hspell_debug is set.
 */
static void
build_prefix_tree(int allow_he_hasheela){
	char **prefixes = allow_he_hasheela ? prefixes_H : prefixes_noH;
	int *masks = allow_he_hasheela ? masks_H : masks_noH;
	int i;

	for(i=0; prefixes[i]; i++){
		const char *p=prefixes[i];
		struct prefix_node **n=&prefix_tree;

		if(hspell_debug)
			fprintf(stderr,"prefix %s ",p);

		/* Walk down the tree one letter at a time, creating every
		 * node on the way, including the final one, which gets the
		 * mask. */
		for(;;){
			if(!*n){
				*n=calloc(1,sizeof(**n));
				if(!*n){
					if(hspell_debug)
						fprintf(stderr,"out of memory\n");
					return;
				}
			}
			if(!*p)
				break;
			n=&((*n)->next[*p-'à']);
			p++;
		}

		(*n)->mask=masks[i];
		if(hspell_debug)
			fprintf(stderr,"mask=%d\n",(*n)->mask);
	}
}
/*
 * free_prefix_tree() releases the node n together with the whole subtree
 * hanging below it, children first. Passing a null pointer is allowed and
 * does nothing.
 */
static void
free_prefix_tree(struct prefix_node *n)
{
	struct prefix_node **child, **end;

	if(n){
		end = n->next + sizeof(n->next)/sizeof(n->next[0]);
		for(child = n->next; child != end; child++)
			free_prefix_tree(*child);
		free(n);
	}
}
/*
 * hspell_check_word() decides whether "word" is accepted as correctly
 * spelled.
 *
 * Leading characters outside the letter range 'à'..'ú' are skipped; a word
 * with no letter in that range is accepted as is. From the first letter on,
 * the function walks the global prefix_tree one letter at a time. At each
 * position it looks the rest of the word up in dict and accepts when the
 * lookup result shares at least one bit with the mask of the prefix
 * consumed so far.
 *
 * dict:    dictionary handed to lookup().
 * word:    NUL-terminated word to check.
 * preflen: output. Reset to 0 on entry, then incremented for every skipped
 *          leading character, every skipped '"' and every prefix letter
 *          consumed before the function returned.
 *
 * Returns 1 when the word is accepted and 0 when it is not recognized.
 *
 * NOTE(review): while prefix_tree is 0 (tree not built) every word that
 * contains a letter is rejected.
 */
int
hspell_check_word(struct dict_radix *dict, const char *word, int *preflen)
{
int hashebrew;
const char *w=word;
struct prefix_node *n;
*preflen = 0;
/* Skip everything before the first letter; words without any letter
* ("empty" words) are ignored: */
hashebrew=0;
while(*w){
if(*w>='à' && *w<='ú'){
hashebrew=1;
break;
}
(*preflen)++;
w++;
}
if(!hashebrew)
return 1; /* ignore (accept) empty words */
/* start at the root of the prefix tree, i.e. with the empty prefix */
n=prefix_tree;
if(hspell_debug)
fprintf(stderr,"looking %s\n",w);
while(*w && n){
/* eat up the " if necessary, to recognize words like
* ä"ùèéç". or äéãéòä ù"äîéãò...".
* See the Academy's punctuation rules (see ìùåððå ìòí, èáú,
* úùñ"á) for an explanation of this rule (we probably don't
* support here everything they suggested; in particular I
* don't recognize a single quote as valid form of merchaot).
*/
if(*w=='"'){
(*preflen)++;
w++;
continue;
}
/* The first case here is the Academia's "ha-ktiv hasar
* ha-niqqud" rule of doubling a consonant waw in the middle of
* a word, unless it's already next to a waw. When adding a
* prefix, any initial waw in a word will necessarily
* become a consonant waw in the middle of the word.
* The "else" branch below is the normal check.
*
* n!=prefix_tree means at least one prefix letter was consumed,
* which is what makes reading w[-1] valid here. When the waw is
* not followed by a second waw nothing is looked up at this
* position.
*/
if(n!=prefix_tree && *w=='å' && w[-1]!='å'){
if(w[1]=='å'){
/* w[1] is a letter, so w[2] is at worst the terminating NUL */
if(w[2]!='å' && (lookup(dict,w+1) & n->mask)){
/* for example: äååòã */
if(hspell_debug)
fprintf(stderr,"found %s: double waw.\n",w);
return 1;
} else if(lookup(dict,w) & n->mask){
/* for example: äååéí */
if(hspell_debug)
fprintf(stderr,"found %s: nondouble waw.\n",w);
return 1;
}
}
} else {
if (hspell_debug) fprintf (stderr, "tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
if(lookup(dict,w) & n->mask) return 1; /* found word! */
}
/* try the next prefix: treat the current letter as one more prefix
* letter and descend in the tree (n becomes 0 when no known prefix
* continues with this letter, which ends the loop). */
if(*w>='à' && *w<='ú'){
n=n->next[*w-'à'];
(*preflen)++;
w++;
} else {
break;
}
}
if(n && !*w){
/* allow prefix followed by nothing (or a non-word like
* number, maqaf, etc.) */
if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
return 1;
} else
return 0; /* unrecognized (mis-spelled) word */
}
/*
 * hspell_enum_splits() reports the ways in which "word" can be split into
 * a known prefix followed by a dictionary word.
 *
 * It follows the same walk over prefix_tree as hspell_check_word() (the
 * code is a copy of that function; TODO: use the same code for both
 * functions), but instead of returning at the first match it calls enumf
 * for every match and keeps going.
 *
 * dict:  dictionary handed to lookup().
 * word:  NUL-terminated word to split.
 * enumf: callback, called as enumf(word, w, preflen, mask) where w points
 *        into word at the start of the matched remainder, preflen is this
 *        function's running count of consumed characters and mask is the
 *        mask of the prefix consumed so far.
 *
 * Returns the number of times enumf was called, or -1 when the word
 * contains no letter in the range 'à'..'ú'.
 */
int hspell_enum_splits(struct dict_radix *dict, const char *word,
hspell_word_split_callback_func *enumf)
{
int preflen=0, count=0;
int hashebrew;
const char *w=word;
struct prefix_node *n;
/* Skip everything before the first letter; words without any letter
* ("empty" words) are ignored: */
hashebrew=0;
while(*w){
if(*w>='à' && *w<='ú'){
hashebrew=1;
break;
}
preflen++;
w++;
}
if(!hashebrew)
return -1; /* ignore empty words */
/* start at the root of the prefix tree, i.e. with the empty prefix */
n=prefix_tree;
if(hspell_debug)
fprintf(stderr,"enum_splits looking %s\n",w);
while(*w && n){
/* eat up the " if necessary, to recognize words like
* ä"ùèéç". or äéãéòä ù"äîéãò...".
* See the Academy's punctuation rules (see ìùåððå ìòí, èáú,
* úùñ"á) for an explanation of this rule (we probably don't
* support here everything they suggested; in particular I
* don't recognize a single quote as valid form of merchaot).
*/
if(*w=='"'){
preflen++;
w++;
continue;
}
/* The first case here is the Academia's "ha-ktiv hasar
* ha-niqqud" rule of doubling a consonant waw in the middle of
* a word, unless it's already next to a waw. When adding a
* prefix, any initial waw in a word will necessarily
* become a consonant waw in the middle of the word.
* The "else" branch below is the normal check.
*
* n!=prefix_tree means at least one prefix letter was consumed,
* which is what makes reading w[-1] valid here.
*/
if(n!=prefix_tree && *w=='å' && w[-1]!='å'){
if(w[1]=='å'){
if(w[2]!='å' && (lookup(dict,w+1) & n->mask)){
/* step over the doubled waw: the reported remainder starts at
* the second waw */
w++;
/* for example: äååòã */
if(hspell_debug)
fprintf(stderr,"found %s: double waw.\n",w);
/* preflen is passed before being incremented; it is incremented
* once here although w advances twice in this branch */
enumf(word, w, preflen++, n->mask);
n=n->next[*w-'à']; w++;
count++;
continue;
} else if(lookup(dict,w) & n->mask){
/* for example: äååéí */
if(hspell_debug)
fprintf(stderr,"found %s: nondouble waw.\n",w);
enumf(word, w, preflen++, n->mask);
n=n->next[*w-'à']; w++;
count++;
continue;
}
}
} else {
if (hspell_debug) fprintf (stderr, "enum_splits: tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
if(lookup(dict,w) & n->mask) {
/* found a split: report it, then go on treating the current
* letter as a further prefix letter.
* NOTE(review): unlike the step at the end of the loop, *w is not
* range-checked before it indexes n->next here; this presumably
* relies on lookup() only matching words that start with a
* letter - confirm. */
enumf(word, w, preflen++, n->mask);
n=n->next[*w-'à']; w++;
count++;
continue;
} /* found word! */
}
/* try the next prefix: treat the current letter as one more prefix
* letter and descend in the tree (n becomes 0 when no known prefix
* continues with this letter, which ends the loop). */
if(*w>='à' && *w<='ú'){
n=n->next[*w-'à'];
preflen++;
w++;
} else {
break;
}
}
if(n && !*w){
/* allow prefix followed by nothing (or a non-word like
* number, maqaf, etc.) */
if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
enumf(word, w, preflen, n->mask);
count++;
} /* else
return 0; unrecognized (mis-spelled) word */
if (hspell_debug) fprintf(stderr, "enum_splits found %d splits\n", count);
return count;
}
/*
 * hspell_trycorrect() tries to find corrections for the word w.
 *
 * A series of candidate spellings is derived from w (see the comment before
 * each step below). Every candidate that hspell_check_word() accepts is
 * added to the correction list cl with corlist_add().
 *
 * dict: dictionary used to check the candidates.
 * w:    NUL-terminated word to correct.
 * cl:   correction list that receives the accepted candidates.
 *
 * Candidates are built in a fixed 30-byte buffer. A candidate that does not
 * fit is skipped: a truncated string is not a respelling of w, and must not
 * be offered as one.
 */
void
hspell_trycorrect(struct dict_radix *dict, const char *w, struct corlist *cl)
{
	char buf[30];
	int i;
	int len=strlen(w), preflen;
	static const char *const similar[] = {"äòà", "âä", "ëç", "úè", "öñ", "ùñ",
		"ë÷", "áå", "ôá"};

/* TRYBUF checks the candidate currently held in buf and records it in cl
 * if it is accepted (preflen is only needed as an output argument). */
#define TRYBUF do { \
		if(hspell_check_word(dict, buf, &preflen)) \
			corlist_add(cl, buf); \
	} while(0)
/* TRYFMT formats a candidate into buf, printf-style, and checks it with
 * TRYBUF unless snprintf() failed or had to truncate the candidate. */
#define TRYFMT(...) do { \
		int fmtlen_=snprintf(buf,sizeof(buf),__VA_ARGS__); \
		if(fmtlen_>=0 && (size_t)fmtlen_<sizeof(buf)) \
			TRYBUF; \
	} while(0)

	/* try to add a missing em kri'a - yud or vav */
	for(i=1;i<len;i++){
		TRYFMT("%.*sé%s",i,w,w+i);
		TRYFMT("%.*så%s",i,w,w+i);
	}

	/* try to remove an em kri'a - yud or vav */
	/* NOTE: in hspell.pl the loop was from i=0 to i<len... */
	for(i=1;i<len-1;i++){
		if(w[i]=='é' || w[i]=='å')
			TRYFMT("%.*s%s",i,w,w+i+1);
	}

	/* try to add or remove an aleph (is that useful?) */
	/* TODO: don't add an aleph next to yud or non-double vav,
	 * as it can't be an em kria there? */
	for(i=1;i<len;i++)
		TRYFMT("%.*sà%s",i,w,w+i);
	for(i=1;i<len-1;i++){
		if(w[i]=='à')
			TRYFMT("%.*s%s",i,w,w+i+1);
	}

	/* try to replace similarly sounding (for certain people) letters:
	 */
	for(i=0;i<len;i++){
		size_t group;
		for(group=0; group<sizeof(similar)/sizeof(similar[0]); group++){
			const char *g;
			/* w[i] is never NUL here (i<len), so strchr() cannot
			 * match the terminator of the group string. */
			if(!strchr(similar[group],w[i]))
				continue;
			/* character in group - try the other ones
			 * in this group! */
			for(g=similar[group];*g;g++){
				if(*g==w[i])
					continue;
				if(i>0 && w[i]=='å' && w[i+1]=='å')
					/* a doubled waw inside the word is
					 * replaced as a unit */
					TRYFMT("%.*s%c%s",i,w,*g,w+i+2);
				else if(*g=='å')
					/* ...and a waw is inserted doubled */
					TRYFMT("%.*såå%s",i,w,w+i+1);
				else
					TRYFMT("%.*s%c%s",i,w,*g,w+i+1);
			}
		}
	}

	/* try to replace a non-final letter at the end of the word by its
	 * final form and vice versa (useful check for abbreviations).
	 * This is only possible for a non-empty word that fits in buf together
	 * with its terminating NUL: otherwise w[len-1] or buf[len-1] would lie
	 * outside the string or the buffer. */
	if(len>0 && (size_t)len<sizeof(buf)){
		memcpy(buf,w,(size_t)len+1);
		switch(w[len-1]){
		case 'ê': buf[len-1]='ë'; break;
		case 'í': buf[len-1]='î'; break;
		case 'ï': buf[len-1]='ð'; break;
		case 'õ': buf[len-1]='ö'; break;
		case 'ó': buf[len-1]='ô'; break;
		case 'ë': buf[len-1]='ê'; break;
		case 'î': buf[len-1]='í'; break;
		case 'ð': buf[len-1]='ï'; break;
		case 'ö': buf[len-1]='õ'; break;
		case 'ô': buf[len-1]='ó'; break;
		default: break;
		}
		if(buf[len-1]!=w[len-1])
			TRYBUF;
	}

	/* try to make the word into an acronym (add " before last character) */
	if(len>=2)
		TRYFMT("%.*s\"%c",len-1,w,w[len-1]);

	/* try to make the word into an abbreviation (add ' at the end) */
	TRYFMT("%s'",w);
#undef TRYFMT
}
/*
 * hspell_init() prepares the library for use: it loads the dictionary into
 * a newly allocated structure returned through dictp, builds the prefix
 * tree, and - when compiled with USE_LINGINFO and HSPELL_OPT_LINGUISTICS is
 * set in flags - initializes the linguistic information as well.
 *
 * The HSPELL_OPT_HE_SHEELA bit of flags selects which prefix tables the
 * prefix tree is built from.
 *
 * Returns 0 on success, or a negative number on error:
 *   -1: cannot read dictionary (also returned when linginfo_init() fails).
 *
 * NOTE(review): when linginfo_init() fails, the dictionary and the prefix
 * tree that were already set up are not released here.
 */
int
hspell_init(struct dict_radix **dictp, int flags){
	int status=load_data(dictp);

	if(status<0)
		return status;

	build_prefix_tree(flags & HSPELL_OPT_HE_SHEELA);

#ifdef USE_LINGINFO
	if((flags & HSPELL_OPT_LINGUISTICS) && !linginfo_init(DICTIONARY_BASE))
		return -1;
#endif
	return 0;
}
/* TODO: hspell_init should use a new "hspell_context" structure, not
dict_radix. Because we might want to add more things like user dictionary.
The prefix tree should also sit in the hspell_context, instead of
being a global variable: the current mishmash of globals and non-globals
is ugly.
Linginfo's global variables (see linginfo_init and linginfo_free)
should also be in this context.
*/
/*
 * hspell_uninit() undoes the effects of hspell_init(), freeing the memory
 * allocated during initialization: the dictionary, the prefix tree and,
 * when compiled with USE_LINGINFO, the linguistic information.
 *
 * Like with free(), the dict pointer is no longer valid once this function
 * returns and must not be used again.
 */
void
hspell_uninit(struct dict_radix *dict)
{
	delete_dict_radix(dict);

	/* The prefix tree lives in a file-level global rather than in a
	 * "context" handed to us as a parameter, so it is released here and
	 * the global is cleared so that no stale pointer is left behind. */
	free_prefix_tree(prefix_tree);
	prefix_tree=NULL;

#ifdef USE_LINGINFO
	linginfo_free();
#endif
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */