/* Copyright (C) 2003-2004 Nadav Har'El and Dan Kenigsberg */

/*
 * NOTE(review): the four bare #include directives below carry no header
 * name in this copy of the file; the names appear to have been lost.
 * The code below calls clock(), fprintf(), calloc()/free(), strlen(),
 * strncpy() and snprintf(), so presumably these were <stdio.h>,
 * <stdlib.h>, <string.h> and <time.h> - restore them from the upstream
 * source before building.
 */
#include
#include
#include
#include
#include "dict_radix.h"
#include "hspell.h"
#include "linginfo.h"

/* TODO: compile out debug code in production version... */
/* When non-zero, the functions in this file print trace messages to
 * stderr. */
int hspell_debug=0;

/*
 * Load the data files.
 *
 * Stores a dictionary freshly created by new_dict_radix() in *dictp and
 * fills it from DICTIONARY_BASE with read_dict().
 *
 * Returns 0 on success, -1 if read_dict() reported failure (returned zero).
 *
 * NOTE(review): on the failure path the dictionary already stored in *dictp
 * is not released here - confirm that callers free it.
 */
static int load_data(struct dict_radix **dictp) {
	clock_t t1, t2;
	if(hspell_debug){
		fprintf(stderr,"Loading data files... ");
		/* t1 is only assigned when debugging is on; it is only read
		 * in the matching debug block below. */
		t1=clock();
	}
	*dictp = new_dict_radix();
/* Default dictionary location, unless the build already defined one. */
#ifndef DICTIONARY_BASE
#define DICTIONARY_BASE "./hebrew.wgz"
#endif
	if(!read_dict(*dictp, DICTIONARY_BASE)){
		return -1;
	}
	if(hspell_debug){
		t2=clock();
		/* convert elapsed clock ticks to milliseconds */
		fprintf(stderr,"done (%d ms).\n", (int)((t2-t1)/(CLOCKS_PER_SEC/1000)));
	}
	return 0;
}

/*
 * The prefix tree "prefix_tree" is built by build_prefix_tree, from a list of
 * known combinations of prefixes. Each prefix also has a mask that determines
 * to what kind of words it can be applied.
 *
 * The list of known prefixes and masks were defined in the prefixes[] and
 * masks[] arrays in prefixes.c. That file is automatically generated by the
 * genprefixes.pl program.
 */
#include "prefixes.c"

/*
 * One node of the prefix tree. The path from the root to a node spells a
 * prefix, one letter per edge.
 */
struct prefix_node {
	/* if a prefix has a certain 'mask', and lookup on a word returns
	 * 'val' (a bitmask of prefixes allowed for it), our prefix is
	 * allowed on this word if and only if (mask & val)!=0.
	 *
	 * This means that 'mask' defines the bits that this prefix "supplies"
	 * and the 'val' defined for a word is the bits this word insists on
	 * getting at least one of (i.e., val is the list of types of
	 * prefixes that are allowed for this word).
	 */
	int mask;
	/* Child pointers, one slot per character in the range 'à'..'ú' (the
	 * range this file treats as Hebrew letters); indexed by letter-'à'. */
	struct prefix_node *next['ú'-'à'+1];
};

/* Root of the prefix tree; 0 until build_prefix_tree() has run. */
static struct prefix_node *prefix_tree = 0;

/*
 * Build (or extend) the global prefix tree.
 *
 * allow_he_hasheela: when non-zero the prefixes_H/masks_H tables are used,
 * otherwise prefixes_noH/masks_noH. The prefix table is walked until its
 * first null entry.
 *
 * NOTE(review): the results of calloc() are dereferenced without a null
 * check, and the index *p-'à' is not range-checked, so every character of
 * every prefix is assumed to lie in 'à'..'ú' - confirm against prefixes.c.
 */
static void build_prefix_tree(int allow_he_hasheela){
	int i;
	const char *p;
	struct prefix_node **n;
	char **prefixes;
	int *masks;
	if(allow_he_hasheela){
		prefixes=prefixes_H;
		masks=masks_H;
	} else {
		prefixes=prefixes_noH;
		masks=masks_noH;
	}
	for(i=0; prefixes[i]; i++){
		p=prefixes[i];
		/* n always points at the link (pointer slot) for the current
		 * position, so a missing node can be created in place. */
		n=&prefix_tree;
		if(hspell_debug) fprintf(stderr,"prefix %s ",p);
		while(*p){
			if(!(*n)) *n=(struct prefix_node *) calloc(1,sizeof(struct prefix_node));
			n=& ((*n)->next[*p-'à']);
			p++;
		}
		/* define the mask (making sure the node exists). */
		if(!*n) *n=(struct prefix_node *) calloc(1,sizeof(struct prefix_node));
		(*n)->mask=masks[i];
		if(hspell_debug) fprintf(stderr,"mask=%d\n",(*n)->mask);
	}
}

/*
 * Release the subtree rooted at n. A null n is accepted and ignored.
 */
static void free_prefix_tree(struct prefix_node *n) {
	/* free_prefix_tree recursively walks the tree, freeing all nodes */
	int i;
	if(!n) return;
	/* sizeof(n->next)/sizeof(n->next[0]) is the number of child slots */
	for(i=0; i< sizeof(n->next)/sizeof(n->next[0]); i++)
		free_prefix_tree(n->next[i]);
	free(n);
}

/*
 * Check whether 'word' is acceptable: some known prefix followed by a
 * dictionary word that allows that prefix.
 *
 * dict:    dictionary passed to lookup().
 * word:    the word to check.
 * preflen: output; set to the number of characters stepped over before the
 *          position at which the function stopped (leading characters
 *          outside 'à'..'ú', skipped '"' characters and consumed prefix
 *          letters).
 *
 * Returns 1 if the word is accepted, 0 if it is not recognized. A word
 * containing no character in 'à'..'ú' is accepted.
 */
int hspell_check_word(struct dict_radix *dict, const char *word, int *preflen) {
	int hashebrew;
	const char *w=word;
	struct prefix_node *n;
	*preflen = 0;
	/* ignore empty words: */
	/* skip ahead to the first Hebrew letter, counting what was skipped */
	hashebrew=0;
	while(*w){
		if(*w>='à' && *w<='ú'){
			hashebrew=1;
			break;
		}
		(*preflen)++;
		w++;
	}
	if(!hashebrew) return 1; /* ignore (accept) empty words */
	n=prefix_tree;
	if(hspell_debug) fprintf(stderr,"looking %s\n",w);
	/* Walk the prefix tree one letter at a time; at each step try the
	 * rest of the word against the dictionary. n becomes null once the
	 * letters consumed so far are not a known prefix. */
	while(*w && n){
		/* eat up the " if necessary, to recognize words like
		 * ä"ùèéç". or äéãéòä ù"äîéãò...".
		 * See the Academy's punctuation rules (see ìùåððå ìòí, èáú,
		 * úùñ"á) for an explanation of this rule (we probably don't
		 * support here everything they suggested; in particular I
		 * don't recognize a single quote as valid form of merchaot).
		 */
		if(*w=='"'){
			(*preflen)++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" branch below is the normal check.
		 */
		/* n!=prefix_tree means at least one prefix letter was consumed,
		 * so w[-1] is inside the word. */
		if(n!=prefix_tree && *w=='å' && w[-1]!='å'){
			if(w[1]=='å'){
				if(w[2]!='å' && (lookup(dict,w+1) & n->mask)){
					/* for example: äååòã */
					if(hspell_debug) fprintf(stderr,"found %s: double waw.\n",w);
					return 1;
				} else if(lookup(dict,w) & n->mask){
					/* for example: äååéí */
					if(hspell_debug) fprintf(stderr,"found %s: nondouble waw.\n",w);
					return 1;
				}
			}
			/* a single waw directly after a prefix is not looked up
			 * at all; the loop just moves on to the next letter */
		} else {
			if (hspell_debug) fprintf (stderr, "tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) return 1; /* found word! */
		}
		/* try the next prefix... */
		if(*w>='à' && *w<='ú'){
			n=n->next[*w-'à'];
			(*preflen)++;
			w++;
		} else {
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		return 1;
	} else return 0; /* unrecognized (mis-spelled) word */
}

/* This function copies, in a less than intelligent fashion, Nadav's code
 * from hspell_check_word. TODO: use the same code for both functions.
 */
/*
 * Enumerate the ways 'word' can be split into a prefix plus a dictionary
 * word. Unlike hspell_check_word() the walk does not stop at the first
 * match; enumf is called for each one.
 *
 * dict:  dictionary passed to lookup().
 * word:  the word to analyse.
 * enumf: callback, invoked as enumf(word, w, preflen, n->mask), where w is
 *        the current position inside word.
 *
 * Returns -1 if the word contains no character in 'à'..'ú', otherwise the
 * number of times enumf was called.
 */
int hspell_enum_splits(struct dict_radix *dict, const char *word, hspell_word_split_callback_func *enumf) {
	int preflen=0, count=0;
	int hashebrew;
	const char *w=word;
	struct prefix_node *n;
	/* ignore empty words: */
	hashebrew=0;
	while(*w){
		if(*w>='à' && *w<='ú'){
			hashebrew=1;
			break;
		}
		preflen++;
		w++;
	}
	if(!hashebrew) return -1; /* ignore empty words */
	n=prefix_tree;
	if(hspell_debug) fprintf(stderr,"enum_splits looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words like
		 * ä"ùèéç". or äéãéòä ù"äîéãò...".
		 * See the Academy's punctuation rules (see ìùåððå ìòí, èáú,
		 * úùñ"á) for an explanation of this rule (we probably don't
		 * support here everything they suggested; in particular I
		 * don't recognize a single quote as valid form of merchaot).
		 */
		if(*w=='"'){
			preflen++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" branch below is the normal check.
		 */
		if(n!=prefix_tree && *w=='å' && w[-1]!='å'){
			if(w[1]=='å'){
				if(w[2]!='å' && (lookup(dict,w+1) & n->mask)){
					/* step over the doubled waw so that w points
					 * at the word that was actually looked up */
					w++;
					/* for example: äååòã */
					if(hspell_debug) fprintf(stderr,"found %s: double waw.\n",w);
					/* NOTE(review): w advances twice in this branch
					 * but preflen only once - confirm intended. */
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'à'];
					w++;
					count++;
					continue;
				} else if(lookup(dict,w) & n->mask){
					/* for example: äååéí */
					if(hspell_debug) fprintf(stderr,"found %s: nondouble waw.\n",w);
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'à'];
					w++;
					count++;
					continue;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "enum_splits: tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) {
				enumf(word, w, preflen++, n->mask);
				/* NOTE(review): *w is not range-checked before
				 * indexing here; this assumes lookup() only
				 * matches text starting in 'à'..'ú' - confirm. */
				n=n->next[*w-'à'];
				w++;
				count++;
				continue;
			} /* found word! */
		}
		/* try the next prefix... */
		if(*w>='à' && *w<='ú'){
			n=n->next[*w-'à'];
			preflen++;
			w++;
		} else {
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		enumf(word, w, preflen, n->mask);
		count++;
	} /* else return 0; unrecognized (mis-spelled) word */
	if (hspell_debug) fprintf(stderr, "enum_splits found %d splits\n", count);
	return count;
}

/* try to find corrections for word */
/*
 * Builds candidate spellings of w in a local buffer and adds every
 * candidate that hspell_check_word() accepts to cl with corlist_add().
 *
 * NOTE(review): buf holds 30 bytes. strncpy() below does not terminate buf
 * when strlen(w) >= sizeof(buf), buf[len-1] is then written or read past
 * the copied text, and w[len-1] is read even when len is 0 - confirm that
 * callers bound the word length and never pass an empty word.
 */
void hspell_trycorrect(struct dict_radix *dict, const char *w, struct corlist *cl) {
	char buf[30];
	int i;
	int len=strlen(w), preflen;
	/* groups of letters; the replacement code below substitutes a letter
	 * by the other members of its group */
	static char *similar[] = {"äòà", "âä", "ëç", "úè", "öñ", "ùñ", "ë÷", "áå", "ôá"};
/* Check the candidate currently in buf and record it if it is accepted.
 * The macro is not #undef'd in the visible text. */
#define TRYBUF if(hspell_check_word(dict, buf, &preflen)) corlist_add(cl, buf)
	/* try to add a missing em kri'a - yud or vav */
	/* NOTE(review): the text between "for(i=1;i" and "0 && w[i]==" is
	 * missing from this copy of the file (the declaration of g and the
	 * loops that the four closing braces below belong to are not
	 * visible). Restore this section from the upstream source; it is
	 * reproduced here exactly as found. */
	for(i=1;i0 && w[i]=='å' && w[i+1]=='å')
		snprintf(buf,sizeof(buf), "%.*s%c%s",i,w,*g,w+i+2);
	else if(*g=='å')
		snprintf(buf,sizeof(buf), "%.*såå%s",i,w,w+i+1);
	else
		snprintf(buf,sizeof(buf), "%.*s%c%s",i,w,*g,w+i+1);
	TRYBUF;
	}
	}
	}
	}
	/* try to replace a non-final letter at the end of the word by its
	 * final form and vice versa (useful check for abbreviations) */
	strncpy(buf,w,sizeof(buf));
	switch(w[len-1]){
	case 'ê': buf[len-1]='ë'; break;
	case 'í': buf[len-1]='î'; break;
	case 'ï': buf[len-1]='ð'; break;
	case 'õ': buf[len-1]='ö'; break;
	case 'ó': buf[len-1]='ô'; break;
	case 'ë': buf[len-1]='ê'; break;
	case 'î': buf[len-1]='í'; break;
	case 'ð': buf[len-1]='ï'; break;
	case 'ö': buf[len-1]='õ'; break;
	case 'ô': buf[len-1]='ó'; break;
	}
	/* only try the candidate if the switch actually changed the letter */
	if(buf[len-1]!=w[len-1]){
		TRYBUF;
	}
	/* try to make the word into an acronym (add " before last character) */
	if(len>=2){
		snprintf(buf,sizeof(buf), "%.*s\"%c",len-1,w,w[len-1]);
		TRYBUF;
	}
	/* try to make the word into an abbreviation (add ' at the end) */
	snprintf(buf,sizeof(buf), "%s'",w);
	TRYBUF;
}

/* hspell_init() reads the dictionary and initializes the necessary data structures, into the an allocated dictp structure. hspell_init() returns 0 on success, or negative numbers on errors: -1: cannot read dictionary.
*/ int hspell_init(struct dict_radix **dictp, int flags){ int ret; ret=load_data(dictp); if(ret<0) return ret; build_prefix_tree(flags & HSPELL_OPT_HE_SHEELA); #ifdef USE_LINGINFO if (flags & HSPELL_OPT_LINGUISTICS) { if (!linginfo_init(DICTIONARY_BASE)) return -1; } #endif return 0; } /* TODO: hspell_init should use a new "hspell_context" structure, not dict_radix. Because we might want to add more things like user dictionary. The prefix tree should also sit in the hspell_context, instead of being a global variable: the current mishmash of globals and non-globals is ugly. Linginfo's global variables (see linginfo_init and linginfo_free) should also be in this context. */ /* hspell_uninit() undoes the effects of hspell_init, freeing memory that was allocated during initialization. The dict pointer passed is no longer valid after this call, and should not be used (i.e., hspell_uninit() has similar semnatics to free()). */ void hspell_uninit(struct dict_radix *dict) { delete_dict_radix(dict); /* free prefix tree. Too bad this is a global variable, and not something in a "context" given to us as a paramter. */ free_prefix_tree(prefix_tree); prefix_tree=0; #ifdef USE_LINGINFO linginfo_free(); #endif }