/* ports//hebrew/hspell/work/hspell-0.8/libhspell.c */

/* Copyright (C) 2003-2004 Nadav Har'El and Dan Kenigsberg */

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dict_radix.h"

#include "hspell.h"
#include "linginfo.h"

/* Debug flag: when non-zero, the functions in this file print diagnostic
 * messages to stderr. It is initialized to 0 (quiet).
 * TODO: compile out debug code in production version... */
int hspell_debug=0;

/* Load the data files into a newly allocated dictionary.
 *
 * On success the new dictionary is stored in *dictp and 0 is returned.
 * Returns -1 if the dictionary could not be allocated or read; in that case
 * the partially built dictionary is released again and *dictp is set to
 * NULL, so the caller is left with nothing to free.
 *
 * When hspell_debug is set, progress and the elapsed clock() time are
 * reported on stderr.
 */
static int
load_data(struct dict_radix **dictp)
{
	/* t1 is initialized so that it is never read uninitialized, even if
	 * hspell_debug is switched on while this function is running. */
	clock_t t1=0, t2;
	if(hspell_debug){
		fprintf(stderr,"Loading data files... ");
		t1=clock();
	}

	*dictp = new_dict_radix();
	if(!*dictp){
		/* could not allocate the dictionary object */
		return -1;
	}
#ifndef DICTIONARY_BASE
#define DICTIONARY_BASE "./hebrew.wgz"
#endif
	if(!read_dict(*dictp, DICTIONARY_BASE)){
		/* don't leak the dictionary object when reading fails, and
		 * don't leave the caller with a pointer to it. */
		delete_dict_radix(*dictp);
		*dictp = NULL;
		return -1;
	}

	if(hspell_debug){
		t2=clock();
		fprintf(stderr,"done (%d ms).\n",
				(int)((t2-t1)/(CLOCKS_PER_SEC/1000)));
	}
	return 0;
}

/*
 * The prefix tree "prefix_tree" is built by build_prefix_tree, from a list of
 * known combinations of prefixes. Each prefix also has a mask that determines
 * to what kind of words it can be applied.
 *
 * The list of known prefixes and masks were defined in the prefixes[] and
 * masks[] arrays in prefixes.c. That file is automatically generated by the
 * genprefixes.pl program.
 */

#include "prefixes.c"

/* A node of the prefix tree. The path of letters leading from the root
 * (prefix_tree) to a node spells a prefix; 'mask' describes that prefix.
 */
struct prefix_node {
	/* if a prefix has a certain 'mask', and lookup on a word returns
	 * 'val' (a bitmask of prefixes allowed for it), our prefix is
	 * allowed on this word if and only if (mask & val)!=0.
	 *
	 * This means that 'mask' defines the bits that this prefix "supplies"
	 * and the 'val' defined for a word is the bits this word insists on
	 * getting at least one of (i.e., val is the list of types of
	 * prefixes that are allowed for this word).
	 */
	int mask;
	/* One child slot per letter in the range 'à'..'ú', indexed by
	 * (letter - 'à'). Nodes are created with calloc(), so a slot stays
	 * zero unless build_prefix_tree() created a child for that letter.
	 */
	struct prefix_node *next['ú'-'à'+1];
};
/* Root of the prefix tree: filled in by build_prefix_tree(), released and
 * reset to 0 by hspell_uninit(). It is 0 while no tree has been built. */
static struct prefix_node *prefix_tree = 0;

/*
 * build_prefix_tree() inserts every known prefix into the global
 * prefix_tree, creating nodes as needed, and records each prefix's mask in
 * the node that the prefix leads to.
 *
 * allow_he_hasheela: if non-zero the prefixes_H[]/masks_H[] tables are used,
 *                    otherwise prefixes_noH[]/masks_noH[]. The prefix table
 *                    is terminated by a null entry.
 *
 * If a node cannot be allocated the function stops early and leaves the
 * tree as built so far (instead of dereferencing a null pointer). The
 * partial tree is still a valid tree, so it can be searched and later
 * released with free_prefix_tree().
 */
static void
build_prefix_tree(int allow_he_hasheela){
	int i;
	const char *p;
	struct prefix_node **n;
	char **prefixes;
	int *masks;
	if(allow_he_hasheela){
		prefixes=prefixes_H;
		masks=masks_H;
	} else {
		prefixes=prefixes_noH;
		masks=masks_noH;
	}

	for(i=0; prefixes[i]; i++){
		p=prefixes[i];
		n=&prefix_tree;
		if(hspell_debug)
			fprintf(stderr,"prefix %s ",p);
		while(*p){
			if(!(*n)){
				*n=calloc(1,sizeof(**n));
				if(!(*n)){
					if(hspell_debug)
						fprintf(stderr,"out of memory\n");
					return;
				}
			}
			/* step down to the child slot of the current letter */
			n=& ((*n)->next[*p-'à']);
			p++;
		}
		/* define the mask (making sure the node exists). */
		if(!*n){
			*n=calloc(1,sizeof(**n));
			if(!*n){
				if(hspell_debug)
					fprintf(stderr,"out of memory\n");
				return;
			}
		}
		(*n)->mask=masks[i];

		if(hspell_debug)
			fprintf(stderr,"mask=%d\n",(*n)->mask);
	}
}

/* Release the subtree rooted at 'n': all of its children are released
 * recursively first, and then the node itself. A null 'n' is accepted and
 * ignored, which also ends the recursion at empty child slots.
 */
static void
free_prefix_tree(struct prefix_node *n)
{
	const size_t nslots = sizeof(n->next)/sizeof(n->next[0]);
	size_t slot;

	if(n){
		for(slot=0; slot<nslots; slot++){
			free_prefix_tree(n->next[slot]);
		}
		free(n);
	}
}


/*
 * hspell_check_word() decides whether 'word' is acceptable, by trying the
 * splits of it into a known prefix (walked through prefix_tree) followed by
 * a word found by lookup() whose value shares a bit with the prefix's mask.
 *
 * dict:    dictionary passed on to lookup().
 * word:    the NUL-terminated word to check.
 * preflen: output; reset to 0 on entry and then incremented once for each
 *          character of 'word' the scan steps over (leading characters
 *          outside the 'à'..'ú' range, '"' marks and prefix letters), so on
 *          return it is the offset in 'word' at which the scan stopped.
 *
 * Returns 1 if the word is accepted, 0 if it is unrecognized.
 */
int
hspell_check_word(struct dict_radix *dict, const char *word, int *preflen)
{
	int hashebrew;
	const char *w=word;
	struct prefix_node *n;
	*preflen = 0;

	/* ignore empty words: skip leading characters outside the 'à'..'ú'
	 * range; a word with no character in that range at all counts as
	 * empty. */
	hashebrew=0;
	while(*w){
		if(*w>='à' && *w<='ú'){
			hashebrew=1;
			break;
		}
		(*preflen)++;
		w++;
	}
	if(!hashebrew)
		return 1; /* ignore (accept) empty words */


	/* Walk the prefix tree one letter at a time. At each step 'n' is the
	 * node of the prefix consumed so far and 'w' is the remainder of the
	 * word, which is tried as a dictionary word. */
	n=prefix_tree;
	if(hspell_debug)
		fprintf(stderr,"looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words like
		 * ä"ùèéç".  or äéãéòä ù"äîéãò...".
		 * See the Academy's punctuation rules (see ìùåððå ìòí, èáú,
		 * úùñ"á) for an explanation of this rule (we probably don't
		 * support here everything they suggested; in particular I
		 * don't recognize a single quote as valid form of merchaot).
		 */
		if(*w=='"'){
			(*preflen)++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * of a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" branch below is the normal check.
		 *
		 * Note that when the first condition holds but none of the
		 * inner lookups succeeds (including the case of a single,
		 * non-doubled waw), nothing is accepted here and control
		 * falls through to "try the next prefix" below.
		 */
		if(n!=prefix_tree && *w=='å' && w[-1]!='å'){
			if(w[1]=='å'){
				if(w[2]!='å' && (lookup(dict,w+1) & n->mask)){
					/* for example: äååòã */
					if(hspell_debug)
						fprintf(stderr,"found %s: double waw.\n",w);
					return 1;
				} else if(lookup(dict,w) & n->mask){
					/* for example: äååéí */
					if(hspell_debug)
						fprintf(stderr,"found %s: nondouble waw.\n",w);
					return 1;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) return 1; /* found word! */
		}

		/* try the next prefix... (only a letter in the 'à'..'ú'
		 * range can extend the prefix; anything else ends the walk) */
		if(*w>='à' && *w<='ú'){
			n=n->next[*w-'à'];
			(*preflen)++;
			w++;
		} else {
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		return 1;
	} else
		return 0; /* unrecognized (mis-spelled) word */
}

/* This function copies, in a less than intelligent fashion, Nadav's code
 * from hspell_check_word. TODO: use the same code for both functions.
 *
 * hspell_enum_splits() walks 'word' the same way hspell_check_word() does,
 * but instead of returning at the first acceptable prefix+word split it
 * calls enumf(word, w, preflen, n->mask) for each split it finds and keeps
 * walking.
 *
 * dict:  dictionary passed on to lookup().
 * word:  the NUL-terminated word to split.
 * enumf: callback invoked once per split found; it is called
 *        unconditionally, so it must not be null.
 *
 * Returns -1 if 'word' has no character in the 'à'..'ú' range; otherwise
 * the number of times enumf was called (possibly 0).
 */
int hspell_enum_splits(struct dict_radix *dict, const char *word, 
	hspell_word_split_callback_func *enumf)
{
	int preflen=0, count=0;

	int hashebrew;
	const char *w=word;
	struct prefix_node *n;

	/* ignore empty words: skip leading characters outside the 'à'..'ú'
	 * range; a word with no character in that range counts as empty. */
	hashebrew=0;
	while(*w){
		if(*w>='à' && *w<='ú'){
			hashebrew=1;
			break;
		}
		preflen++;
		w++;
	}
	if(!hashebrew)
		return -1; /* ignore empty words */

	n=prefix_tree;
	if(hspell_debug)
		fprintf(stderr,"enum_splits looking %s\n",w);
	while(*w && n){
		/* eat up the " if necessary, to recognize words like
		 * ä"ùèéç".  or äéãéòä ù"äîéãò...".
		 * See the Academy's punctuation rules (see ìùåððå ìòí, èáú,
		 * úùñ"á) for an explanation of this rule (we probably don't
		 * support here everything they suggested; in particular I
		 * don't recognize a single quote as valid form of merchaot).
		 */
		if(*w=='"'){
			preflen++;
			w++;
			continue;
		}
		/* The first case here is the Academia's "ha-ktiv hasar
		 * ha-niqqud" rule of doubling a consonant waw in the middle
		 * of a word, unless it's already next to a waw. When adding a
		 * prefix, any initial waw in a word will necessarily
		 * become a consonant waw in the middle of the word.
		 * The "else" branch below is the normal check.
		 *
		 * In every branch that reports a split, the walk then steps
		 * over the current letter and continues with the next
		 * iteration, so that longer prefixes are tried as well.
		 * NOTE(review): in those steps n->next[] is indexed with
		 * *w-'à' without the range check used in "try the next
		 * prefix" below; presumably lookup() only matches words that
		 * start with a letter in range - confirm.
		 */
		if(n!=prefix_tree && *w=='å' && w[-1]!='å'){
			if(w[1]=='å'){
				if(w[2]!='å' && (lookup(dict,w+1) & n->mask)){
					/* skip the doubled waw, so that w is the
					 * word that was found by lookup() */
					w++;
					/* for example: äååòã */
					if(hspell_debug)
						fprintf(stderr,"found %s: double waw.\n",w);
					/* NOTE(review): w is advanced twice in
					 * this branch but preflen only once, so
					 * from here on preflen is one less than
					 * w-word - confirm this is intended. */
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'à']; w++;
					count++;
					continue;
				} else if(lookup(dict,w) & n->mask){
					/* for example: äååéí */
					if(hspell_debug)
						fprintf(stderr,"found %s: nondouble waw.\n",w);
					enumf(word, w, preflen++, n->mask);
					n=n->next[*w-'à']; w++;
					count++;
					continue;
				}
			}
		} else {
			if (hspell_debug) fprintf (stderr, "enum_splits: tried %s mask %d prefmask %d\n",w,lookup(dict,w), n->mask);
			if(lookup(dict,w) & n->mask) {
				enumf(word, w, preflen++, n->mask);
				n=n->next[*w-'à']; w++;
				count++;
				continue;
			} /* found word! */
		}

		/* try the next prefix... (only a letter in the 'à'..'ú'
		 * range can extend the prefix; anything else ends the walk) */
		if(*w>='à' && *w<='ú'){
			n=n->next[*w-'à'];
			preflen++;
			w++;
		} else {
			break;
		}
	}
	if(n && !*w){
		/* allow prefix followed by nothing (or a non-word like
		 * number, maqaf, etc.) */
		if(hspell_debug) fprintf(stderr,"Accepting empty word\n");
		enumf(word, w, preflen, n->mask);
		count++;
	} /* else
		return 0;  unrecognized (mis-spelled) word */
	if (hspell_debug) fprintf(stderr, "enum_splits found %d splits\n", count);
	return count;
}



/* try to find corrections for word
 *
 * hspell_trycorrect() builds candidate spellings of 'w' (by inserting or
 * removing single letters, replacing similar-sounding letters, swapping a
 * final/non-final last letter, and adding " or ' marks), and passes each
 * candidate that hspell_check_word() accepts to corlist_add(cl, ...).
 *
 * dict: dictionary used for checking the candidates.
 * w:    the NUL-terminated word to correct. An empty word yields no
 *       corrections.
 * cl:   correction list that receives the accepted candidates.
 *
 * Candidates are built in a fixed-size buffer; snprintf() silently
 * truncates candidates that do not fit in it.
 */
void
hspell_trycorrect(struct dict_radix *dict, const char *w, struct corlist *cl)
{
	char buf[30];
	int i;
	int len=strlen(w), preflen;
	static const char *const similar[] = {"äòà", "âä", "ëç", "úè", "öñ", "ùñ",
				  "ë÷", "áå", "ôá"};

	/* Nothing to correct in an empty word; returning here also keeps
	 * the w[len-1] accesses below inside the string. */
	if(len<=0)
		return;

#define TRYBUF do { if(hspell_check_word(dict, buf, &preflen)) \
			corlist_add(cl, buf); } while(0)
	/* try to add a missing em kri'a - yud or vav */
	for(i=1;i<len;i++){
		snprintf(buf,sizeof(buf),"%.*sé%s",i,w,w+i);
		TRYBUF;
		snprintf(buf,sizeof(buf),"%.*så%s",i,w,w+i);
		TRYBUF;
	}
	/* try to remove an em kri'a - yud or vav */
	/* NOTE: in hspell.pl the loop was from i=0 to i<len... */
	for(i=1;i<len-1;i++){
		if(w[i]=='é' || w[i]=='å'){
			snprintf(buf,sizeof(buf),"%.*s%s",i,w,w+i+1);
			TRYBUF;
		}
	}
	/* try to add or remove an aleph (is that useful?) */
	/* TODO: don't add an aleph next to yud or non-double vav,
	 * as it can't be an em kria there? */
	for(i=1;i<len;i++){
		snprintf(buf,sizeof(buf),"%.*sà%s",i,w,w+i);
		TRYBUF;
	}
	for(i=1;i<len-1;i++){
		if(w[i]=='à'){
			snprintf(buf,sizeof(buf),"%.*s%s",i,w,w+i+1);
			TRYBUF;
		}
	}
	/* try to replace similarly sounding (for certain people) letters:
	 */
	for(i=0;i<len;i++){
		size_t group;
		const char *g;
		for(group=0; group< (sizeof(similar)/sizeof(similar[0]));
				group++){
			/* look for w[i] in this group */
			for(g=similar[group];*g && *g!=w[i];g++)
				;
			if(!*g)
				continue;
			/* character in group - try the other ones
			 * in this group! */
			for(g=similar[group];*g;g++){
				if(*g==w[i]) continue;
				if(i>0 && w[i]=='å' && w[i+1]=='å')
					/* a double waw inside the word is
					 * replaced as a single letter */
					snprintf(buf,sizeof(buf),
					    "%.*s%c%s",i,w,*g,w+i+2);
				else if(*g=='å')
					/* a letter replaced by waw gets a
					 * double waw */
					snprintf(buf,sizeof(buf),
					    "%.*såå%s",i,w,w+i+1);
				else
					snprintf(buf,sizeof(buf),
					   "%.*s%c%s",i,w,*g,w+i+1);
				TRYBUF;
			}
		}
	}
	/* try to replace a non-final letter at the end of the word by its
	 * final form and vice versa (useful check for abbreviations).
	 * This step edits buf[len-1] directly, so it is only done when the
	 * whole word, including its terminating NUL, fits in buf. */
	if((size_t)len < sizeof(buf)){
		memcpy(buf,w,(size_t)len+1);
		switch(w[len-1]){
			case 'ê': buf[len-1]='ë'; break;
			case 'í': buf[len-1]='î'; break;
			case 'ï': buf[len-1]='ð'; break;
			case 'õ': buf[len-1]='ö'; break;
			case 'ó': buf[len-1]='ô'; break;
			case 'ë': buf[len-1]='ê'; break;
			case 'î': buf[len-1]='í'; break;
			case 'ð': buf[len-1]='ï'; break;
			case 'ö': buf[len-1]='õ'; break;
			case 'ô': buf[len-1]='ó'; break;
		}
		if(buf[len-1]!=w[len-1]){ TRYBUF; }
	}
	/* try to make the word into an acronym (add " before last character) */
	if(len>=2){
		snprintf(buf,sizeof(buf), "%.*s\"%c",len-1,w,w[len-1]);
		TRYBUF;
	}
	/* try to make the word into an abbreviation (add ' at the end) */
	snprintf(buf,sizeof(buf), "%s'",w);
	TRYBUF;
}

/* hspell_init() reads the dictionary and initializes the necessary data
   structures. The newly allocated dictionary is returned through dictp.

   flags is a bitmask of HSPELL_OPT_* options:
   HSPELL_OPT_HE_SHEELA selects which prefix tables the prefix tree is
   built from, and (only when compiled with USE_LINGINFO)
   HSPELL_OPT_LINGUISTICS additionally calls linginfo_init().

   hspell_init() returns 0 on success, or negative numbers on errors:
   -1: cannot read dictionary (or linginfo_init() failed).

   When linginfo_init() fails, the dictionary and the prefix tree that were
   already set up are released again and *dictp is set to NULL, so a failed
   call does not leak them.
*/
int
hspell_init(struct dict_radix **dictp, int flags){
	int ret;
	ret=load_data(dictp);
	if(ret<0) return ret;
	build_prefix_tree(flags & HSPELL_OPT_HE_SHEELA);
#ifdef USE_LINGINFO
	if (flags & HSPELL_OPT_LINGUISTICS) {
		if (!linginfo_init(DICTIONARY_BASE)) {
			/* undo what was set up above before reporting
			 * the failure */
			delete_dict_radix(*dictp);
			*dictp = NULL;
			free_prefix_tree(prefix_tree);
			prefix_tree = 0;
			return -1;
		}
	}
#endif
	return 0;
}

/* TODO: hspell_init should use a new "hspell_context" structure, not
   dict_radix. Because we might want to add more things like user dictionary.
   The prefix tree should also sit in the hspell_context, instead of
   being a global variable: the current mishmash of globals and non-globals
   is ugly.
   Linginfo's global variables (see linginfo_init and linginfo_free)
   should also be in this context.
*/

/* hspell_uninit() undoes the effects of hspell_init, freeing memory that
   was allocated during initialization. The dict pointer passed is no
   longer valid after this call, and should not be used (i.e., hspell_uninit()
   has similar semnatics to free()).
*/
void
hspell_uninit(struct dict_radix *dict)
{
	delete_dict_radix(dict);
	/* free prefix tree. Too bad this is a global variable, and not
	   something in a "context" given to us as a paramter. */
	free_prefix_tree(prefix_tree);
	prefix_tree=0;
#ifdef USE_LINGINFO
	linginfo_free();
#endif
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */