/* index.c -- * Created: Wed Oct 9 14:52:23 1996 by faith@dict.org * Revised: Tue Apr 23 09:14:43 2002 by faith@dict.org * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org) * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 1, or (at your option) any * later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "dictP.h" #include "dictzip.h" #include "index.h" #include "include_regex.h" #include "strategy.h" #include "str.h" #ifdef USE_PLUGIN #include "plugin.h" #endif #include #include #include #if HAVE_WCTYPE_H #include #endif #if HAVE_WCHAR_H #include #endif #include #include extern int mmap_mode; #define FIND_PREV(begin, pt) while (pt > begin && pt [-1] != '\n') --pt; #define FIND_NEXT(pt, end) while (pt < end && *pt++ != '\n'); #define MAXWORDLEN 512 #define BMH_THRESHOLD 3 /* When to start using Boyer-Moore-Hoorspool */ #ifndef SYSTEM_UTF8_FUNCS /* defaults to run in UTF-8 mode */ int utf8_mode=1; /* dictd uses UTF-8 dictionaries */ #else /* defaults to run in ASCII mode */ int utf8_mode=0; /* dictd uses UTF-8 dictionaries */ #endif int bit8_mode = 0; /* dictd uses 8-BIT dictionaries */ int optStart_mode = 1; /* Optimize search range for constant start */ dictConfig *DictConfig; int _dict_comparisons; static int isspacealnumtab [UCHAR_MAX + 1]; static int isspacealnumtab_allchars[UCHAR_MAX + 1]; static int isspacepuncttab [UCHAR_MAX + 1]; static int char2indextab [UCHAR_MAX + 2]; static int index2chartab [UCHAR_MAX + 2]; static int tolowertab [UCHAR_MAX + 1]; char global_alphabet_8bit [UCHAR_MAX + 2]; char global_alphabet_ascii [UCHAR_MAX + 2]; static int chartab [UCHAR_MAX + 1]; static int charcount = 0; static int define_or_match = 0; /* 1 if define */ /* #define isspacealnum(x) (isspacealnumtab[(unsigned char)(x)]) */ #define c2i(x) (char2indextab[(unsigned char)(x)]) #define i2c(x) (index2chartab[(unsigned char)(x)]) #define c(x) (((x) < charcount) ? chartab[(unsigned char)(x)] : 0) /* compares two 8bit strings (containing one character) according to locate */ static int dict_table_init_compare_strcoll (const void *a, const void *b) { return strcoll(*(const char **)a, *(const char **)b); } /* compares two strings (containing one character) converting ASCII character to lower case. */ static int dict_table_init_compare_utf8 (const void *a, const void *b) { int c1 = ** (const unsigned char **) a; int c2 = ** (const unsigned char **) b; if (c1 <= CHAR_MAX) c1 = tolower (c1); if (c2 <= CHAR_MAX) c2 = tolower (c2); return c1 - c2; } static void dict_make_global_alphabet (void) { int i; int j = 0; int ch; for (i=0; i < charcount; ++i){ global_alphabet_8bit [i] = ch = c(i); if (ch < 128) global_alphabet_ascii [j++] = ch; } global_alphabet_8bit [charcount] = 0; global_alphabet_ascii [j] = 0; PRINTF(DBG_SEARCH, ( "global_alphabet_8bit = '%s'\n", global_alphabet_8bit)); PRINTF(DBG_SEARCH, ( "global_alphabet_ascii = '%s'\n", global_alphabet_ascii)); } static void dict_table_init(void) { int i; unsigned char s[2 * UCHAR_MAX + 2]; unsigned char *p[UCHAR_MAX + 1]; for (i = 0; i <= UCHAR_MAX; i++) { if (isspace(i) || isalnum(i) || (utf8_mode && i >= 0x80)){ isspacealnumtab [i] = 1; }else{ isspacealnumtab [i] = 0; } tolowertab [i] = tolower (i); if (i >= 0x80){ if (utf8_mode || (!utf8_mode && !bit8_mode)){ /* utf-8 or ASCII mode */ tolowertab [i] = i; } } if (utf8_mode && i >= 0x80){ isspacepuncttab [i] = 0; }else{ isspacepuncttab [i] = isspace(i) || ispunct(i); } isspacealnumtab_allchars [i] = 1; } isspacepuncttab['\t'] = isspacepuncttab['\n'] = 1; /* special */ isspacealnumtab['\t'] = isspacealnumtab['\n'] = 0; /* special */ isspacealnumtab_allchars['\t'] = isspacealnumtab_allchars['\n'] = 0; /* special */ charcount = 0; for (i = 0; i <= UCHAR_MAX; i++){ if (islower (i) || (utf8_mode && i >= 0xC0)) chartab [charcount++] = i; } /* Populate an array with length-1 strings */ for (i = 0; i <= UCHAR_MAX; i++) { if (!isupper (i)){ s[2 * i] = i; }else{ s[2 * i] = 0; } s[2 * i + 1] = '\0'; p[i] = &s[2 * i]; } /* Sort those strings in the locale */ if (utf8_mode) qsort(p, UCHAR_MAX + 1, sizeof(p[0]), dict_table_init_compare_utf8); else qsort(p, UCHAR_MAX + 1, sizeof(p[0]), dict_table_init_compare_strcoll); /* Extract our unordered arrays */ for (i = 0; i <= UCHAR_MAX; i++) { char2indextab[*p[i]] = i; index2chartab[i] = *p[i]; } char2indextab[UCHAR_MAX + 1] = UCHAR_MAX; index2chartab[UCHAR_MAX + 1] = UCHAR_MAX; /* we may index here in */ if (dbg_test(DBG_SEARCH)) { for (i = 0; i <= UCHAR_MAX; ++i){ if (p [i][0] <= CHAR_MAX){ PRINTF (DBG_SEARCH,("sorted list: %s\n", p [i])); }else{ PRINTF (DBG_SEARCH,("sorted list: %i\n", (unsigned char) p [i] [0])); } } } if (dbg_test(DBG_SEARCH)) { for (i = 0; i < charcount; i++) PRINTF(DBG_SEARCH,("%03d %d ('%c')\n", i, c(i), c(i))); for (i = 0; i <= UCHAR_MAX; i++) PRINTF(DBG_SEARCH,("c2i(%d/'%c') = %d; i2c(%d) = %d/'%c'\n", i, (char) isgraph(i) ? i : '.', c2i(i), c2i(i), i2c(c2i(i)), (char) i2c(c2i(i)) ? i2c(c2i(i)) : '.')); } dict_make_global_alphabet (); } static int compare_allchars( const char *word, const char *start, const char *end ) { int c1, c2; int result; PRINTF(DBG_SEARCH,(" We are inside index.c:compare_allchars\n")); while (*word && *word != '\t' && start < end && *start != '\t') { // if (!isspacealnum(*start)) { // ++start; // continue; // } #if 0 if (isspace( (unsigned char) *start )) c2 = ' '; else c2 = * (unsigned char *) start; if (isspace( (unsigned char) *word )) c1 = ' '; else c1 = * (unsigned char *) word; #else c2 = * (unsigned char *) start; c1 = * (unsigned char *) word; #endif if (c1 != c2) { if (c1 < c2){ result = -2; }else{ result = 1; } if (dbg_test(DBG_SEARCH)){ PRINTF(DBG_SEARCH,(" result = %d (%i != %i) \n", result, c1, c2)); } return result; } ++word; ++start; } result = (*word && *word != '\t' ? 1 : ((*start != '\t') ? -1 : 0)); PRINTF(DBG_SEARCH,(" result = %d\n", result)); return result; } static int compare_alnumspace( const char *word, const dictIndex *dbindex, const char *start, const char *end ) { int c1, c2; int result; int ret; assert (dbindex); PRINTF(DBG_SEARCH,(" We are inside index.c:compare_alnumspace\n")); /* FIXME. Optimize this inner loop. */ while (*word && *word != '\t' && start < end && *start != '\t') { if (!dbindex -> isspacealnum[* (const unsigned char *) start]) { ++start; continue; } c1 = (unsigned char) *word; c2 = (unsigned char) *start; if (!dbindex -> flag_casesensitive){ c2 = tolowertab [c2]; c1 = tolowertab [c1]; } if (c1 != c2) { if (utf8_mode){ if ( (c1 <= CHAR_MAX ? c2i (c1) : c1) < (c2 <= CHAR_MAX ? c2i (c2) : c2)) { result = -2; }else{ result = 1; } }else{ result = (c2i (c1) < c2i (c2) ? -2 : 1); } if (dbg_test(DBG_SEARCH)){ if (utf8_mode){ PRINTF (DBG_SEARCH,( " result = %d (%i != %i) \n", result, c1, c2)); }else{ PRINTF (DBG_SEARCH,( " result = %d ('%c'(c2i=%i) != '%c'(c2i=%i)) \n", result, c1, c2i (c1), c2, c2i (c2))); } } return result; } ++word; ++start; } while ( *start != '\t' && !dbindex -> isspacealnum[* (const unsigned char *) start]) { ++start; } ret = *word && *word != '\t' ? 1 : ((*start != '\t') ? -1 : 0); PRINTF(DBG_SEARCH,(" result = %d\n", ret)); return ret; } /* Compare: Return -2 if word < word-pointed-to-by-start -1 if word prefix word-pointed-to-by-start 0 if word == word-pointed-to-by-start 1 if word > word-pointed-to-by-start 2 if some kind of error happened The comparison must be the same as "sort -df" phone directory order: ignore all characters except letters, digits, and blanks; fold upper case into the equivalent lowercase. word already has all of the illegal characters removed */ static int compare( const char *word, const dictIndex *dbindex, const char *start, const char *end ) { char buf[80], *d; const char *s; assert (dbindex); if (dbg_test(DBG_SEARCH)) { for ( d = buf, s = start; d - buf + 1 < (int) sizeof (buf) && s < end && *s != '\t';) { *d++ = *s++; } *d = '\0'; PRINTF(DBG_SEARCH, ("compare \"%s\" with \"%s\" (sizes: %lu and %lu)\n", word, buf, (unsigned long) strlen( word ), (unsigned long) strlen( buf ) )); } ++_dict_comparisons; /* counter for profiling */ if (dbindex && (dbindex -> flag_allchars || dbindex -> flag_utf8 || dbindex -> flag_8bit)) { return compare_allchars( word, start, end ); }else{ return compare_alnumspace( word, dbindex, start, end ); } } static const char *binary_search( const char *word, const dictIndex *dbindex, const char *start, const char *end ) { const char *pt; assert (dbindex); PRINTF(DBG_SEARCH,("%s %p %p\n", word, start, end)); pt = start + (end-start)/2; FIND_PREV(start, pt); while (start < end) { switch (compare( word, dbindex, pt, end )){ case -2: case -1: case 0: end = pt; break; case 1: start = pt; FIND_NEXT(start, end) break; case 2: return end; /* ERROR!!! */ default: assert (0); } PRINTF(DBG_SEARCH,("%s %p %p\n",word,start,end)); pt = start + (end-start)/2; FIND_PREV(start, pt); } return start; } static const char *binary_search_8bit( const char *word, const dictIndex *dbindex, const char *start, const char *end ) { char buf[80], *d; const char *s; const char *pt; int cmp; assert (dbindex); PRINTF(DBG_SEARCH,("word/start/end %s/%p/%p\n",word,start,end)); pt = start + (end-start)/2; FIND_PREV(start, pt); while (start < end) { if (dbg_test(DBG_SEARCH)) { for ( d = buf, s = pt; s < end && *s != '\t' && d - buf + 1 < (int) sizeof (buf);) { *d++ = *s++; } *d = '\0'; printf( "compare \"%s\" with \"%s\" (sizes: %lu and %lu)\n", word, buf, (unsigned long) strlen( word ), (unsigned long) strlen( buf ) ); } if ( dbindex && (dbindex -> flag_utf8 || dbindex -> flag_allchars)) { cmp = compare_allchars ( word, pt, end ); }else{ cmp = compare_alnumspace ( word, dbindex, pt, end ); } switch (cmp){ case -2: case -1: case 0: end = pt; break; case 1: start = pt; FIND_NEXT(start, end) break; case 2: return end; /* ERROR!!! */ default: assert (0); } PRINTF(DBG_SEARCH,("%s %p %p\n",word,start,end)); pt = start + (end-start)/2; FIND_PREV(start, pt); } return start; } static const char *linear_search( const char *word, const dictIndex *dbindex, const char *start, const char *end ) { const char *pt; assert (dbindex); for (pt = start; pt < end;) { switch (compare( word, dbindex, pt, end )) { case -2: return NULL; /* less than and not prefix */ case -1: /* prefix */ case 0: return pt; /* exact */ case 1: break; /* greater than */ case 2: return NULL; /* ERROR!!! */ } FIND_NEXT(pt,end); } return NULL; } static const char *dict_index_search( const char *word, dictIndex *idx ) { const char *start; const char *end; int first, last; assert (idx); /* With optStart: 17071 comparisons, 1000 words 33946 comparisons, 2000 words Without optStart: 20889 comparisons, 1000 words 41668 comparisons, 2000 words Linear: 594527 comparisons, 1000 words 2097035 comparisons, 2000 words */ if (optStart_mode){ first = * (const unsigned char *) word; last = i2c(c2i(first)+1); if (dbg_test(DBG_SEARCH)) { if (!utf8_mode || (last <= CHAR_MAX && first <= CHAR_MAX)) printf("binary_search from %c to %c\n", first, last); else printf("binary_search from %i to %i\n", first, last); } end = idx->optStart [last]; start = idx->optStart [first]; #if 0 fprintf (stderr, "start1 = %p\n", start); fprintf (stderr, "end1 = %p\n", end); #endif }else{ start = idx->start; end = idx->end; } if (end < start) end = idx->end; start = binary_search( word, idx, start, end ); PRINTF(DBG_SEARCH,("binary_search returns %p\n",start)); start = linear_search( word, idx, start, idx->end ); PRINTF(DBG_SEARCH,("linear_search returns %p\n",start)); return start; } static dictWord *dict_word_create( const char *entry, const dictDatabase *database, const dictIndex *dbindex) { int offs_word = 0; int offs_offset = 0; int offs_length = 0; int offset = 0; int word_len = 0; dictWord *dw = xmalloc( sizeof( struct dictWord ) ); const char *pt = entry; char *d; assert (dbindex); assert (pt >= dbindex -> start && pt < dbindex -> end); memset (dw, 0, sizeof (*dw)); for (;pt < dbindex->end && *pt != '\n'; pt++, offset++) { if (*pt == '\t') { if (!offs_offset) offs_offset = offset + 1; else if (!offs_length) offs_length = offset + 1; else if (!offs_word) offs_word = offset + 1; else{ err_internal( __FUNCTION__, "Too many tabs in index entry \"%*.*s\"\n", offs_length, offs_length, entry ); } } } if (!offs_length) err_internal( __FUNCTION__, "Too few tabs in index entry \"%20.20s\"\n", entry ); dw->start = b64_decode_buf (entry + offs_offset, offs_length - offs_offset - 1); if (offs_word > 0){ word_len = offset - offs_word; dw->end = b64_decode_buf (entry + offs_length, offs_word - offs_length - 1); }else{ word_len = offs_offset - 1; dw->end = b64_decode_buf (entry + offs_length, offset - offs_length); } dw->def = NULL; dw->def_size = 0; dw->database = database; /* Apply quoting to word */ dw->word = xmalloc (word_len*2 + 1); entry += offs_word; for (d = (char *)dw->word; word_len--;) { switch (*entry) { case '"': case '\\': *d++ = '\\'; } *d++ = *entry++; } *d = '\0'; return dw; } static int dict_dump_datum( const void *datum ) { dictWord *dw = (dictWord *)datum; printf( "\"%s\" %lu/%lu %p/%i\n", dw->word, dw->start, dw->end, dw->def, dw->def_size ); return 0; } void dict_dump_list( lst_List list ) { lst_iterate( list, dict_dump_datum ); } int dict_destroy_datum( const void *datum ) { dictWord *dw = (dictWord *)datum; if (!datum) return 0; if (dw->word) xfree( (char *)dw->word ); dw->word = NULL; dw->start = 0; dw->end = 0; dw->database = NULL; xfree( dw ); return 0; } void dict_destroy_list( lst_List list ) { lst_iterate( list, dict_destroy_datum ); lst_destroy( list ); } /* returns NULL if limits exceeded */ static dictWord *dict_add_word_to_list ( lst_List l, const dictDatabase *database, const dictIndex *dbindex, const char *pt) { dictWord * datum; assert (l); if (define_or_match){ if (_dict_daemon_limit_defs && lst_length (l) >= _dict_daemon_limit_defs) return NULL; }else{ if (_dict_daemon_limit_matches && lst_length (l) >= _dict_daemon_limit_matches) return NULL; } datum = dict_word_create (pt, database, dbindex); lst_append (l, datum); return datum; } static int dict_search_exact( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex, int uniq_only) { const char *pt = NULL; int count = 0; const char *previous = NULL; assert (dbindex); pt = dict_index_search( word, dbindex ); while (pt && pt < dbindex->end) { if (!compare( word, dbindex, pt, dbindex->end )) { if (!uniq_only || !previous || compare(previous, dbindex, pt, dbindex->end)) { ++count; if (l){ if (!dict_add_word_to_list (l, database, dbindex, previous = pt)) { break; } } } }else{ break; } FIND_NEXT( pt, dbindex->end ); } return count; } enum { BMH_SUBSTRING, BMH_PREFIX, BMH_SUFFIX, BMH_WORD, BMH_FIRST, BMH_LAST, }; static int dict_search_prefix_first( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex, int flag, int skip_count, int item_count) { const char *pt = dict_index_search( word, dbindex ); int count = 0; const char *previous = NULL; int wordlen = strlen (word); int c = 0; assert (dbindex); if (item_count <= 0){ return 0; } while (pt && pt < dbindex->end) { switch (compare( word, dbindex, pt, dbindex->end )) { case -2: return count; case -1: case 0: if (!previous || compare(previous, dbindex, pt, dbindex->end)) { if (flag == BMH_FIRST){ c = (unsigned char) pt [wordlen]; if (c != '\t' && !isspacepuncttab [c]) break; } if (skip_count == 0){ ++count; if (!dict_add_word_to_list (l, database, dbindex, pt)) return count; --item_count; if (!item_count){ return count; } }else{ --skip_count; } } previous = pt; break; case 1: return count; case 2: return count; /* ERROR!!! */ default: assert (0); } FIND_NEXT( pt, dbindex->end ); } return count; } static int dict_search_prefix ( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex, int skip_count, int item_count) { dict_search_prefix_first (l, word, database, dbindex, BMH_PREFIX, skip_count, item_count); } static int dict_search_first ( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex) { dict_search_prefix_first (l, word, database, dbindex, BMH_FIRST, 0, INT_MAX); } static int dict_search_brute( lst_List l, const unsigned char *word, const dictDatabase *database, dictIndex *dbindex, int flag, int patlen ) { const unsigned char *const start = dbindex->start; const unsigned char *const end = dbindex->end; const unsigned char *p, *pt; int count = 0; int result; const char *previous = NULL; int c; assert (dbindex); p = start; while (p < end && !dbindex -> isspacealnum[*p]) ++p; for (; p < end; ++p) { if (*p == '\t') { while (p < end && *p != '\n') ++p; ++p; while (p < end && !dbindex -> isspacealnum[*p]) ++p; } c = *p; if (!dbindex -> flag_casesensitive){ c = tolowertab [c]; } if (c == *word) { result = compare( word, dbindex, p, end ); if (result == -1 || result == 0) { switch (flag){ case BMH_SUBSTRING: break; case BMH_SUFFIX: if (result) continue; break; case BMH_WORD: if (p > start && !isspacepuncttab [p [-1]]) continue; if (p+patlen < end && !isspacepuncttab [p [patlen]]) continue; break; case BMH_LAST: if (result) continue; if (p > start && !isspacepuncttab [p [-1]]) continue; break; default: abort (); } for (pt = p; pt >= start && *pt != '\n'; --pt) if (*pt == '\t') goto continue2; if (!previous || compare(previous, dbindex, pt + 1, end)) { ++count; if (!dict_add_word_to_list (l, database, dbindex, previous = pt + 1)) { break; } } FIND_NEXT(p,end); --p; } } continue2: ; } return count; } /* dict_search_bmh implements a version of the Boyer-Moore-Horspool text searching algorithm, as described in G. H. Gonnet and R. Baeza-Yates, HANDBOOK OF ALGORITHMS AND DATA STRUCTURES: IN PASCAL AND C (2nd ed). Addison-Wesley Publishing Co., 1991. Pages 258-9. */ static int dict_search_bmh( lst_List l, const unsigned char *word, const dictDatabase *database, dictIndex *dbindex, int flag ) { const unsigned char *const start = dbindex->start; const unsigned char *const end = dbindex->end; const int patlen = strlen( word ); int skip[UCHAR_MAX + 1]; int i; int j; int c; #if 0 int k; #endif const unsigned char *p, *pt, *ptr; int count = 0; const unsigned char *f = NULL; /* Boolean flag, but has to be a pointer */ const unsigned char *wpt; const unsigned char *previous = NULL; assert (dbindex); if (patlen < BMH_THRESHOLD) return dict_search_brute( l, word, database, dbindex, flag, patlen ); for (i = 0; i <= UCHAR_MAX; i++) { if (dbindex -> isspacealnum[i]) skip[i] = patlen; else skip[i] = 1; } for (i = 0; i < patlen-1; i++) skip[(unsigned char)word[i]] = patlen-i-1; for (p = start+patlen-1; p < end; ) { while (*p == '\t') { FIND_NEXT(p,end); p += patlen-1; if (p > end) return count; } ++_dict_comparisons; /* counter for profiling */ /* FIXME. Optimize this inner loop. */ for (j = patlen - 1, pt = p, wpt = word + patlen - 1; j >= 0; j--) { if (pt < start) break; while (pt >= start && !dbindex -> isspacealnum[*pt]) { if (*pt == '\n' || *pt == '\t') goto continue2; --pt; } c = *pt--; if (!dbindex -> flag_casesensitive){ c = tolowertab [c]; } if (c != *wpt--) break; } if (j == -1) { switch (flag){ case BMH_SUBSTRING: break; case BMH_SUFFIX: if (p[1] != '\t') goto continue2; break; case BMH_WORD: ptr = p - patlen + 1; if (ptr > start && !isspacepuncttab [ptr [-1]]) goto continue2; if (p < end && !isspacepuncttab [p [1]]) goto continue2; break; case BMH_LAST: if (p[1] != '\t') goto continue2; ptr = p - patlen + 1; if (ptr > start && !isspacepuncttab [ptr [-1]]) goto continue2; break; } for (; pt > start && *pt != '\n'; --pt) if (*pt == '\t') goto continue2; ++pt; assert (pt >= start && pt < end); if (!previous || compare(previous, dbindex, pt, dbindex->end)) { ++count; if (l){ if (!dict_add_word_to_list (l, database, dbindex, previous = pt)) { return count; } } } FIND_NEXT(p,end); f = p += patlen-1; /* Set boolean flag to non-NULL value */ if (p > end) return count; } continue2: if (f){ f = NULL; }else{ c = *p; if (!dbindex -> flag_casesensitive){ c = tolowertab [c]; } p += skip [c]; } } return count; } static int dict_search_substring( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex) { return dict_search_bmh( l, word, database, dbindex, BMH_SUBSTRING ); } static int dict_search_word( lst_List l, const char *word, const dictDatabase *database) { lst_Position pos; dictWord *dw; const char *p; char *ptr; int ret1, ret2; int count; int len; assert (database); assert (database -> index); if (database->index_word){ ret2 = dict_search_exact( l, word, database, database->index, 0 ); if (ret2 < 0) return ret2; count = lst_length (l); ret1 = dict_search_exact( l, word, database, database->index_word, 0 ); if (ret1 < 0) return ret1; LST_ITERATE (l, pos, dw){ if (count-- <= 0){ xfree (dw -> word); p = database -> index -> start + dw -> start; assert (p == database -> index -> start || p [-1] == '\n'); len = strchr (p, '\t') - p; dw -> word = xmalloc (len + 1); memcpy (dw -> word, p, len); dw -> word [len] = 0; dw -> start = -2; dw -> end = 0; } } return ret1 + ret2; }else{ return dict_search_bmh( l, word, database, database->index, BMH_WORD ); } } /* return non-zero if success, 0 otherwise */ static int dict_match ( const regex_t *re, const char *word, size_t word_len, int eflags) { #if defined(REG_STARTEND) regmatch_t subs[1]; subs [0].rm_so = 0; subs [0].rm_eo = word_len; return !regexec(re, word, 1, subs, eflags | REG_STARTEND); #else char *word_copy = (char *) alloca (word_len + 1); memcpy (word_copy, word, word_len); word_copy [word_len] = 0; return !regexec(re, word_copy, 0, NULL, eflags); #endif } static int dict_search_regexpr( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex, int type ) { const char *start = dbindex->start; const char *end = dbindex->end; const char *p, *pt; int count = 0; regex_t re; char erbuf[100]; int err; unsigned char first; const char *previous = NULL; assert (dbindex); #if 1 /* optimization code */ if (optStart_mode){ if ( *word == '^' && dbindex -> isspacealnum [(unsigned char) word[1]] && strchr (word, '|') == NULL) { first = word[1]; end = dbindex->optStart[i2c(c2i(first)+1)]; start = dbindex->optStart[first]; #if 0 fprintf (stderr, "optStart_regexp [%i] = %p\n", first, start); fprintf (stderr, "optStart_regexp [%i] = %p\n", i2c(c2i(first)+1), end); #endif if (end < start) end = dbindex->end; // FIND_NEXT(end, dbindex -> end); } } #endif if ((err = regcomp(&re, word, REG_ICASE|REG_NOSUB|type))) { regerror(err, &re, erbuf, sizeof(erbuf)); log_info( "regcomp(%s): %s\n", word, erbuf ); return 0; } pt = start; while (pt && pt < end) { for (p = pt; *p != '\t' && p < end; ++p); ++_dict_comparisons; if (dict_match (&re, pt, p - pt, 0)) { if (!previous || compare(previous, dbindex, pt, end)) { ++count; if (!dict_add_word_to_list (l, database, dbindex, previous = pt)) { break; } } } pt = p + 1; FIND_NEXT( pt, end ); } regfree(&re); return count; } static int dict_search_re( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex) { return dict_search_regexpr( l, word, database, dbindex, REG_EXTENDED ); } static int dict_search_regexp( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex) { return dict_search_regexpr( l, word, database, dbindex, 0 /*REG_BASIC*/ ); } static int dict_search_soundex( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex) { const char *pt; const char *end; int count = 0; char soundex [10]; char soundex2 [5]; char buffer[MAXWORDLEN]; char *d; const unsigned char *s; int i; int c = (unsigned char)*word; const char *previous = NULL; assert (dbindex); if (optStart_mode){ pt = dbindex->optStart[ c ]; end = dbindex->optStart[ i2c(c2i(c)+1) ]; if (end < pt) end = dbindex->end; }else{ pt = dbindex->start; end = dbindex->end; } txt_soundex2 (word, soundex); while (pt && pt < end) { for (i = 0, s = pt, d = buffer; i < MAXWORDLEN - 1; i++, ++s) { if (*s == '\t') break; if (!dbindex -> isspacealnum [*s]) continue; *d++ = *s; } *d = '\0'; txt_soundex2 (buffer, soundex2); if (!strcmp (soundex, soundex2)) { if (!previous || compare(previous, dbindex, pt, end)) { if (!dict_add_word_to_list (l, database, dbindex, previous = pt)) { break; } ++count; } } FIND_NEXT(pt,end); } return count; } /* I was unable to locate V. I. Levenshtein, "Binary codes capable of correcting deletions, insertions, and reversals," Sov. Phys.-Dokt. 10(8): Feb. 1966, 707-710. So, I used Joseph J. Pollock and Antonio Zamora, "Automatic spelling correction in scientific and scholarly text," CACM, 27(4): Apr. 1985, 358-368. They point out (p. 363) that the precedence of these tests, when used for spelling correction, is OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION. If list is not sorted, then this order should be used. In this routine, we only consider corrections with a Levenshtein distance of 1. */ typedef struct lev_args_ { const dictDatabase *database; dictIndex *dbindex; lst_List l; } LEV_ARGS; #define CHECK(word, args) \ if ((pt = dict_index_search((word), (args) -> dbindex)) \ && !compare((word), (args) -> dbindex, pt, (args) -> dbindex -> end)) \ { \ if (!set_member(s,(word))) { \ ++count; \ set_insert(s,str_find((word))); \ if (!dict_add_word_to_list ((args) -> l, (args) -> database, (args) -> dbindex, pt)) return count; \ PRINTF(DBG_LEV,(" %s added\n",(word))); \ } \ } #define LEV_VARS \ char tmp; #include "lev.h" static int dict_search_levenshtein ( lst_List l, const char *word, const dictDatabase *database, dictIndex *dbindex) { LEV_ARGS lev_args = { database, dbindex, l }; assert (database); assert (dbindex); if (database -> alphabet){ return dict_search_lev ( word, database -> alphabet, dbindex -> flag_utf8, &lev_args); }else{ if (dbindex -> flag_utf8){ return dict_search_lev ( word, global_alphabet_ascii, 1, &lev_args); }else{ return dict_search_lev ( word, global_alphabet_8bit, 0, &lev_args); } } } /* makes anagram of the 8-bit string 'str' if length == -1 then str is 0-terminated string */ static void stranagram_8bit (char *str, int length) { char* i = str; char* j; char v; assert (str); if (length == -1) length = strlen (str); j = str + length - 1; while (i < j){ v = *i; *i = *j; *j = v; ++i; --j; } } #if HAVE_UTF8 /* makes anagram of the utf-8 string 'str' Returns non-zero if success, 0 otherwise */ static int stranagram_utf8 (char *str) { size_t len; char *p; mbstate_t ps; assert (str); memset (&ps, 0, sizeof (ps)); for (p = str; *p; ){ len = mbrlen__ (p, MB_CUR_MAX__, &ps); if ((int) len < 0) return 0; /* not a UTF-8 string */ if (len > 1) stranagram_8bit (p, len); p += len; } stranagram_8bit (str, -1); return 1; } #endif /* makes anagram of utf-8 string 'str' */ static int stranagram (char *str, int utf8_string) { assert (str); #if HAVE_UTF8 if (utf8_string){ return stranagram_utf8 (str); }else{ stranagram_8bit (str, -1); return 1; } #else stranagram_8bit (str, -1); return 1; #endif } static int dict_search_suffix( lst_List l, const char *word, const dictDatabase *database) { int ret; lst_Position p; dictWord *dw; char *buf = NULL; int count; assert (database); if (database->index_suffix){ buf = (char *) alloca (strlen (word)); strcpy (buf, word); PRINTF(DBG_SEARCH, ("anagram: '%s' ==> ", buf)); if (!stranagram (buf, utf8_mode)){ PRINTF(DBG_SEARCH, ("failed building anagram\n")); return 0; /* invalid utf8 string */ } count = lst_length (l); PRINTF(DBG_SEARCH, ("'%s'\n", buf)); ret = dict_search_prefix ( l, buf, database, database->index_suffix, 0, INT_MAX); LST_ITERATE (l, p, dw) { if (count <= 0){ stranagram (dw -> word, utf8_mode); } --count; } return ret; }else{ return dict_search_bmh( l, word, database, database -> index, BMH_SUFFIX ); } } static int dict_search_last ( lst_List l, const char *word, const dictDatabase *database) { int ret; lst_Position p; dictWord *dw; char *buf = NULL; int count; assert (database); if (database->index_suffix){ buf = (char *) alloca (strlen (word)); strcpy (buf, word); PRINTF(DBG_SEARCH, ("anagram: '%s' ==> ", buf)); if (!stranagram (buf, utf8_mode)){ PRINTF(DBG_SEARCH, ("failed building anagram\n")); return 0; /* invalid utf8 string */ } count = lst_length (l); PRINTF(DBG_SEARCH, ("'%s'\n", buf)); ret = dict_search_first (l, buf, database, database->index_suffix); LST_ITERATE (l, p, dw) { if (count-- <= 0){ stranagram (dw -> word, utf8_mode); } } return ret; }else{ return dict_search_bmh( l, word, database, database -> index, BMH_LAST ); } } #if HAVE_UTF8 static const char *utf8_err_msg = "\ error: The request is not a valid UTF-8 string"; #endif /* returns a number of matches ( >= 0 ) or negative value for invalid UTF-8 string */ int dict_search_database_ ( lst_List l, const char *word, const dictDatabase *database, int strategy_or_define ) { char *buf = NULL; #if HAVE_UTF8 dictWord *dw = NULL; #endif unsigned int skip_count = 0; unsigned int item_count = INT_MAX; int strategy = strategy_or_define & ~DICT_MATCH_MASK; define_or_match = (strategy == strategy_or_define); assert (database); assert (database -> index); if (strategy == DICT_STRAT_DOT){ strategy = database -> default_strategy; } if (strategy == DICT_STRAT_NPREFIX){ if (2 == sscanf (word, "%u#%u#", &skip_count, &item_count)){ while (*word++ != '#'); ++word; while (*word++ != '#'); } } buf = alloca( strlen( word ) + 1 ); #if HAVE_UTF8 if ( !strcmp(utf8_err_msg, word) || tolower_alnumspace ( word, buf, database -> index -> flag_allchars, database -> index -> flag_casesensitive, utf8_mode)) { PRINTF(DBG_SEARCH, ("tolower_... ERROR!!!\n")); dw = xmalloc (sizeof (dictWord)); memset (dw, 0, sizeof (dictWord)); dw -> database = database; dw -> def = utf8_err_msg; dw -> def_size = -1; dw -> word = strdup (word); lst_append (l, dw); return -1; } #else tolower_alnumspace ( word, buf, database -> index -> flag_allchars, database -> index -> flag_casesensitive, utf8_mode); #endif if (!buf [0] && word [0]){ /* This may happen because of invalid --locale specified. Without following line entire dictionary will be returned for non-ASCII words. */ return 0; } switch (strategy) { case DICT_STRAT_EXACT: return dict_search_exact( l, buf, database, database->index, strategy_or_define != strategy); case DICT_STRAT_PREFIX: case DICT_STRAT_NPREFIX: return dict_search_prefix (l, buf, database, database->index, skip_count, item_count); case DICT_STRAT_SUBSTRING: return dict_search_substring( l, buf, database, database->index ); case DICT_STRAT_SUFFIX: return dict_search_suffix( l, buf, database ); case DICT_STRAT_RE: return dict_search_re( l, word, database, database->index ); case DICT_STRAT_REGEXP: return dict_search_regexp( l, word, database, database->index ); case DICT_STRAT_SOUNDEX: return dict_search_soundex( l, buf, database, database->index ); case DICT_STRAT_LEVENSHTEIN: return dict_search_levenshtein( l, buf, database, database->index); case DICT_STRAT_WORD: return dict_search_word( l, buf, database); case DICT_STRAT_FIRST: return dict_search_first( l, buf, database, database->index ); case DICT_STRAT_LAST: return dict_search_last( l, buf, database ); default: /* plugins may support unusual search strategies */ return 0; } } /* Replaces invisible databases with db argument. */ static void replace_invisible_databases ( lst_Position pos, const dictDatabase *db) { dictWord *dw; while (pos){ dw = (dictWord *) lst_get_position (pos); if ( dw -> database && dw -> database -> invisible && !dw -> database_visible) { dw -> database_visible = db; } pos = lst_next_position (pos); } } /* returns a number of matches ( >= 0 ) or negative value for invalid UTF-8 string */ int dict_search ( lst_List l, const char *const word, const dictDatabase *database, int strategy, int option_mime, int *extra_result, const dictPluginData **extra_data, int *extra_data_size) { int count = 0; dictWord *dw; int norm_strategy = strategy & ~DICT_MATCH_MASK; if (extra_result) *extra_result = DICT_PLUGIN_RESULT_NOTFOUND; assert (word); assert (database); if ( database -> strategy_disabled && database -> strategy_disabled [norm_strategy]) { /* disable_strategy keyword from configuration file */ #if 0 PRINTF (DBG_SEARCH, ( ":S: strategy '%s' is disabled for database '%s'\n", get_strategies () [norm_strategy] -> name, database -> databaseName ? database -> databaseName : "(unknown)")); #endif return 0; } PRINTF (DBG_SEARCH, (":S: Searching in '%s'\n", database -> databaseName)); #if 0 fprintf (stderr, "STRATEGY: %x\n", strategy); #endif if (database -> index){ PRINTF (DBG_SEARCH, (":S: database search\n")); count = dict_search_database_ (l, word, database, strategy); } #ifdef USE_PLUGIN if (!count && database -> plugin){ PRINTF (DBG_SEARCH, (":S: plugin search\n")); count = dict_search_plugin ( l, word, database, strategy, extra_result, extra_data, extra_data_size); if (count) return count; } #endif if (!count && database -> virtual_db_list){ lst_Position db_list_pos; dictDatabase *db = NULL; int old_count = lst_length (l); assert (lst_init_position (database -> virtual_db_list)); LST_ITERATE (database -> virtual_db_list, db_list_pos, db){ count += dict_search ( l, word, db, strategy, option_mime, extra_result, extra_data, extra_data_size); } if (count > old_count){ replace_invisible_databases ( lst_nth_position (l, old_count + 1), database); } } if (!count && database -> mime_db){ int old_count = lst_length (l); count += dict_search ( l, word, (option_mime ? database -> mime_mimeDB : database -> mime_nomimeDB), strategy, 0, extra_result, extra_data, extra_data_size); if (count > old_count){ replace_invisible_databases ( lst_nth_position (l, old_count + 1), database); } } if (count > 0 && extra_result) *extra_result = DICT_PLUGIN_RESULT_FOUND; return count; } dictIndex *dict_index_open( const char *filename, int init_flags, const dictIndex *base) { struct stat sb; static int tabInit = 0; dictIndex *i; dictDatabase db; int j; char buf[2]; int old_8bit_format = 0; int first_char; int first_char_uc; if (!filename) return NULL; i = xmalloc( sizeof( struct dictIndex ) ); if (!tabInit) dict_table_init(); tabInit = 1; memset( i, 0, sizeof( struct dictIndex ) ); if ((i->fd = open( filename, O_RDONLY )) < 0) err_fatal_errno( __FUNCTION__, "Cannot open index file \"%s\"\n", filename ); if (fstat( i->fd, &sb )) err_fatal_errno( __FUNCTION__, "Cannot stat index file \"%s\"\n", filename ); i->size = sb.st_size; if (mmap_mode){ #ifdef HAVE_MMAP if (i->size) { i->start = mmap( NULL, i->size, PROT_READ, MAP_SHARED, i->fd, 0 ); if ((void *)i->start == (void *)(-1)) err_fatal_errno ( __FUNCTION__, "Cannot mmap index file \"%s\"\b", filename ); } else i->start = NULL; /* allow for /dev/null dummy index */ #else err_fatal (__FUNCTION__, "This should not happen"); #endif }else{ i->start = xmalloc (i->size); if (-1 == read (i->fd, (char *) i->start, i->size)) err_fatal_errno ( __FUNCTION__, "Cannot read index file \"%s\"\b", filename ); close (i -> fd); i -> fd = 0; } i->end = i->start + i->size; i->flag_8bit = 0; if (base){ i->flag_utf8 = base -> flag_utf8; i->flag_allchars = base -> flag_allchars; i->flag_casesensitive = base -> flag_casesensitive; } i->isspacealnum = isspacealnumtab; if (optStart_mode){ for (j = 0; j <= UCHAR_MAX; j++) i->optStart[j] = i->start; } if (init_flags){ memset (&db, 0, sizeof (db)); db.index = i; /* for exact search */ i->flag_allchars = 1; i->isspacealnum = isspacealnumtab_allchars; /* allchars flag */ i->flag_allchars = 0 != dict_search_database_ (NULL, DICT_FLAG_ALLCHARS, &db, DICT_STRAT_EXACT); PRINTF(DBG_INIT, (":I: \"%s\": flag_allchars=%i\n", filename, i->flag_allchars)); /* case-sensitive flag */ i->flag_casesensitive = 0 != dict_search_database_ (NULL, DICT_FLAG_CASESENSITIVE, &db, DICT_STRAT_EXACT); PRINTF(DBG_INIT, (":I: \"%s\": flag_casesensitive=%i\n", filename, i->flag_casesensitive)); /* utf8 flag */ if (!i -> flag_allchars) i -> isspacealnum = isspacealnumtab; i->flag_utf8 = 0 != dict_search_database_ (NULL, DICT_FLAG_UTF8, &db, DICT_STRAT_EXACT); PRINTF(DBG_INIT, (":I: \"%s\": flag_utf8=%i\n", filename, i->flag_utf8)); if (i->flag_utf8 && !utf8_mode){ log_info( ":E: locale '%s' can not be used for utf-8 dictionaries. Exiting\n", locale ); exit (1); } /* 8bit flag */ i->flag_8bit = 0 != dict_search_database_ (NULL, DICT_FLAG_8BIT_NEW, &db, DICT_STRAT_EXACT); old_8bit_format = 0 != dict_search_database_ (NULL, DICT_FLAG_8BIT_OLD, &db, DICT_STRAT_EXACT); if (old_8bit_format){ log_info( ":E: index file '%s' was created using dictfmt <1.9.15\n" ":E: and can not be used with dictd-1.9.15 or later\n" ":E: Rebuild it like this:\n" ":E: dictunformat db.index < db.dict | dictfmt -t --locale <8bit-locale> db-new\n", filename ); exit (1); } PRINTF(DBG_INIT, (":I: \"%s\": flag_8bit=%i\n", filename, i->flag_8bit)); if (i->flag_8bit && !bit8_mode){ log_info( ":E: locale '%s' can not be used for 8-bit dictionaries. Exiting\n", locale ); exit (1); } } if (optStart_mode){ buf[0] = ' '; buf[1] = '\0'; i->optStart[ ' ' ] = binary_search_8bit( buf, i, i->start, i->end ); for (j = 0; j < charcount; j++) { first_char = c(j); buf[0] = first_char; buf[1] = '\0'; i->optStart [first_char] = binary_search_8bit( buf, i, i->start, i->end ); if (!utf8_mode || first_char < 128){ first_char_uc = toupper (first_char); assert (first_char_uc >= 0 && first_char_uc <= UCHAR_MAX); i->optStart [first_char_uc] = i->optStart [first_char]; } } for (j = '0'; j <= '9'; j++) { buf[0] = j; buf[1] = '\0'; i->optStart[j] = binary_search_8bit( buf, i, i->start, i->end ); } i->optStart[UCHAR_MAX] = i->end; i->optStart[UCHAR_MAX+1] = i->end; if (dbg_test (DBG_SEARCH)){ for (j=0; j <= UCHAR_MAX; ++j){ if (!utf8_mode || j <= CHAR_MAX) printf ( "optStart [%c] = (%p) %10s\n", j, i->optStart [j], i->optStart [j]); else printf ( "optStart [%i] = (%p) %10s\n", j, i->optStart [j], i->optStart [j]); } } } return i; } void dict_index_close( dictIndex *i ) { if (!i) return; if (mmap_mode){ #ifdef HAVE_MMAP if (i->fd >= 0) { if(i->start) munmap( (void *)i->start, i->size ); close( i->fd ); i->fd = 0; } #else err_fatal (__FUNCTION__, "This should not happen"); #endif }else{ if (i -> start) xfree ((char *) i -> start); } xfree (i); }