/* dictdplugin_judy.c --
 * Created: Tue Aug 5 19:19:48 2003 by vle@gmx.net
 * Copyright 2003 Aleksey Cheusov
 * This program comes with ABSOLUTELY NO WARRANTY.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 1, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Dictionary plugin that loads an index file into a JudySL array keyed by
 * normalized headword and answers DEFINE / MATCH requests from it.
 */

#include "dictP.h"
#include "dictdplugin.h"
#include "data.h"
#include "str.h"
#include "plugins_common.h"

/* NOTE(review): the names of the system headers below were lost when this
 * file was extracted (only bare "#include" tokens survived).  They are
 * reconstructed from the symbols this file uses -- confirm against the
 * original source. */
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#if STRING_H
#include <string.h>
#endif
#include <errno.h>
#include <Judy.h>

#if HAVE_LIMITS_H
#include <limits.h>
#endif

#if WCTYPE_H
#include <wctype.h>
#endif

#define BUFSIZE 4096

#ifndef BOOL
#define BOOL char
#endif

/**********************************************************/

/* internal heap may speed-up the plugin */
/* #define DONOT_USE_INTERNAL_HEAP */
#include "heap.h"

/* Per-database state; one instance is created by dictdb_open(). */
typedef struct global_data_s {
   char m_err_msg [BUFSIZE];      /* last error text; "" means no error */

   void *m_heap;                  /* heap used for result strings */
   void *m_heap2;                 /* heap used for the m_mres pointer array */

   /* results of the last dictdb_search(), released by dictdb_free() */
   int m_mres_count;
   const char ** m_mres;
   int *m_mres_sizes;

   /* flat array of (offset, size) pairs, grouped by headword in the
      iteration order of m_judy_array */
   int *m_offs_size_array;
   /* number of (offset, size) pairs stored in m_offs_size_array */
   Word_t m_offs_size_count;

   dictData *m_data;

   /* strategy numbers received in dictdb_open(); -1 means "not set" */
   int m_strat_exact;
   int m_strat_prefix;
   int m_strat_lev;
   int m_strat_word;

   /* headword -> pointer to its first pair in m_offs_size_array */
   Pvoid_t m_judy_array;
   size_t m_max_hw_len;           /* longest normalized headword seen */

   char m_conf_index_fn [NAME_MAX+1];
   char m_conf_data_fn [NAME_MAX+1];
   char m_default_db_dir [NAME_MAX+1];

   /* m_conf_*: values from the plugin configuration;
      m_flag_*: values detected from 00-database-* index entries */
   BOOL m_conf_allchars;
   BOOL m_conf_utf8;
   BOOL m_flag_allchars;
   BOOL m_flag_utf8;

   char *m_alphabet_global_8bit;
   char *m_alphabet_global_ascii;
   char *m_alphabet;              /* from the 00-database-alphabet entry */
} global_data;

int dictdb_close (void *dict_data);
int dictdb_open (
   const dictPluginData *init_data,
   int init_data_size,
   int *version,
   void ** dict_data);
const char *dictdb_error (void *dict_data);
int dictdb_free (void * dict_data);
int dictdb_search (
   void *dict_data,
   const char *word, int word_size,
   int search_strategy,
   int *ret,
   const char **result_extra, int *result_extra_size,
   const char * const* *result,
   const int **result_sizes,
   int *results_count);

/**********************************************************/

/* Record ERR_MSG as the current error (truncated to BUFSIZE). */
static void plugin_error (global_data *dict_data, const char *err_msg)
{
   strlcpy (dict_data -> m_err_msg, err_msg, BUFSIZE);
}

/* Allocate a zero-filled global_data with all strategies marked unset. */
static global_data * global_data_create (void)
{
   global_data *d = (global_data *) xmalloc (sizeof (*d));

   memset (d, 0, sizeof (*d));

   d -> m_strat_exact  = -1;
   d -> m_strat_prefix = -1;
   d -> m_strat_lev    = -1;
   d -> m_strat_word   = -1;

   return d;
}

/* Release everything owned by D and D itself.  A NULL D is ignored
   (previously it was dereferenced before being checked). */
static void global_data_destroy (global_data *d)
{
   if (!d)
      return;

   dictdb_free (d);

   if (d -> m_offs_size_array)
      xfree (d -> m_offs_size_array);

   if (d -> m_alphabet)
      xfree (d -> m_alphabet);

   if (d -> m_alphabet_global_8bit)
      xfree (d -> m_alphabet_global_8bit);

   if (d -> m_alphabet_global_ascii)
      xfree (d -> m_alphabet_global_ascii);

   JudySLFreeArray (&d -> m_judy_array, 0);

   heap_destroy (&d -> m_heap);
   heap_destroy (&d -> m_heap2);

   dict_data_close (d -> m_data);

   xfree (d);
}

/********************************************************************/

/* Remember the number of each strategy this plugin knows by name
   ("exact", "prefix", "lev", "word"); other names are ignored. */
static void set_strat (
   const dictPluginData_strategy * strat_data,
   global_data * dict_data)
{
   if (!strcmp (strat_data -> name, "exact")){
      dict_data -> m_strat_exact = strat_data -> number;
   }else if (!strcmp (strat_data -> name, "prefix")){
      dict_data -> m_strat_prefix = strat_data -> number;
   }else if (!strcmp (strat_data -> name, "lev")){
      dict_data -> m_strat_lev = strat_data -> number;
   }else if (!strcmp (strat_data -> name, "word")){
      dict_data -> m_strat_word = strat_data -> number;
   }
}

/* Build a path in DEST (capacity DEST_SIZE).  An FN starting with '/' is
   copied as is; otherwise the result is DIR, a '/' separator if DIR does
   not already end with one, then FN.  An empty DIR yields FN alone
   (previously dest[-1] was read in that case). */
static void concat_dir_and_fn (
   char *dest, size_t dest_size,
   const char *dir, const char *fn)
{
   size_t dir_len;

   if (fn [0] == '/'){
      strlcpy (dest, fn, dest_size);
      return;
   }

   strlcpy (dest, dir, dest_size);

   dir_len = strlen (dest);
   if (dir_len > 0 && dest [dir_len - 1] != '/')
      strlcat (dest, "/", dest_size);

   strlcat (dest, fn, dest_size);
}

/* Split one index line "word<TAB>offset<TAB>size" in place: BUF is cut at
   the first tab (leaving the word), and the two following fields are
   passed to b64_decode() to produce *DEF_OFFSET and *DEF_SIZE.
   Returns 1 on success; on a missing tab sets the plugin error and
   returns 0. */
static BOOL split_index (
   global_data *dict_data,
   char *buf,
   unsigned long *def_offset, unsigned long *def_size)
{
   char *tab          = NULL;
   char *def_offset_s = NULL;
   char *def_size_s   = NULL;

   tab = strchr (buf, '\t');
   if (!tab){
      plugin_error (dict_data, "corrupted index file");
      return 0;
   }
   *tab = 0;
   def_offset_s = tab + 1;

   tab = strchr (def_offset_s, '\t');
   if (!tab){
      plugin_error (dict_data, "corrupted index file");
      return 0;
   }
   *tab = 0;
   def_size_s = tab + 1;

   *def_offset = b64_decode (def_offset_s);
   *def_size   = b64_decode (def_size_s);

   return 1;
}

/* First-pass callback for read_index_file(): count one more definition
   for WORD in its Judy value, and note the special 00-database-utf8 /
   00-database-allchars entries.  OFFS and SIZE are unused here. */
static void it_incr1 (
   global_data *dict_data,
   PPvoid_t value,
   const char* word,
   unsigned long offs, unsigned long size)
{
   if (
      !strcmp (word, "00-database-utf8") ||
      !strcmp (word, "00databaseutf8"))
   {
      dict_data -> m_flag_utf8 = 1;
   }

   if (
      !strcmp (word, "00-database-allchars") ||
      !strcmp (word, "00databaseallchars"))
   {
      dict_data -> m_flag_allchars = 1;
   }

   ++ *(PWord_t) value;
}

/* iterate over entries WORD/VALUE in judy array JUDY */
#define JUDY_ITERATE(JUDY,VALUE,WORD)        \
   for (;                                    \
	VALUE;                               \
	VALUE = JudySLNext (JUDY, WORD, 0))

#define JUDY_ITERATE_ALL(JUDY,VALUE,WORD)    \
   WORD [0] = 0;                             \
   VALUE = JudySLFirst (JUDY, WORD, 0);      \
   JUDY_ITERATE(JUDY,VALUE,WORD)

/*
static void debug_print (global_data *dict_data)
{
   char word [BUFSIZE] = "";
   PPvoid_t value;

   assert (sizeof (word) > dict_data -> m_max_hw_len);

   JUDY_ITERATE_ALL (dict_data -> m_judy_array, value, word){
      fprintf (stderr, "%s --> %li\n", word, * (PWord_t) value);
   }
}
*/

/* Replace each per-word count in the Judy array by the running total of
   the counts that precede it in iteration order, i.e. the index of the
   word's first pair.  Returns the grand total of counts. */
static Word_t count2offs (global_data *dict_data)
{
   char word [BUFSIZE] = "";
   PPvoid_t value;
   Word_t sum = 0;
   Word_t val;

   assert (sizeof (word) > dict_data -> m_max_hw_len);

   JUDY_ITERATE_ALL (dict_data -> m_judy_array, value, word){
      val = (*(PWord_t) value);
      *(PWord_t) value = sum;
      sum += val;
   }

   return sum;
}

/* Read the configured index file line by line.  Each line is split with
   split_index(), its word is normalized in place by tolower_alnumspace(),
   inserted into the Judy array, and FUN is called with the Judy value
   slot, the normalized word, and the decoded offset and size.
   m_max_hw_len is raised to the longest normalized word seen.
   On any failure the plugin error is set and reading stops. */
static void read_index_file (
   global_data *dict_data,
   void (*fun) (
      global_data *dict_data,
      PPvoid_t value,
      const char* word,
      unsigned long offs, unsigned long size))
{
   char buf [BUFSIZE];
   FILE *fd = NULL;
   PPvoid_t value = NULL;
   unsigned long def_offset;
   unsigned long def_size;
   size_t len;

   fd = fopen (dict_data -> m_conf_index_fn, "r");
   if (!fd){
      plugin_error (dict_data, strerror(errno));
      return;
   }

   while (fgets (buf, BUFSIZE, fd) != (char *) NULL){
      /* strip the trailing newline; guard against an empty buffer so that
	 buf [-1] is never touched */
      len = strlen (buf);
      if (len > 0 && '\n' == buf [len - 1])
	 buf [len - 1] = 0;

      if (!split_index (dict_data, buf, &def_offset, &def_size)){
	 fclose (fd);
	 return;
      }

      if (
	 tolower_alnumspace (
	    buf, buf,
	    dict_data -> m_conf_allchars, 0, dict_data -> m_conf_utf8))
      {
	 plugin_error (
	    dict_data, "tolower_alnumspace failed while reading .index file");
	 fclose (fd);
	 return;
      }

      len = strlen (buf);
      if (len > dict_data -> m_max_hw_len)
	 dict_data -> m_max_hw_len = len;

      value = JudySLIns (&dict_data -> m_judy_array, buf, 0);
      /* report the failure instead of relying on assert(), which is
	 compiled out under NDEBUG */
      if (value == (PPvoid_t) 0 || value == (PPvoid_t) -1){
	 plugin_error (
	    dict_data, "JudySLIns failed while reading .index file");
	 fclose (fd);
	 return;
      }

      (*fun) (dict_data, value, buf, def_offset, def_size);
   }

   if (ferror (fd)){
      fclose (fd);
      plugin_error (dict_data, "reading error");
      return;
   }

   fclose (fd);
}

/* Second-pass callback for read_index_file(): store (OFFS, SIZE) in the
   first still-unfilled pair (offset == -1) of WORD's group, starting at
   the index held in the Judy value.  The scan is bounded by
   m_offs_size_count so an index file that differs from the first pass
   cannot cause a write past the array. */
static void it_fill_array (
   global_data *dict_data,
   PPvoid_t value,
   const char* word,
   unsigned long offs, unsigned long size)
{
   Word_t val = * (PWord_t) value;

   while (
      val < dict_data -> m_offs_size_count &&
      dict_data -> m_offs_size_array [val + val + 0] != -1)
   {
      ++val;
   }

   if (val >= dict_data -> m_offs_size_count){
      plugin_error (dict_data, "index file changed while it was being read");
      return;
   }

   dict_data -> m_offs_size_array [val + val + 0] = offs;
   dict_data -> m_offs_size_array [val + val + 1] = size;
}

/* Load the index in two passes: the first counts definitions per word
   (it_incr1), count2offs() turns counts into start indexes, the second
   fills the (offset, size) array (it_fill_array).  Finally every Judy
   value is replaced by a pointer to the word's first pair.
   Errors are reported through the plugin error message. */
static void init_index_file (global_data *dict_data)
{
   Word_t sum = 0;
   size_t array_size = 0;
   char word [BUFSIZE]="";
   PPvoid_t value;
   Word_t val;

   dict_data -> m_judy_array = NULL;

   read_index_file (dict_data, it_incr1);
   if (dict_data -> m_err_msg [0])
      return;

   if (!dict_data -> m_conf_utf8 && dict_data -> m_flag_utf8){
      plugin_error (
	 dict_data,
	 "'utf-8' flag in plugin configuration and database files differ");
      return;
   }

   if (dict_data -> m_conf_allchars != dict_data -> m_flag_allchars){
      plugin_error (
	 dict_data,
	 "'allchars' flag in the plugin configuration and database files differ");
      return;
   }

   /* debug_print (dict_data); */

   sum = count2offs (dict_data);
   dict_data -> m_offs_size_count = sum;

   array_size = 2 * (sum/* + 1*/) * sizeof (int);

   assert (sizeof (word) > dict_data -> m_max_hw_len);

   dict_data -> m_offs_size_array = xmalloc (array_size);
   /* all bytes 0xFF: every int reads as -1, which marks a pair as unfilled */
   memset (dict_data -> m_offs_size_array, -1, array_size);

   read_index_file (dict_data, it_fill_array);
   if (dict_data -> m_err_msg [0])
      return;

   JUDY_ITERATE_ALL (dict_data -> m_judy_array, value, word){
      val = *(PWord_t) value;
      *value = dict_data -> m_offs_size_array + val * 2;
      /*
      fprintf (stderr, "%s --> %p\n", word, *value);
      fprintf (stderr, "%s --> %li\n", word,
	       *(PWord_t)value - (long)dict_data -> m_offs_size_array);
      */
   }

   /*
   JUDY_ITERATE_ALL (dict_data -> m_judy_array, value, word){
      fprintf (
	 stderr, "%s --> %i %i\n",
	 word, ((int *) *value) [0], ((int *) *value) [1]);
   }
   */
}

/* Open the configured data file; must be called at most once. */
static void init_data_file (global_data *dict_data)
{
   assert (!dict_data -> m_data);

   dict_data -> m_data = dict_data_open (dict_data -> m_conf_data_fn, 0);
}

/* process_lines() callback: apply one "option = value" pair of the plugin
   configuration.  "allchars" / "utf8" are switched on by any value other
   than "0" or ""; "index" / "data" are resolved against the default
   database directory.  Unknown options are ignored.  Always returns 0. */
static int process_name_value (
   const char *option, const char *value, void *data)
{
   global_data *dict_data = (global_data *) data;

   if (!strcmp(option, "allchars")){
      if (strcmp (value, "0") && strcmp (value, "")){
	 dict_data -> m_conf_allchars = 1;
      }
   }else if (!strcmp(option, "utf8")){
      if (strcmp (value, "0") && strcmp (value, "")){
	 dict_data -> m_conf_utf8 = 1;
      }
   }else if (!strcmp(option, "index")){
      concat_dir_and_fn (
	 dict_data -> m_conf_index_fn,
	 sizeof (dict_data -> m_conf_index_fn),
	 dict_data -> m_default_db_dir,
	 value);
   }else if (!strcmp(option, "data")){
      concat_dir_and_fn (
	 dict_data -> m_conf_data_fn,
	 sizeof (dict_data -> m_conf_data_fn),
	 dict_data -> m_default_db_dir,
	 value);
   }

   return 0;
}

/* process_lines() callback: record a malformed configuration line as the
   plugin error. */
static void on_error (const char *bad_line, void *data)
{
   global_data *dict_data = (global_data *) data;

   snprintf (
      dict_data -> m_err_msg, BUFSIZE,
      "invalid configure line: '%s'", bad_line);
}

/* Look up the "00-database-alphabet" entry and, if found, keep the first
   line of its first definition as m_alphabet.  The search results are
   released before returning. */
static void init_alphabet (global_data *dict_data)
{
   int ret = 0;
   int exit_code = 0;
   const char * const* defs;
   const int * defs_sizes;
   int count = 0;
   int len = 0;
   char *p = NULL;
   char *alphabet = NULL;

   assert (dict_data);

   exit_code = dictdb_search (
      dict_data,
      "00-database-alphabet", -1,
      dict_data -> m_strat_exact,
      &ret,
      NULL, 0,
      &defs, &defs_sizes,
      &count);

   if (!exit_code && ret == DICT_PLUGIN_RESULT_FOUND && count > 0){
      /* a size of -1 means the definition is NUL-terminated */
      if (-1 == defs_sizes [0])
	 len = strlen (defs [0]);
      else
	 len = defs_sizes [0];

      alphabet = dict_data -> m_alphabet = xmalloc (len + 1);

      memcpy (alphabet, defs [0], len);
      alphabet [len] = 0;

      p = strchr (alphabet, '\n');
      if (p)
	 *p = 0;

      /* fprintf (stderr, "alphabet = `%s`\n", alphabet);*/
   }

   dictdb_free (dict_data);
}

/* Create the plugin instance and load the database.
 *
 * init_data / init_data_size: array of initialization records (strategy
 *    numbers, plugin configuration text, default database directory,
 *    global alphabets); unknown record ids are ignored.
 * version: if not NULL, receives 0.
 * data: receives the new instance, also when an error is returned, so
 *    that the error text can be read with dictdb_error().
 *
 * Returns 0 on success or a non-zero code (2..8) identifying the step
 * that failed.
 */
int dictdb_open (
   const dictPluginData *init_data,
   int init_data_size,
   int *version,
   void ** data)
{
   int i;
   int err;

   global_data *dict_data = global_data_create ();

   *data = (void *) dict_data;

   err = heap_create (&dict_data -> m_heap, NULL);
   if (err){
      plugin_error (dict_data, heap_error (err));
      return 2;
   }

   err = heap_create (&dict_data -> m_heap2, NULL);
   if (err){
      plugin_error (dict_data, heap_error (err));
      return 3;
   }

   if (version)
      *version = 0;

   for (i=0; i < init_data_size; ++i){
      switch (init_data [i].id){
      case DICT_PLUGIN_INITDATA_STRATEGY:
	 set_strat (
	    (const dictPluginData_strategy * ) init_data [i].data,
	    dict_data);
	 break;

      case DICT_PLUGIN_INITDATA_DICT:
	 {
	    int len = init_data [i].size;
	    char *buf = NULL;

	    if (-1 == len)
	       len = strlen (init_data [i].data);

	    /* private, NUL-terminated copy of exactly LEN bytes: the
	       configuration is not required to be NUL-terminated when an
	       explicit size is given */
	    buf = (char *) xmalloc (len + 1);
	    memcpy (buf, init_data [i].data, len);
	    buf [len] = 0;

	    process_lines (buf, len, dict_data, process_name_value, on_error);

	    /* the copy is no longer needed on either path (it used to leak
	       when a configuration error was reported) */
	    xfree (buf);

	    if (dict_data -> m_err_msg [0]){
	       dictdb_free (dict_data);
	       return 4;
	    }

	    if (!dict_data -> m_conf_index_fn [0]){
	       plugin_error (dict_data, "missing 'index' option");
	       return 5;
	    }
	    if (!dict_data -> m_conf_data_fn [0]){
	       plugin_error (dict_data, "missing 'data' option");
	       return 6;
	    }
	 }
	 break;

      case DICT_PLUGIN_INITDATA_DEFDBDIR:
	 strlcpy (
	    dict_data -> m_default_db_dir,
	    init_data [i].data,
	    sizeof (dict_data -> m_default_db_dir));
	 break;

      case DICT_PLUGIN_INITDATA_ALPHABET_8BIT:
	 dict_data -> m_alphabet_global_8bit = xstrdup (init_data [i].data);
	 break;

      case DICT_PLUGIN_INITDATA_ALPHABET_ASCII:
	 dict_data -> m_alphabet_global_ascii = xstrdup (init_data [i].data);
	 break;

      default:
	 break;
      }
   }

   init_index_file (dict_data);
   /* do not go on to open the data file once the index has failed */
   if (dict_data -> m_err_msg [0])
      return 7;

   init_data_file (dict_data);
   if (dict_data -> m_err_msg [0])
      return 7;

   if (dict_data -> m_max_hw_len > BUFSIZE - 100){
      plugin_error (dict_data, "Index file contains too long word");
      return 8;
   }

   init_alphabet (dict_data);

   /* debug_print (dict_data); */

   return 0;
}

/* Destroy the instance created by dictdb_open().  Always returns 0. */
int dictdb_close (void *data)
{
   global_data_destroy (data);

   return 0;
}

/* Return the current error message, or NULL if none is set. */
const char *dictdb_error (void *dict_data)
{
   global_data *data = (global_data *)dict_data;

   if (data -> m_err_msg [0])
      return data -> m_err_msg;
   else
      return NULL;
}

/* Release the results of the last dictdb_search() and reset the result
   fields.  A NULL DATA is ignored.  Always returns 0. */
int dictdb_free (void * data)
{
   int i;
   global_data *dict_data = (global_data *) data;

   /* fprintf (stderr, "dictdb_free\n"); */

   if (dict_data){
      free_minus1_array (dict_data -> m_mres_sizes);
      dict_data -> m_mres_sizes = NULL;

      for (i = 0; i < dict_data -> m_mres_count; ++i){
	 heap_free (dict_data -> m_heap, (void *) dict_data -> m_mres [i]);
      }
      dict_data -> m_mres_count = 0;

      heap_free (dict_data -> m_heap2, dict_data -> m_mres);
      dict_data -> m_mres = NULL;
   }

   return 0;
}

/* set dict_data->m_mres_count and dict_data->m_mres */
/* "exact" strategy: the single result is WORD itself if it is a key of
   the Judy array; an empty WORD never matches.  Always returns 0. */
static int match_exact (
   global_data *dict_data,
   const char *word)
{
   int const * const *result_curr;

   if (!word [0])
      return 0;

   result_curr = (int const *const *) JudySLGet (
      dict_data -> m_judy_array, word, 0);

   if (!result_curr){
      return 0;
   }

   dict_data -> m_mres = (const char **)
      heap_alloc (dict_data -> m_heap2, sizeof (dict_data -> m_mres [0]));

   dict_data -> m_mres [0] = heap_strdup (dict_data -> m_heap, word);
   dict_data -> m_mres_count = 1;

   return 0;
}

/* "prefix" strategy: starting at WORD (or the next key after it), collect
   keys in Judy order until one no longer begins with WORD.
   Always returns 0. */
static int match_prefix (
   global_data *dict_data,
   const char *word)
{
   PPvoid_t value;
   char curr_word [BUFSIZE];
   size_t len = strlen (word);
   int cmp_res;

   strlcpy (curr_word, word, sizeof (curr_word));

   value = JudySLGet (dict_data -> m_judy_array, curr_word, 0);
   if (!value)
      value = JudySLNext (dict_data -> m_judy_array, curr_word, 0);

   /* fprintf (stderr, "first=%s %p\n", curr_word, value);*/

   for (
      ;
      value;
      value = JudySLNext (dict_data -> m_judy_array, curr_word, 0))
   {
      cmp_res = strncmp (word, curr_word, len);
      if (cmp_res){
	 /* fprintf (stderr, "%s != %s\n", word, curr_word); */
	 break;
      }

      ++dict_data -> m_mres_count;

      dict_data -> m_mres = (const char **)
	 heap_realloc (
	    dict_data -> m_heap2,
	    dict_data -> m_mres,
	    dict_data -> m_mres_count * sizeof (dict_data -> m_mres [0]));
      dict_data -> m_mres [dict_data -> m_mres_count - 1] =
	 heap_strdup (dict_data -> m_heap, curr_word);
   }

   return 0;
}

/* Hooks consumed by lev.h (included below): CHECK appends WORD to the
   results when it is a key of the Judy array and differs from the
   previously appended candidate.  The shape of these macros is part of
   the contract with lev.h and is kept unchanged. */
#define CHECK(word, dict_data)                                        \
   if ((word) [0]){                                                   \
      value = JudySLGet ((dict_data) -> m_judy_array, (word), 0);\
      if (value && strcmp (prev_buf, (word))){                        \
         strlcpy (prev_buf, (word), BUFSIZE);                         \
                                                                      \
         ++(dict_data) -> m_mres_count;                               \
                                                                      \
         (dict_data) -> m_mres = (const char **)                      \
	    heap_realloc (                                            \
	       (dict_data) -> m_heap2,                                \
               (dict_data) -> m_mres,                                 \
               (dict_data) -> m_mres_count                            \
                  * sizeof ((dict_data) -> m_mres [0]));              \
         (dict_data) -> m_mres [(dict_data) -> m_mres_count - 1] =    \
            heap_strdup ((dict_data) -> m_heap, (word));              \
      }                                                               \
   }

#define LEV_VARS \
      PPvoid_t value; \
      char prev_buf [BUFSIZE] = ""; \
      char tmp;

#define LEV_ARGS global_data

/* alphabet used by match_lev() when the database does not provide one */
static char const global_alphabet [] = "qwertyuiopasdfghjklzxcvbnm0123456789";

#include "lev.h"

/* "lev" strategy: delegate to dict_search_lev() from lev.h, using the
   database alphabet if one was loaded and global_alphabet otherwise.
   Always returns 0. */
static int match_lev (
   global_data *dict_data,
   const char *word)
{
   const char *alphabet = dict_data -> m_alphabet;

   if (!alphabet)
      alphabet = global_alphabet;

   dict_search_lev (word, alphabet, dict_data -> m_flag_utf8, dict_data);

   return 0;
}

/* "word" strategy: not implemented, produces no results. */
static int match_word (
   global_data *dict_data,
   const char *word)
{
   return 0;
}

/* Run a DEFINE or MATCH request.
 *
 * word / word_size: query; word_size == -1 means "use strlen (word)".
 * search_strategy: strategy number, with DICT_MATCH_MASK set for MATCH.
 * ret: receives DICT_PLUGIN_RESULT_FOUND or DICT_PLUGIN_RESULT_NOTFOUND.
 * result_extra / result_extra_size / result_sizes: cleared if not NULL.
 * result / result_sizes / results_count: filled in only when something
 *    was found; the data stays owned by the plugin until dictdb_free().
 *
 * Returns 0 on success (found or not), non-zero on error.  The previous
 * results must have been released with dictdb_free() before the call.
 */
int dictdb_search (
   void *data,
   const char *word, int word_size,
   int search_strategy,
   int *ret,
   const char **result_extra, int *result_extra_size,
   const char * const* *result,
   const int **result_sizes,
   int *results_count)
{
   int match_search_type;
   char word_copy2 [BUFSIZE];
   int exit_code = 0;

   global_data *dict_data = (global_data *) data;

   if (result_extra)
      *result_extra = NULL;

   if (result_extra_size)
      *result_extra_size = 0;

   if (result_sizes)
      *result_sizes = NULL;

   *ret = DICT_PLUGIN_RESULT_NOTFOUND;

   if (-1 == word_size){
      word_size = strlen (word);
   }

   match_search_type = search_strategy & DICT_MATCH_MASK;
   search_strategy &= ~DICT_MATCH_MASK;

   assert (!dict_data -> m_mres);
   assert (!dict_data -> m_mres_sizes);
   assert (!dict_data -> m_mres_count);
   assert (heap_isempty (dict_data -> m_heap));

   strlcpy (word_copy2, word, sizeof (word_copy2));

   if (
      tolower_alnumspace (
	 word_copy2, word_copy2,
	 dict_data -> m_conf_allchars, 0, dict_data -> m_conf_utf8))
   {
      plugin_error (dict_data, "tolower_alnumspace in dictdb_search failed");
      return 1;
   }

   /* a query longer than every headword cannot match; the cast spells out
      the conversion the comparison performed implicitly */
   if ((size_t) word_size > dict_data -> m_max_hw_len){
      /* fprintf (stderr, "This word is too long\n"); */
      return 0;
   }

   if (match_search_type){
      /* MATCH command */
      dict_data -> m_mres_count = 0;

      if (search_strategy == dict_data -> m_strat_exact){
	 exit_code = match_exact (dict_data, word_copy2);
      }else if (search_strategy == dict_data -> m_strat_word){
	 exit_code = match_word (dict_data, word_copy2);
      }else if (search_strategy == dict_data -> m_strat_prefix){
	 exit_code = match_prefix (dict_data, word_copy2);
      }else if (search_strategy == dict_data -> m_strat_lev){
	 exit_code = match_lev (dict_data, word_copy2);
      }

      if (exit_code)
	 return exit_code;

      if (!dict_data -> m_mres_count)
	 return 0;

      dict_data -> m_mres_sizes = (int *)
	 alloc_minus1_array (dict_data -> m_mres_count);

      *result        = dict_data -> m_mres;
      *result_sizes  = dict_data -> m_mres_sizes;
      *results_count = dict_data -> m_mres_count;

      *ret = DICT_PLUGIN_RESULT_FOUND;

      return 0;
   }else{
      /* DEFINE command */
      int const * const * offs_size;
      int const * const * offs_size_next;
      const int *first;
      const int *end;
      int cnt;
      int i;

      if (!word_copy2 [0])
	 return 0;

      offs_size = (int const *const *) JudySLGet (
	 dict_data -> m_judy_array, word_copy2, 0);

      if (!offs_size)
	 return 0;

      first = (const int *) *offs_size;

      /* The word's pairs run up to the first pair of the next key in Judy
	 order, or, for the last key, up to the end of the whole array
	 (this case used to be hard-coded to a single definition). */
      offs_size_next = (const int *const *) JudySLNext (
	 dict_data -> m_judy_array, word_copy2, 0);

      if (offs_size_next){
	 end = (const int *) *offs_size_next;
      }else{
	 end = dict_data -> m_offs_size_array
	    + 2 * dict_data -> m_offs_size_count;
      }

      cnt = (int) ((end - first) / 2);

      dict_data -> m_mres = (const char **)
	 heap_alloc (
	    dict_data -> m_heap2,
	    cnt * sizeof (dict_data -> m_mres [0]));

      dict_data -> m_mres_sizes = (int *) alloc_minus1_array (cnt);

      dict_data -> m_mres_count = cnt;

      for (i = 0; i < cnt; ++i){
	 dict_data -> m_mres [i] = dict_data_read_ (
	    dict_data -> m_data,
	    first [i + i], first [i + i + 1],
	    NULL, NULL);
      }

      *result        = dict_data -> m_mres;
      *result_sizes  = dict_data -> m_mres_sizes;
      *results_count = cnt;

      *ret = DICT_PLUGIN_RESULT_FOUND;

      return 0;
   }
}