/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /* * * lm_3g.c -- Darpa Trigram LM module. * * HISTORY * * 28-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lm3g_access_type() and necessary support. * * 15-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Bugfix: inclass_ugscore[lw3] changed to inclass_ugscore[w3] in * lm3g_tg_score(). (Thanks to dbansal@cs.) * * 15-Jul-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Corrected references to unigram_t.wid to unigram_t.mapid. * * 14-Apr-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lm3g_n_lm() and lm3g_index2name(). * * 03-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lm3g_raw_score() and lm_t.invlw. * Changed a number of function names from lm_... to lm3g_... * * 09-Jan-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * BUGFIX: Added check for lmp->unigrams[i].wid in lm_set_current(). * * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Changed function name lmname_to_lm() to lm_name2lm(). * * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Changed function name get_current_lm to lm_get_current. * Changed check for already existing word in lm_add_word, and added * condition to updating dictwid_map. * * 01-Jul-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Removed LM cache and replaced with find_bg and find_tg within the main * bigrams and trigram structures. No loss of speed and uses less memory. * * 24-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Fixed a number of memory leaks while deleting an LM. Added the global * dictwid_map, and allocated it once and never freed. Made sure lm_cache * is created only once. * * 14-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Modified lm_read to return 0 on success, and to delete any existing LM * with the new LM name (instead of reporting error and exiting). * Added backslash option in building filenames (for PC compatibility). * * $Log: lm_3g.c,v $ * Revision 1.11 2001/12/11 00:24:48 lenzo * Acknowledgement in License. * * Revision 1.10 2001/12/07 17:30:02 lenzo * Clean up and remove extra lines. * * Revision 1.9 2001/12/07 13:11:30 lenzo * Consolidate byte order code into byteorder.h. Note that there are still * two "senses" of byte swapping that are confusing, and all this should * be cleaned up and verified. * * Revision 1.8 2001/12/07 05:09:30 lenzo * License.xsxc * * Revision 1.7 2001/12/07 04:27:35 lenzo * License cleanup. Remove conditions on the names. Rationale: These * conditions don't belong in the license itself, but in other fora that * offer protection for recognizeable names such as "Carnegie Mellon * University" and "Sphinx." These changes also reduce interoperability * issues with other licenses such as the Mozilla Public License and the * GPL. This update changes the top-level license files and removes the * old license conditions from each of the files that contained it. * All files in this collection fall under the copyright of the top-level * LICENSE file. * * Revision 1.6 2001/10/23 22:20:30 lenzo * Change error logging and reporting to the E_* macros that call common * functions. This will obsolete logmsg.[ch] and they will be removed * or changed in future versions. * * Revision 1.5 2001/02/13 19:51:38 lenzo * *** empty log message *** * * Revision 1.4 2001/02/13 18:50:35 lenzo * Adding some more comments for a Solaris port. * * Revision 1.3 2000/12/12 23:01:42 lenzo * Rationalizing libs and names some more. Split a/d and fe libs out. * * Revision 1.2 2000/12/05 01:45:12 lenzo * Restructuring, hear rationalization, warning removal, ANSIfy * * Revision 1.1.1.1 2000/01/28 22:08:51 lenzo * Initial import of sphinx2 * * * Revision 8.9 94/10/11 12:36:28 rkm * Changed lm_tg_score to call lm_bg_score if no trigrams present or * the first word is invalid. * * Revision 8.8 94/07/29 11:54:23 rkm * Renamed lmSetParameters to lm_set_param and moved it into lm_add(). * Added functions lm_init_oov() to create initial list of OOVs, * and lm_add_word() to add new OOV at run time. * * Revision 8.7 94/05/19 14:19:59 rkm * Rewrote LM cache code for greater efficiency. * * Revision 8.6 94/05/10 10:47:58 rkm * Added lm_add() and lm_set_param() functions, for dynamically adding a new * in-memory LM to the set of available LMs. * * Revision 8.5 94/04/22 13:53:27 rkm * Added query_lm_cache_lines() to allow run-time spec of #cache lines. * * Revision 8.4 94/04/14 15:08:46 rkm * Added function lm_delete() to delete a named LM and reclaim space. * * Revision 8.3 94/04/14 14:40:27 rkm * Minor changes. * * Revision 8.1 94/02/15 15:09:22 rkm * Derived from v7. Includes multiple LMs for grammar switching. * * Revision 6.13 94/02/11 13:14:45 rkm * Added bigram and trigram multi-line caches, and functions, for v7. * Replaced sequential search in wstr2wid() with hash_lookup(). * * Revision 6.12 94/01/07 10:56:16 rkm * Corrected bug relating to input file format. * * Revision 6.11 93/12/17 13:14:52 rkm * *** empty log message *** * * Revision 6.10 93/12/03 17:09:59 rkm * Added ability to handle bigram-only dump files. * Added bigram -> MIN_PROB. * Added timestamp to dump files. * * Revision 6.9 93/12/01 12:29:55 rkm * Added ability to handle LM files containing only bigrams. * Excluded start_sym from interpolation of unigram prob with uniform prob. * * * 93/10/21 rkm@cs.cmu.edu * Added * * Revision 6.6 93/10/19 18:58:10 rkm * Added code to change bigram-prob(,) to very low value. The * Darpa LM file contains a spurious value to be ignored. * Fixed bug that dumps one trigram entry too many. * * Revision 6.5 93/10/15 15:00:14 rkm * * Revision 6.4 93/10/13 16:56:04 rkm * Added LM cache line stats. * Added bg_only option for lm_read parameter list * (but not yet implemented). * Changed proc name ilm_LOG_prob_of to lm3g_prob, to avoid conflict * with Roni's ILM function of the same name. * * Revision 6.3 93/10/09 17:01:55 rkm * M K Ravishankar (rkm@cs) at Carnegie Mellon * Cleaned up handling precompiled binary 3g LM file, * Added SWAP option for HP platforms. * * Revision 6.2 93/10/06 11:08:15 rkm * M K Ravishankar (rkm@cs) at Carnegie Mellon University * Darpa Trigram LM module. Created. */ #include #include #include #include #include #define QUIT(x) {fflush(stdout); fprintf x; exit(-1);} #include "s2types.h" #include "CM_macros.h" #include "basic_types.h" #include "assert.h" #include "strfuncs.h" #include "linklist.h" #include "list.h" #include "hash.h" #include "err.h" #include "lmclass.h" #include "lm_3g.h" #include "log.h" #include "search_const.h" #include "msd.h" #include "dict.h" #include "kb.h" #include "fbs.h" #include "byteorder.h" #ifdef NO_DICT #define NO_WORD -1 #else #include "c.h" #include "dict.h" static dictT *WordDict; #endif #define UG_MAPID(m,u) ((m)->unigrams[u].mapid) #define UG_PROB_F(m,u) ((m)->unigrams[u].prob1.f) #define UG_BO_WT_F(m,u) ((m)->unigrams[u].bo_wt1.f) #define UG_PROB_L(m,u) ((m)->unigrams[u].prob1.l) #define UG_BO_WT_L(m,u) ((m)->unigrams[u].bo_wt1.l) #define FIRST_BG(m,u) ((m)->unigrams[u].bigrams) #define LAST_BG(m,u) (FIRST_BG((m),(u)+1)-1) #define BG_WID(m,b) ((m)->bigrams[b].wid) #define BG_PROB_F(m,b) ((m)->prob2[(m)->bigrams[b].prob2].f) #define BG_BO_WT_F(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].f) #define BG_PROB_L(m,b) ((m)->prob2[(m)->bigrams[b].prob2].l) #define BG_BO_WT_L(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].l) #define TSEG_BASE(m,b) ((m)->tseg_base[(b)>>LOG_BG_SEG_SZ]) #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->bigrams[b].trigrams)) #define LAST_TG(m,b) (FIRST_TG((m),(b)+1)-1) #define TG_WID(m,t) ((m)->trigrams[t].wid) #define TG_PROB_F(m,t) ((m)->prob3[(m)->trigrams[t].prob3].f) #define TG_PROB_L(m,t) ((m)->prob3[(m)->trigrams[t].prob3].l) /* FIXME: put this in a header file */ extern void quit (int status, char const *fmt, ...); static double oov_ugprob = -5.0; /* Actually, logprob */ static char const *start_sym = ""; static char const *end_sym = ""; static char const *darpa_hdr = "Darpa Trigram LM"; /* FIXME: Why does lm3g2dmp.c have its own versions of these functions? */ static int32 lmname_to_id (char const * name); static int32 lm3g_load (char const *file, lm_t *model, char const *lmfile, int32 mtime); static int32 lm3g_dump (char const *file, lm_t *model, char const *lmfile, int32 mtime); static void lm_set_param (lm_t *model, double lw, double uw, double wip, int32 word_pair); /* Structure for maintaining multiple, named LMs */ static struct lmset_s { char *name; lm_t *lm; } *lmset; static int32 n_lm = 0; /* Total #LMs (actual) */ static int32 n_lm_alloc = 0; /* Total #LMs (for which space has been allocated) */ /* The currently active LM */ static lm_t *lmp; /* Words in LM; used only for building internal LM from LM file */ static char **word_str; static int32 lm_last_access_type; /* Hack!! See ACCESS definitions in .h */ #define MIN_PROB_F -99.0 #define MAX_SORTED_ENTRIES 65534 /* Base values for ranges of unigram_t.mapid */ #define LM_DICTWID_BASE 0 /* Do not change this */ #define LM_CLASSID_BASE 0x01000000 /* Interpreted as LMclass ID */ #define LM_DICTWID_BADMAP -16000 /* An illegal mapping */ #define LM_CLASSID_TO_CLASS(m,i) ((model)->lmclass[(i)-LM_CLASSID_BASE]) /* * Bigram probs and bo-wts, and trigram probs are kept in separate tables * rather than within the bigram_t and trigram_t structures. These tables * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h). * The following tree structure is used to construct these tables of unique * values. Whenever a new value is read from the LM file, the sorted tree * structure is searched to see if the value already exists, and inserted * if not found. */ typedef struct sorted_entry_s { log_t val; /* value being kept in this node */ uint16 lower; /* index of another entry. All descendants down this path have their val < this node's val. 0 => no son exists (0 is root index) */ uint16 higher; /* index of another entry. All descendants down this path have their val > this node's val 0 => no son exists (0 is root index) */ } sorted_entry_t; /* * The sorted list. list is a (64K long) array. The first entry is the * root of the tree and is created during initialization. */ typedef struct { sorted_entry_t *list; int32 free; /* first free element in list */ } sorted_list_t; /* Arrays of unique bigram probs and bo-wts, and trigram probs */ static sorted_list_t sorted_prob2; static sorted_list_t sorted_bo_wt2; static sorted_list_t sorted_prob3; /* * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed * to replace spurious values in the Darpa LM file. */ static void init_sorted_list (sorted_list_t *l) { l->list = (sorted_entry_t *) CM_calloc (MAX_SORTED_ENTRIES, sizeof (sorted_entry_t)); l->list[0].val.f = MIN_PROB_F; l->list[0].lower = 0; l->list[0].higher = 0; l->free = 1; } static void free_sorted_list (sorted_list_t *l) { free (l->list); } static log_t *vals_in_sorted_list (sorted_list_t *l) { log_t *vals; int32 i; vals = (log_t *) CM_calloc (l->free, sizeof (log_t)); for (i = 0; i < l->free; i++) vals[i].f = l->list[i].val.f; return (vals); } static int32 sorted_id (sorted_list_t *l, float *val) { int32 i = 0; for (;;) { if (*val == l->list[i].val.f) return (i); if (*val < l->list[i].val.f) { if (l->list[i].lower == 0) { if (l->free >= MAX_SORTED_ENTRIES) QUIT((stderr, "%s(%d): sorted list overflow\n", __FILE__, __LINE__)); l->list[i].lower = l->free; (l->free)++; i = l->list[i].lower; l->list[i].val.f = *val; return (i); } else i = l->list[i].lower; } else { if (l->list[i].higher == 0) { if (l->free >= MAX_SORTED_ENTRIES) QUIT((stderr, "%s(%d): sorted list overflow\n", __FILE__, __LINE__)); l->list[i].higher = l->free; (l->free)++; i = l->list[i].higher; l->list[i].val.f = *val; return (i); } else i = l->list[i].higher; } } } /* * allocate, initialize and return pointer to an array of unigram entries. */ static unigram_t *NewUnigramTable (int32 n_ug) { unigram_t *table; int32 i; table = (unigram_t *) CM_calloc (n_ug, sizeof (unigram_t)); for (i = 0; i < n_ug; i++) { table[i].mapid = NO_WORD; table[i].prob1.f = -99.0; table[i].bo_wt1.f = -99.0; } return table; } /* * returns a pointer to a new language model record. The size is passed in * as a parameter. */ lm_t * NewModel (n_ug, n_bg, n_tg, n_dict) int32 n_ug; int32 n_bg; int32 n_tg; int32 n_dict; { lm_t *model; model = (lm_t *) CM_calloc (1, sizeof (lm_t)); /* * Allocate one extra unigram and bigram entry: sentinels to terminate * followers (bigrams and trigrams, respectively) of previous entry. */ model->unigrams = NewUnigramTable (n_ug+1); model->bigrams = (bigram_t *) CM_calloc (n_bg+1, sizeof (bigram_t)); if (n_tg > 0) model->trigrams = (trigram_t *) CM_calloc (n_tg, sizeof (trigram_t)); /* Allocate space for mapping from dictionary word id to this LM unigram id */ model->dictwid_map = (int32 *) CM_calloc (n_dict, sizeof (int32)); if (n_tg > 0) { model->tseg_base = (int32 *) CM_calloc ((n_bg+1)/BG_SEG_SZ+1, sizeof (int32)); #if 0 E_INFO("%s(%d): %8d = tseg_base entries allocated\n", __FILE__, __LINE__, (n_bg+1)/BG_SEG_SZ+1); #endif } model->max_ucount = model->ucount = n_ug; model->bcount = n_bg; model->tcount = n_tg; model->dict_size = n_dict; model->HT.size = 0; model->HT.inuse = 0; model->HT.tab = NULL; return model; } #ifdef NO_DICT #define GET_WORD_IDX(w) wstr2wid (lmp, w) #else #define GET_WORD_IDX(w) dictStrToWordId (WordDict, w, FALSE) #endif static int32 wstr2wid (lm_t *model, char *w) { caddr_t val; if (hash_lookup (&(model->HT), w, &val) != 0) return NO_WORD; return ((int32) val); } /* * Read and return #unigrams, #bigrams, #trigrams as stated in input file. */ static void ReadNgramCounts (FILE *fp, int32 *n_ug, int32 *n_bg, int32 *n_tg) { char string[256]; int32 ngram, ngram_cnt; /* skip file until past the '\data\' marker */ do fgets (string, sizeof (string), fp); while ( (strcmp (string, "\\data\\\n") != 0) && (! feof (fp)) ); if (strcmp (string, "\\data\\\n") != 0) QUIT((stderr, "%s(%d): No \\data\\ mark in LM file\n", __FILE__, __LINE__)); *n_ug = *n_bg = *n_tg = 0; while (fgets (string, sizeof (string), fp) != NULL) { if (sscanf (string, "ngram %d=%d", &ngram, &ngram_cnt) != 2) break; switch (ngram) { case 1: *n_ug = ngram_cnt; break; case 2: *n_bg = ngram_cnt; break; case 3: *n_tg = ngram_cnt; break; default: QUIT((stderr, "%s(%d): Unknown ngram (%d)\n", __FILE__, __LINE__, ngram)); break; } } /* Position file to just after the unigrams header '\1-grams:\' */ while ( (strcmp (string, "\\1-grams:\n") != 0) && (! feof (fp)) ) fgets (string, sizeof (string), fp); /* Check counts; NOTE: #trigrams *CAN* be 0 */ if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0)) QUIT((stderr, "%s(%d): Bad or missing ngram count\n", __FILE__, __LINE__)); } /* * Read in the unigrams from given file into the LM structure model. On * entry to this procedure, the file pointer is positioned just after the * header line '\1-grams:'. */ static void ReadUnigrams (FILE *fp, lm_t *model) { char string[256]; char name[128]; int32 wcnt; float p1, bo_wt; E_INFO ("%s(%d): Reading unigrams\n", __FILE__, __LINE__); wcnt = 0; while ((fgets (string, sizeof(string), fp) != NULL) && (strcmp (string, "\\2-grams:\n") != 0)) { if (sscanf (string, "%f %s %f", &p1, name, &bo_wt) != 3) { if (string[0] != '\n') E_WARN ("%s(%d): Format error; unigram ignored:%s", __FILE__, __LINE__, string); continue; } if (wcnt >= model->ucount) QUIT((stderr, "%s(%d): Too many unigrams\n", __FILE__, __LINE__)); /* Associate name with word id */ word_str[wcnt] = (char *) salloc (name); hash_add (&(model->HT), word_str[wcnt], (caddr_t) wcnt); model->unigrams[wcnt].prob1.f = p1; model->unigrams[wcnt].bo_wt1.f = bo_wt; model->unigrams[wcnt].mapid = wcnt++; } if (model->ucount != wcnt) { E_WARN ("%s(%d): lm_t.ucount(%d) != #unigrams read(%d)\n", __FILE__, __LINE__, model->ucount, wcnt); model->ucount = wcnt; } } /* * Read bigrams from given file into given model structure. File may be arpabo * or arpabo-id format, depending on idfmt = 0 or 1. */ static void ReadBigrams (FILE *fp, lm_t *model, int32 idfmt) { char string[1024], word1[256], word2[256]; int32 w1, w2, prev_w1, bgcount, p; bigram_t *bgptr; float p2, bo_wt; int32 n_fld, n; E_INFO ("%s(%d): Reading bigrams\n", __FILE__, __LINE__); bgcount = 0; bgptr = model->bigrams; prev_w1 = -1; n_fld = (model->tcount > 0) ? 4 : 3; bo_wt = 0.0; while (fgets (string, sizeof(string), fp) != NULL) { if (! idfmt) n = sscanf (string, "%f %s %s %f", &p2, word1, word2, &bo_wt); else n = sscanf (string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt); if (n < n_fld) { if (string[0] != '\n') break; continue; } if (! idfmt) { if ((w1 = wstr2wid (model, word1)) == NO_WORD) QUIT((stderr, "%s(%d): Unknown word: %s\n", __FILE__, __LINE__, word1)); if ((w2 = wstr2wid (model, word2)) == NO_WORD) QUIT((stderr, "%s(%d): Unknown word: %s\n", __FILE__, __LINE__, word2)); } else { if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w1 < 0) || (w2 < 0)) QUIT((stderr, "%s(%d): Bad bigram: %s", __FILE__, __LINE__, string)); } /* HACK!! to quantize probs to 4 decimal digits */ p = p2*10000; p2 = p*0.0001; p = bo_wt*10000; bo_wt = p*0.0001; if (bgcount >= model->bcount) QUIT((stderr, "%s(%d): Too many bigrams\n", __FILE__, __LINE__)); bgptr->wid = w2; bgptr->prob2 = sorted_id (&sorted_prob2, &p2); if (model->tcount > 0) bgptr->bo_wt2 = sorted_id (&sorted_bo_wt2, &bo_wt); if (w1 != prev_w1) { if (w1 < prev_w1) QUIT((stderr, "%s(%d): Bigrams not in unigram order\n", __FILE__, __LINE__)); for (prev_w1++; prev_w1 <= w1; prev_w1++) model->unigrams[prev_w1].bigrams = bgcount; prev_w1 = w1; } bgcount++; bgptr++; if ((bgcount & 0x0000ffff) == 0) { E_INFO ("."); } } if ((strcmp (string, "\\end\\\n") != 0) && (strcmp (string, "\\3-grams:\n") != 0)) QUIT((stderr, "%s(%d): Bad bigram: %s\n", __FILE__, __LINE__, string)); for (prev_w1++; prev_w1 <= model->ucount; prev_w1++) model->unigrams[prev_w1].bigrams = bgcount; } /* * Very similar to ReadBigrams. */ static void ReadTrigrams (FILE *fp, lm_t *model, int32 idfmt) { char string[1024], word1[256], word2[256], word3[256]; int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg, p; int32 seg, prev_seg, prev_seg_lastbg; trigram_t *tgptr; bigram_t *bgptr; float p3; E_INFO ("%s(%d): Reading trigrams\n", __FILE__, __LINE__); tgcount = 0; tgptr = model->trigrams; prev_w1 = -1; prev_w2 = -1; prev_bg = -1; prev_seg = -1; while (fgets (string, sizeof(string), fp) != NULL) { if (! idfmt) n = sscanf (string, "%f %s %s %s", &p3, word1, word2, word3); else n = sscanf (string, "%f %d %d %d", &p3, &w1, &w2, &w3); if (n != 4) { if (string[0] != '\n') break; continue; } if (! idfmt) { if ((w1 = wstr2wid (model, word1)) == NO_WORD) QUIT((stderr, "%s(%d): Unknown word: %s\n", __FILE__, __LINE__, word1)); if ((w2 = wstr2wid (model, word2)) == NO_WORD) QUIT((stderr, "%s(%d): Unknown word: %s\n", __FILE__, __LINE__, word2)); if ((w3 = wstr2wid (model, word3)) == NO_WORD) QUIT((stderr, "%s(%d): Unknown word: %s\n", __FILE__, __LINE__, word3)); } else { if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w3 >= model->ucount) || (w1 < 0) || (w2 < 0) || (w3 < 0)) QUIT((stderr, "%s(%d): Bad trigram: %s", __FILE__, __LINE__, string)); } /* HACK!! to quantize probs to 4 decimal digits */ p = p3*10000; p3 = p*0.0001; if (tgcount >= model->tcount) QUIT((stderr, "%s(%d): Too many trigrams\n", __FILE__, __LINE__)); tgptr->wid = w3; tgptr->prob3 = sorted_id (&sorted_prob3, &p3); if ((w1 != prev_w1) || (w2 != prev_w2)) { /* Trigram for a new bigram; update tg info for all previous bigrams */ if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) QUIT((stderr, "%s(%d): Trigrams not in bigram order\n", __FILE__, __LINE__)); bg = (w1 != prev_w1) ? model->unigrams[w1].bigrams : prev_bg+1; endbg = model->unigrams[w1+1].bigrams; bgptr = model->bigrams + bg; for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++); if (bg >= endbg) QUIT((stderr, "%s(%d): Missing bigram for trigram: %s", __FILE__, __LINE__, string)); /* bg = bigram entry index for . Update tseg_base */ seg = bg >> LOG_BG_SEG_SZ; for (i = prev_seg+1; i <= seg; i++) model->tseg_base[i] = tgcount; /* Update trigrams pointers for all bigrams until bg */ if (prev_seg < seg) { int32 tgoff = 0; if (prev_seg >= 0) { tgoff = tgcount - model->tseg_base[prev_seg]; if (tgoff > 65535) QUIT((stderr, "%s(%d): Offset from tseg_base > 65535\n", __FILE__, __LINE__)); } prev_seg_lastbg = ((prev_seg+1) << LOG_BG_SEG_SZ) - 1; bgptr = model->bigrams + prev_bg; for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; prev_bg++, bgptr++) bgptr->trigrams = tgoff; for (; prev_bg <= bg; prev_bg++, bgptr++) bgptr->trigrams = 0; } else { int32 tgoff; tgoff = tgcount - model->tseg_base[prev_seg]; if (tgoff > 65535) QUIT((stderr, "%s(%d): Offset from tseg_base > 65535\n", __FILE__, __LINE__)); bgptr = model->bigrams + prev_bg; for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++) bgptr->trigrams = tgoff; } prev_w1 = w1; prev_w2 = w2; prev_bg = bg; prev_seg = seg; } tgcount++; tgptr++; if ((tgcount & 0x0000ffff) == 0) { E_INFO ("."); } } if (strcmp (string, "\\end\\\n") != 0) QUIT((stderr, "%s(%d): Bad trigram: %s\n", __FILE__, __LINE__, string)); for (prev_bg++; prev_bg <= model->bcount; prev_bg++) { if ((prev_bg & (BG_SEG_SZ-1)) == 0) model->tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount; if ((tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) QUIT((stderr, "%s(%d): Offset from tseg_base > 65535\n", __FILE__, __LINE__)); model->bigrams[prev_bg].trigrams = tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]; } } static FILE *lm_file_open (char const *filename, int32 usepipe) { char command[1024]; FILE *fp; if (usepipe) { #ifdef WIN32 sprintf (command, "D:\\compress\\gzip.exe -d -c %s", filename); if ((fp = _popen (command, "r")) == NULL) QUIT((stderr, "%s(%d): Cannot popen %s\n", __FILE__, __LINE__, command)); #else sprintf (command, "zcat %s", filename); if ((fp = popen (command, "r")) == NULL) QUIT((stderr, "%s(%d): Cannot popen %s\n", __FILE__, __LINE__, command)); #endif } else { fp = CM_fopen (filename, "r"); } return (fp); } static int32 lm_get_classid (lm_t *model, char *name) { int32 i; if (! model->lmclass) return -1; for (i = 0; i < model->n_lmclass; i++) { if (strcmp (lmclass_getname(model->lmclass[i]), name) == 0) return (i + LM_CLASSID_BASE); } return -1; } /* * Read in a trigram language model from the given file. The LM tokens can be word * classes. However a given actual word can belong to AT MOST ONE of the LM classes * used by this LM. */ int32 lm_read_clm (char const *filename, char const *lmname, double lw, double uw, double wip, lmclass_t *lmclass, int32 n_lmclass) { lm_t *model; FILE *fp = NULL; int32 usingPipe = FALSE; int32 n_unigram; int32 n_bigram; int32 n_trigram; int32 dict_size; int32 i, j, k, last_bg, last_tg; char *kbdumpdir, dumpfile[1024]; struct stat statbuf; int32 first_oov = 0, last_oov = -1; int32 max_new_oov = 0; int32 idfmt; int32 dictid, classid, notindict, maperr; lmclass_word_t lmclass_word; E_INFO ("%s(%d): Reading LM file %s (name \"%s\")\n", __FILE__, __LINE__, filename, lmname); /* Make sure no LM with same lmname already exists; if so, delete it */ if (lmname_to_id (lmname) >= 0) lm_delete (lmname); /* Check if a compressed file */ k = strlen(filename); #ifdef WIN32 usingPipe = (k > 3) && ((strcmp (filename+k-3, ".gz") == 0) || (strcmp (filename+k-3, ".GZ") == 0)); #else usingPipe = (k > 2) && ((strcmp (filename+k-2, ".Z") == 0) || (strcmp (filename+k-2, ".z") == 0)); #endif /* Check if an .arpabo-id format file; More HACK!! Hardwired check for -id */ if (usingPipe) k -= 2; idfmt = ((k > 3) && (strncmp (filename+k-3, "-id", 3) == 0)); fp = lm_file_open (filename, usingPipe); if (stat(filename, &statbuf) < 0) QUIT((stderr, "%s(%d): stat(%s) failed\n", __FILE__, __LINE__, filename)); #ifndef NO_DICT WordDict = kb_get_word_dict (); #endif /* Read #unigrams, #bigrams, #trigrams from file */ ReadNgramCounts (fp, &n_unigram, &n_bigram, &n_trigram); E_INFO ("%s(%d): ngrams 1=%d, 2=%d, 3=%d\n", __FILE__, __LINE__, n_unigram, n_bigram, n_trigram); /* Determine dictionary size (for dict-wid -> LM-wid map) */ #ifdef NO_DICT dict_size = n_unigram; #else dict_size = kb_get_num_words (); E_INFO("%s(%d): %d words in dictionary\n", __FILE__, __LINE__, dict_size); /* * If this is the "BASE" LM also count space for OOVs and words added at run time. * UGLY!! Assumes that OOVs will only be added to LM with no name. */ if (lmname[0] == '\0') { first_oov = dict_get_first_initial_oov (); last_oov = dict_get_last_initial_oov (); n_unigram += (last_oov - first_oov + 1); } /* Add space for words added in at run time */ max_new_oov = kb_get_max_new_oov (); n_unigram += max_new_oov; #endif if (dict_size >= 65535) QUIT((stderr, "%s(%d): #dict-words(%d) > 65534\n", __FILE__, __LINE__, dict_size)); /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ model = lmp = NewModel (n_unigram, n_bigram, n_trigram, dict_size); word_str = (char **) CM_calloc (n_unigram, sizeof (char *)); /* Create name for binary dump form of Darpa LM file */ { #ifdef WIN32 for (i = strlen(filename)-1; (i >= 0) && (filename[i] != '\\') && (filename[i] != '/'); --i); #else for (i = strlen(filename)-1; (i >= 0) && (filename[i] != '/'); --i); #endif i++; kbdumpdir = kb_get_dump_dir(); /* form dumpfilename */ if (kbdumpdir) sprintf (dumpfile, "%s/%s.DMP", kbdumpdir, filename+i); } /* Load the precompiled binary dump form of the Darpa LM file if it exists */ if ((! kbdumpdir) || (! lm3g_load (dumpfile, model, filename, (int32)statbuf.st_mtime))) { ReadUnigrams (fp, model); E_INFO("%8d = #unigrams created\n", model->ucount); init_sorted_list (&sorted_prob2); if (model->tcount > 0) init_sorted_list (&sorted_bo_wt2); ReadBigrams (fp, model, idfmt); model->bcount = FIRST_BG(model,model->ucount); model->n_prob2 = sorted_prob2.free; model->prob2 = vals_in_sorted_list (&sorted_prob2); free_sorted_list (&sorted_prob2); E_INFO("\n%8d = #bigrams created\n", model->bcount); E_INFO("%8d = #prob2 entries\n", model->n_prob2); if (model->tcount > 0) { /* Create trigram bo-wts array */ model->n_bo_wt2 = sorted_bo_wt2.free; model->bo_wt2 = vals_in_sorted_list (&sorted_bo_wt2); free_sorted_list (&sorted_bo_wt2); E_INFO("%8d = #bo_wt2 entries\n", model->n_bo_wt2); init_sorted_list (&sorted_prob3); ReadTrigrams (fp, model, idfmt); model->tcount = FIRST_TG(model,model->bcount); model->n_prob3 = sorted_prob3.free; model->prob3 = vals_in_sorted_list (&sorted_prob3); E_INFO("\n%8d = #trigrams created\n", model->tcount); E_INFO("%8d = #prob3 entries\n", model->n_prob3); free_sorted_list (&sorted_prob3); } /* HACK!! to avoid unnecessarily creating dump files for small LMs */ if (kbdumpdir && (model->bcount + model->tcount > 200000)) lm3g_dump (dumpfile, model, filename, (int32)statbuf.st_mtime); } if (usingPipe) { #ifdef WIN32 _pclose (fp); #else pclose (fp); #endif } else fclose (fp); /* * Make a local copy of the LM Classes used by this LM. The unigrams_t.mapid * field can refer to the index of the local copy array. */ if (n_lmclass > 0) { model->lmclass = (lmclass_t *) CM_calloc (n_lmclass, sizeof(lmclass_t)); for (i = 0; i < n_lmclass; i++) model->lmclass[i] = lmclass[i]; } else model->lmclass = NULL; model->n_lmclass = n_lmclass; model->inclass_ugscore = (int32 *) CM_calloc (dict_size, sizeof(int32)); /* * Create mapping from dictionary ID to unigram index. And also mapping for * LM wids (to dictionary or LMclass ids) in unigram array. */ for (i = 0; i < model->dict_size; i++) model->dictwid_map[i] = LM_DICTWID_BADMAP; notindict = 0; maperr = 0; for (i = 0; i < model->ucount; i++) { #ifndef NO_DICT model->unigrams[i].mapid = kb_get_word_id (word_str[i]); #else E_FATAL("Cannot run standalone, without a dictionary\n"); #endif classid = lm_get_classid (model, word_str[i]); if (model->unigrams[i].mapid >= 0) { /* unigram[i] is a dictionary word */ if (classid >= 0) { E_ERROR("'%s' is both a word and an LM class name\n", word_str[i]); maperr = 1; } else model->dictwid_map[model->unigrams[i].mapid] = i; } else { if (classid >= 0) { /* unigram[i] is an LM class */ model->unigrams[i].mapid = classid; /* Create entries in dictwid_map for each word in the class */ lmclass_word = lmclass_firstword (LM_CLASSID_TO_CLASS(model,classid)); while (lmclass_isword(lmclass_word)) { dictid = lmclass_getwid(lmclass_word); if (dictid >= 0) { if (model->dictwid_map[dictid] >= 0) { E_ERROR("Multiple mappings of '%s' in LM\n", lmclass_getword(lmclass_word)); maperr = 1; } else { model->dictwid_map[dictid] = i; model->inclass_ugscore[dictid] = lmclass_getprob(lmclass_word)*lw; } } else notindict++; lmclass_word = lmclass_nextword (LM_CLASSID_TO_CLASS(model,classid), lmclass_word); } } else notindict++; } } if (maperr) E_FATAL("Errors in LM; exiting\n"); if (notindict > 0) E_WARN ("%s(%d): %d LM words not in dict; ignored\n", __FILE__, __LINE__, notindict); /* * Discourage expansion of end_sym and transition to start_sym. (The given * Darpa LM may contain some spurious values that don't reflect these * requirements.) */ /* bo_wt() = MIN_PROB_F */ for (i = 0; (i < model->ucount) && (strcmp (word_str[i], end_sym) != 0); i++); E_INFO("%s(%d): bo_wt(%s) changed from %.4f to %.4f\n", __FILE__, __LINE__, word_str[i], model->unigrams[i].bo_wt1.f, MIN_PROB_F); model->unigrams[i].bo_wt1.f = MIN_PROB_F; /* unigram prob() = MIN_PROB_F */ for (i = 0; (i < model->ucount) && (strcmp (word_str[i], start_sym) != 0); i++); E_INFO("%s(%d): prob(%s) changed from %.4f to %.4f\n", __FILE__, __LINE__, word_str[i], model->unigrams[i].prob1.f, MIN_PROB_F); model->unigrams[i].prob1.f = MIN_PROB_F; /* bigram prob(,) = MIN_PROB_F (if bigram exists) */ j = FIRST_BG(model,i); last_bg = LAST_BG(model,i); for (; (j<=last_bg) && (strcmp(word_str[BG_WID(model,j)],start_sym)!=0); j++); if (j <= last_bg) { E_INFO("%s(%d): prob(%s,%s) changed from %.4f to %.4f\n", __FILE__, __LINE__, word_str[i], word_str[BG_WID(model,j)], model->prob2[model->bigrams[j].prob2].f, model->prob2[0].f); model->bigrams[j].prob2 = 0; if (model->tcount > 0) { /* trigram prob(,,) = MIN_PROB_F (if trigram exists) */ k = FIRST_TG(model,j); last_tg = LAST_TG(model,j); for (; k <= last_tg; k++) { if (strcmp (word_str[TG_WID(model,k)], start_sym) == 0) break; } if (k <= last_tg) { E_INFO("%s(%d): prob(%s,%s,%s) changed from %.4f to %.4f\n", __FILE__, __LINE__, word_str[i], word_str[BG_WID(model,j)], word_str[TG_WID(model,k)], model->prob3[model->trigrams[k].prob3].f, model->prob3[0].f); model->trigrams[k].prob3 = 0; } } } /* bigram prob(,) = MIN_PROB_F (if bigram exists) */ j = FIRST_BG(model,i); last_bg = LAST_BG(model,i); for (; (j<=last_bg) && (strcmp(word_str[BG_WID(model,j)],end_sym)!=0); j++); if (j <= last_bg) { E_INFO("%s(%d): prob(%s,%s) changed from %.4f to %.4f\n", __FILE__, __LINE__, word_str[i], word_str[BG_WID(model,j)], model->prob2[model->bigrams[j].prob2].f, model->prob2[0].f); model->bigrams[j].prob2 = 0; } lm_add (lmname, model, lw, uw, wip); hash_free (&(model->HT)); for (i = 0; i < model->ucount; i++) free (word_str[i]); free (word_str); return 0; } int32 lm_read (char const *filename, char const *lmname, double lw, double uw, double wip) { return lm_read_clm(filename, lmname, lw, uw, wip, NULL, 0); } void lm_init_oov ( void ) { int32 i, j, baseid; int32 first_oov = 0, last_oov = -1; lm_t *model; model = lm_name2lm (""); /* Add initial list of OOV words to LM unigrams */ first_oov = dict_get_first_initial_oov (); last_oov = dict_get_last_initial_oov (); E_INFO ("%s(%d): Adding %d initial OOV words to LM\n", __FILE__, __LINE__, last_oov-first_oov+1); oov_ugprob = kb_get_oov_ugprob (); for (i = first_oov; i <= last_oov; i++) { /* Add only base pronunciations */ if ((baseid = dictid_to_baseid (WordDict, i)) == i) { if ((j = lm_add_word (model, i)) >= 0) model->dictwid_map[i] = j; } } } /* * Add new word with given dictionary wid and unigram prob = oov_ugprob to * model->unigrams. * Return LM wid of inserted word if successful, otherwise -1. * (Currently some problems with adding alternative pronunciations...) */ int32 lm_add_word (lm_t *model, int32 dictwid) { /* Make sure new word not already in LM */ if (model->dictwid_map[dictwid] >= 0) { E_WARN("lm_add_word: Word '%s' already in LM, ignored\n", dictid_to_str (WordDict, dictwid)); return model->dictwid_map[dictwid]; } if (model->ucount >= model->max_ucount) { E_ERROR ("%s(%d): lm_add_word(%s) failed; LM full\n", __FILE__, __LINE__, dictid_to_str (WordDict, dictwid)); return -1; } /* Append new word to unigrams */ model->unigrams[model->ucount].mapid = dictwid; model->unigrams[model->ucount].prob1.l = LOG10TOLOG(oov_ugprob) * model->lw + model->log_wip; model->unigrams[model->ucount].bo_wt1.l = LOG10TOLOG(0.0) * model->lw; /* Advance the sentinel unigram */ model->unigrams[model->ucount+1].bigrams = model->unigrams[model->ucount].bigrams; /* Update dictwid_map for this LM */ model->dictwid_map[dictwid] = model->ucount; return (model->ucount++); } /* * Add named model to list of models. If another with same name exists, delete it first. */ void lm_add (char const *lmname, lm_t *model, double lw, double uw, double wip) { if (lmname_to_id (lmname) >= 0) lm_delete (lmname); model->tginfo = (tginfo_t **) CM_calloc (model->max_ucount, sizeof(tginfo_t *)); if (n_lm == n_lm_alloc) { lmset = (struct lmset_s *) CM_recalloc (lmset, n_lm+15, sizeof(struct lmset_s)); n_lm_alloc += 15; } lmset[n_lm].lm = model; lmset[n_lm].name = salloc (lmname); lm_set_param (model, lw, uw, wip, FALSE); n_lm++; E_INFO ("%s(%d): LM(\"%s\") added\n", __FILE__, __LINE__, lmname); } /* * Delete named LM from list of LMs and reclaim all space. */ int32 lm_delete (char const *name) { int32 i, u; lm_t *model; tginfo_t *tginfo, *next_tginfo; if ((i = lmname_to_id (name)) < 0) return (-1); model = lmset[i].lm; free (model->unigrams); free (model->bigrams); free (model->prob2); if (model->tcount > 0) { free (model->trigrams); free (model->tseg_base); free (model->bo_wt2); free (model->prob3); } if (model->HT.tab != NULL) hash_free (&model->HT); for (u = 0; u < model->max_ucount; u++) for (tginfo = model->tginfo[u]; tginfo; tginfo = next_tginfo) { next_tginfo = tginfo->next; listelem_free ((void *)tginfo, sizeof(tginfo_t)); } free (model->tginfo); if (model->lmclass) free (model->lmclass); free (model->inclass_ugscore); free (model->dictwid_map); free (model); free (lmset[i].name); for (; i < n_lm-1; i++) lmset[i] = lmset[i+1]; --n_lm; E_INFO ("%s(%d): LM(\"%s\") deleted\n", __FILE__, __LINE__, name); return (0); } /* * Set the active LM to the one identified by "name". Return 0 if successful, * -1 otherwise. */ int32 lm_set_current (char const *name) { int32 i; if ((i = lmname_to_id (name)) < 0) return (-1); lmp = lmset[i].lm; #ifdef USE_ILM ilm_set_lm (lmp); #endif return (0); } static int32 lmname_to_id (char const *name) { int32 i; for (i = 0; (i < n_lm) && (strcmp (lmset[i].name, name) != 0); i++); return ((i < n_lm) ? i : -1); } lm_t *lm_name2lm (char const *name) { int32 i; i = lmname_to_id (name); return ((i >= 0) ? lmset[i].lm : NULL); } char *get_current_lmname () { int32 i; for (i = 0; (i < n_lm) && (lmset[i].lm != lmp); i++); return ((i < n_lm) ? lmset[i].name : NULL); } lm_t *lm_get_current () { return (lmp); } int32 get_n_lm () { return (n_lm); } int32 lm3g_n_lm ( void ) { return n_lm; } char *lm3g_index2name (int k) { if ((k >= 0) && (k < n_lm)) return (lmset[k].name); else return NULL; } /* * dict base wid; check if present in LM. * return TRUE if present, FALSE otherwise. */ int32 dictwd_in_lm (wid) int32 wid; { return (lmp->dictwid_map[wid] >= 0); } static int32 fread_int32(FILE *fp, int32 min, int32 max, char const *name) { int32 k; if (fread (&k, sizeof (int32), 1, fp) != 1) QUIT((stderr, "%s(%d): fread(%s) failed\n", __FILE__, __LINE__, name)); SWAP_L(k); if ((min > k) || (max < k)) QUIT((stderr, "%s(%d): %s outside range [%d,%d]\n", __FILE__, __LINE__, name, min, max)); return (k); } static void fwrite_int32 (FILE *fp, int32 val) { SWAP_L(val); fwrite (&val, sizeof(int32), 1, fp); } static void fwrite_ug (FILE *fp, unigram_t *ug) { unigram_t tmp_ug = *ug; SWAP_L(tmp_ug.mapid); SWAP_L(tmp_ug.prob1.l); SWAP_L(tmp_ug.bo_wt1.l); SWAP_L(tmp_ug.bigrams); fwrite (&tmp_ug, sizeof(unigram_t), 1, fp); } static void fwrite_bg (FILE *fp, bigram_t *bg) { bigram_t tmp_bg = *bg; SWAP_W(tmp_bg.wid); SWAP_W(tmp_bg.prob2); SWAP_W(tmp_bg.bo_wt2); SWAP_W(tmp_bg.trigrams); fwrite (&tmp_bg, sizeof(bigram_t), 1, fp); } static void fwrite_tg (FILE *fp, trigram_t *tg) { trigram_t tmp_tg = *tg; SWAP_W(tmp_tg.wid); SWAP_W(tmp_tg.prob3); fwrite (&tmp_tg, sizeof(trigram_t), 1, fp); } /* * Load pre-compiled trigram LM file, if it exists, into model. If file * does not exist return 0. Otherwise, if successful, return 1. */ static int32 lm3g_load (char const *file, lm_t *model, char const *lmfile, int32 mtime) { int32 i, j, k, vn, ts, err; FILE *fp; char str[1024]; unigram_t *ugptr; bigram_t *bgptr; trigram_t *tgptr; char *tmp_word_str; err = 0; E_INFO ("%s(%d): Looking for precompiled LM dump file %s\n", __FILE__, __LINE__, file); if ((fp = fopen (file, "rb")) == NULL) { /* Not even really a warning. */ E_INFO("%s(%d): Precompiled file not found; continue with LM file\n", __FILE__, __LINE__); return (0); } k = fread_int32 (fp, strlen(darpa_hdr)+1, strlen(darpa_hdr)+1, "header size"); if (fread (str, sizeof (char), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): Cannot read header\n", __FILE__, __LINE__)); if (strncmp (str, darpa_hdr, k) != 0) QUIT((stderr, "%s(%d): Wrong header %s\n", __FILE__, __LINE__, darpa_hdr)); E_INFO("%s(%d): %s\n", __FILE__, __LINE__, str); k = fread_int32 (fp, 1, 1023, "LM filename size"); if (fread (str, sizeof (char), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): Cannot read LM filename in header\n", __FILE__, __LINE__)); #if 0 if (strncmp (str, lmfile, k) != 0) fprintf (stdout, "%s(%d): **WARNING** LM filename in header = %s\n", __FILE__, __LINE__, str); #endif /* read version#, if present (must be <= 0) */ vn = fread_int32 (fp, (int32)0x80000000, 0x7fffffff, "version#"); if (vn <= 0) { /* read and compare timestamps */ ts = fread_int32 (fp, (int32)0x80000000, 0x7fffffff, "timestamp"); if (ts < mtime) { E_WARN("%s(%d): **WARNING** LM file newer than dump file\n", __FILE__, __LINE__); } /* read and skip format description */ for (;;) { k = fread_int32 (fp, 0, 1023, "string length"); if (k == 0) break; if (fread (str, sizeof(char), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): fread(word) failed\n", __FILE__, __LINE__)); } /* read model->ucount */ model->ucount = fread_int32 (fp, 1, model->ucount, "LM.ucount"); } else { /* oldest dump file version has no version# or any of the above */ if (vn > model->ucount) QUIT((stderr, "%s(%d): LM.ucount(%d) out of range [1..%d]\n", __FILE__, __LINE__, vn, model->ucount)); model->ucount = vn; } /* read model->bcount, tcount */ model->bcount = fread_int32 (fp, 1, model->bcount, "LM.bcount"); model->tcount = fread_int32 (fp, 0, model->tcount, "LM.tcount"); E_INFO ("%s(%d): ngrams 1=%d, 2=%d, 3=%d\n", __FILE__, __LINE__, model->ucount, model->bcount, model->tcount); /* read unigrams */ if (fread (model->unigrams, sizeof(unigram_t), model->ucount+1, fp) != (size_t) model->ucount+1) QUIT((stderr, "%s(%d): fread(unigrams) failed\n", __FILE__, __LINE__)); for (i = 0, ugptr = model->unigrams; i <= model->ucount; i++, ugptr++) { SWAP_L(ugptr->mapid); SWAP_L(ugptr->prob1.l); SWAP_L(ugptr->bo_wt1.l); SWAP_L(ugptr->bigrams); } for (i = 0, ugptr = model->unigrams; i < model->ucount; i++, ugptr++) { if (ugptr->mapid != i) err = 1; ugptr->mapid = i; } if (err) E_WARN ("%s(%d): Corrected corrupted dump file created by buggy fbs8\n", __FILE__, __LINE__); E_INFO("%s(%d): %8d = LM.unigrams(+trailer) read\n", __FILE__, __LINE__, model->ucount); /* read bigrams */ if (fread (model->bigrams, sizeof(bigram_t), model->bcount+1, fp) != (size_t) model->bcount+1) QUIT((stderr, "%s(%d): fread(bigrams) failed\n", __FILE__, __LINE__)); for (i = 0, bgptr = model->bigrams; i <= model->bcount; i++, bgptr++) { SWAP_W(bgptr->wid); SWAP_W(bgptr->prob2); SWAP_W(bgptr->bo_wt2); SWAP_W(bgptr->trigrams); } E_INFO("%s(%d): %8d = LM.bigrams(+trailer) read\n", __FILE__, __LINE__, model->bcount); /* read trigrams */ if (model->tcount > 0) { if (fread(model->trigrams, sizeof(trigram_t), model->tcount, fp) != (size_t) model->tcount) QUIT((stderr, "%s(%d): fread(trigrams) failed\n", __FILE__, __LINE__)); for (i = 0, tgptr = model->trigrams; i < model->tcount; i++, tgptr++) { SWAP_W(tgptr->wid); SWAP_W(tgptr->prob3); } E_INFO("%s(%d): %8d = LM.trigrams read\n", __FILE__, __LINE__, model->tcount); } /* read n_prob2 and prob2 array */ model->n_prob2 = k = fread_int32 (fp, 1, 65535, "LM.n_prob2"); model->prob2 = (log_t *) CM_calloc (k, sizeof (log_t)); if (fread (model->prob2, sizeof (log_t), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): fread(prob2) failed\n", __FILE__, __LINE__)); for (i = 0; i < k; i++) SWAP_L(model->prob2[i].l); E_INFO("%s(%d): %8d = LM.prob2 entries read\n", __FILE__, __LINE__, k); /* read n_bo_wt2 and bo_wt2 array */ if (model->tcount > 0) { k = fread_int32 (fp, 1, 65535, "LM.n_bo_wt2"); model->n_bo_wt2 = k; model->bo_wt2 = (log_t *) CM_calloc (k, sizeof (log_t)); if (fread (model->bo_wt2, sizeof (log_t), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): fread(bo_wt2) failed\n", __FILE__, __LINE__)); for (i = 0; i < k; i++) SWAP_L(model->bo_wt2[i].l); E_INFO("%s(%d): %8d = LM.bo_wt2 entries read\n", __FILE__, __LINE__, k); } /* read n_prob3 and prob3 array */ if (model->tcount > 0) { k = fread_int32 (fp, 1, 65535, "LM.n_prob3"); model->n_prob3 = k; model->prob3 = (log_t *) CM_calloc (k, sizeof (log_t)); if (fread (model->prob3, sizeof (log_t), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): fread(prob3) failed\n", __FILE__, __LINE__)); for (i = 0; i < k; i++) SWAP_L(model->prob3[i].l); E_INFO("%s(%d): %8d = LM.prob3 entries read\n", __FILE__, __LINE__, k); } /* read tseg_base size and tseg_base */ if (model->tcount > 0) { k = (model->bcount+1)/BG_SEG_SZ + 1; k = fread_int32 (fp, k, k, "tseg_base size"); if (fread (model->tseg_base, sizeof(int32), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): fread(tseg_base) failed\n", __FILE__, __LINE__)); for (i = 0; i < k; i++) SWAP_L(model->tseg_base[i]); E_INFO("%s(%d): %8d = LM.tseg_base entries read\n", __FILE__, __LINE__, k); } /* read ascii word strings */ k = fread_int32 (fp, 1, 0x7fffffff, "words string-length"); tmp_word_str = (char *) CM_calloc (k, sizeof (char)); if (fread (tmp_word_str, sizeof(char), k, fp) != (size_t) k) QUIT((stderr, "%s(%d): fread(word-string) failed\n", __FILE__, __LINE__)); /* First make sure string just read contains ucount words (PARANOIA!!) */ for (i = 0, j = 0; i < k; i++) if (tmp_word_str[i] == '\0') j++; if (j != model->ucount) QUIT((stderr, "%s(%d): Error reading word strings\n", __FILE__, __LINE__)); /* Break up string just read into words */ j = 0; for (i = 0; i < model->ucount; i++) { word_str[i] = (char *) salloc (tmp_word_str+j); j += strlen(word_str[i]) + 1; } free (tmp_word_str); E_INFO("%s(%d): %8d = ascii word strings read\n", __FILE__, __LINE__, i); return (1); } static char const *fmtdesc[] = { "BEGIN FILE FORMAT DESCRIPTION", "Header string length (int32) and string (including trailing 0)", "Original LM filename string-length (int32) and filename (including trailing 0)", "(int32) version number (present iff value <= 0)", "(int32) original LM file modification timestamp (iff version# present)", "(int32) string-length and string (including trailing 0) (iff version# present)", "... previous entry continued any number of times (iff version# present)", "(int32) 0 (terminating sequence of strings) (iff version# present)", "(int32) lm_t.ucount (must be > 0)", "(int32) lm_t.bcount", "(int32) lm_t.tcount", "lm_t.ucount+1 unigrams (including sentinel)", "lm_t.bcount+1 bigrams (including sentinel)", "lm_t.tcount trigrams (present iff lm_t.tcount > 0)", "(int32) lm_t.n_prob2", "(int32) lm_t.prob2[]", "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)", "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)", "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)", "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)", "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)", "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)", "(int32) Sum(all word string-lengths, including trailing 0 for each)", "All word strings (including trailing 0 for each)", "END FILE FORMAT DESCRIPTION", NULL, }; /* * Dump internal LM to file. Format described above. * Remember to swap bytes if necessary. */ static int32 lm3g_dump (char const *file, lm_t *model, char const *lmfile, int32 mtime) { int32 i, k; FILE *fp; E_INFO ("%s(%d): Dumping LM to %s\n", __FILE__, __LINE__, file); if ((fp = fopen (file, "wb")) == NULL) { E_ERROR ("%s(%d): Cannot create file %s\n", __FILE__, __LINE__, file); return 0; } k = strlen(darpa_hdr)+1; fwrite_int32 (fp, k); fwrite (darpa_hdr, sizeof(char), k, fp); k = strlen(lmfile)+1; fwrite_int32 (fp, k); fwrite (lmfile, sizeof(char), k, fp); /* Write version# and LM file modification date */ fwrite_int32 (fp, -1); /* version # */ fwrite_int32 (fp, mtime); /* Write file format description into header */ for (i = 0; fmtdesc[i] != NULL; i++) { k = strlen(fmtdesc[i])+1; fwrite_int32 (fp, k); fwrite (fmtdesc[i], sizeof(char), k, fp); } fwrite_int32 (fp, 0); fwrite_int32 (fp, model->ucount); fwrite_int32 (fp, model->bcount); fwrite_int32 (fp, model->tcount); for (i = 0; i <= model->ucount; i++) fwrite_ug (fp, &(model->unigrams[i])); for (i = 0; i <= model->bcount; i++) fwrite_bg (fp, &(model->bigrams[i])); for (i = 0; i < model->tcount; i++) fwrite_tg (fp, &(model->trigrams[i])); fwrite_int32 (fp, model->n_prob2); for (i = 0; i < model->n_prob2; i++) fwrite_int32 (fp, model->prob2[i].l); if (model->tcount > 0) { fwrite_int32 (fp, model->n_bo_wt2); for (i = 0; i < model->n_bo_wt2; i++) fwrite_int32 (fp, model->bo_wt2[i].l); fwrite_int32 (fp, model->n_prob3); for (i = 0; i < model->n_prob3; i++) fwrite_int32 (fp, model->prob3[i].l); k = (model->bcount+1)/BG_SEG_SZ + 1; fwrite_int32 (fp, k); for (i = 0; i < k; i++) fwrite_int32 (fp, model->tseg_base[i]); } k = 0; for (i = 0; i < model->ucount; i++) k += strlen(word_str[i])+1; fwrite_int32 (fp, k); for (i = 0; i < model->ucount; i++) fwrite (word_str[i], sizeof(char), strlen(word_str[i])+1, fp); fclose (fp); return 0; } void lmSetStartSym (char const *sym) /*----------------------------* * Description - reconfigure the start symbol */ { start_sym = (char *) salloc(sym); } void lmSetEndSym (char const *sym) /*----------------------------* * Description - reconfigure the end symbol */ { end_sym = (char *) salloc(sym); } /* * Convert probs and backoff weights to LOG quantities, add language weight * and insertion penalty. */ static void lm_set_param (lm_t *model, double lw, double uw, double wip, int32 word_pair) { int32 i; int32 tmp1, tmp2; int32 logUW, logOneMinusUW, logUniform; int16 *at = Addition_Table; int32 ts = Table_Size; model->lw = lw; model->invlw = 1.0/lw; model->uw = uw; model->log_wip = LOG(wip); E_INFO ("%8.2f = Language Weight\n", model->lw); E_INFO ("%8.2f = Unigram Weight\n", model->uw); E_INFO ("%8d = LOG (Insertion Penalty (%.2f))\n", model->log_wip, wip); logUW = LOG(model->uw); logOneMinusUW = LOG(1.0 - model->uw); logUniform = LOG(1.0/(model->ucount-1)); /* -1 for ignoring */ if (word_pair) QUIT((stderr, "%s(%d): word-pair LM not implemented\n", __FILE__, __LINE__)); for (i = 0; i < model->ucount; i++) { model->unigrams[i].bo_wt1.l = (LOG10TOLOG(UG_BO_WT_F(model,i))) * model->lw; /* Interpolate LM unigram prob with uniform prob (except start_sym) */ if (strcmp (word_str[i], start_sym) == 0) { model->unigrams[i].prob1.l = (LOG10TOLOG(UG_PROB_F(model,i)))*model->lw + model->log_wip; } else { tmp1 = (LOG10TOLOG(UG_PROB_F(model,i))) + logUW; tmp2 = logUniform + logOneMinusUW; FAST_ADD (tmp1,tmp1,tmp2,at,ts); model->unigrams[i].prob1.l = (tmp1 * model->lw) + model->log_wip; } } for (i = 0; i < model->n_prob2; i++) { model->prob2[i].l = (LOG10TOLOG(model->prob2[i].f))*model->lw + model->log_wip; } if (model->tcount > 0) { for (i = 0; i < model->n_bo_wt2; i++) { model->bo_wt2[i].l = (LOG10TOLOG(model->bo_wt2[i].f))*model->lw; } } if (model->tcount > 0) { for (i = 0; i < model->n_prob3; i++) { model->prob3[i].l = (LOG10TOLOG(model->prob3[i].f))*model->lw + model->log_wip; } } } #ifdef NO_DICT main (argc, argv) int32 argc; char *argv[]; { lm_t *model; int32 i, n, score; float lw, uw, wip; char wd[3][100], line[1000]; int32 wid[3]; int32 unkwid; if (argc < 4) QUIT((stderr, "Usage: %s [LM-file]\n", argv[0])); if (sscanf(argv[1], "%f", &lw) != 1) QUIT((stderr, "Usage: %s [LM-file]\n", argv[0])); if (sscanf(argv[2], "%f", &uw) != 1) QUIT((stderr, "Usage: %s [LM-file]\n", argv[0])); if (sscanf(argv[3], "%f", &wip) != 1) QUIT((stderr, "Usage: %s [LM-file]\n", argv[0])); if (argc == 5) lm_read(argv[4], lw, uw, wip, 1.0); else lm_read("/net/alf8/cdrom/wsj1/grammar/tgboc20o.nvp", lw, uw, wip, 1.0); lmSetParameters (model, 0); unkwid = wstr2wid (model, ""); for (;;) { printf ("Enter 1, 2, or 3 words: "); fgets (line, sizeof(line), stdin); if (((n = sscanf (line, "%s %s %s", wd[0], wd[1], wd[2])) < 1) || (n > 3)) break; for (i = 0; i < n; i++) if ((wid[i] = wstr2wid (model, wd[i])) == NO_WORD) { printf (" %s -> \n", wd[i]); wid[i] = unkwid; } switch (n) { case 1: score = unigram_score (wid[0]); printf ("unigram_score(%d) = %d\n", wid[0], score); break; case 2: score = bigram_score (wid[0], wid[1]); printf ("bigram_score(%d, %d) = %d\n", wid[0], wid[1], score); break; case 3: score = trigram_score (wid[0], wid[1], wid[2]); printf ("trigram_score(%d, %d, %d) = %d\n", wid[0], wid[1], wid[2], score); break; default: break; } } } #endif #define BINARY_SEARCH_THRESH 16 int32 lm3g_ug_score (int32 wid) { int32 lwid; if ((lwid = lmp->dictwid_map[wid]) < 0) quit(-1, "%s(%d): dictwid[%d] not in LM\n", __FILE__, __LINE__, wid); lm_last_access_type = LM3G_ACCESS_UG; return (lmp->unigrams[lwid].prob1.l + lmp->inclass_ugscore[wid]); } /* Locate a specific bigram within a bigram list */ static int32 find_bg (bigram_t *bg, int32 n, int32 w) { int32 i, b, e; /* Binary search until segment size < threshold */ b = 0; e = n; while (e-b > BINARY_SEARCH_THRESH) { i = (b+e)>>1; if (bg[i].wid < w) b = i+1; else if (bg[i].wid > w) e = i; else return i; } /* Linear search within narrowed segment */ for (i = b; (i < e) && (bg[i].wid != w); i++); return ((i < e) ? i : -1); } /* w1, w2 are dictionary (base-)word ids */ int32 lm3g_bg_score (int32 w1, int32 w2) { int32 lw1, lw2, i, n, b, score; lm_t *lm; bigram_t *bg; lm = lmp; /* lm->n_bg_score++; */ if ((lw1 = lm->dictwid_map[w1]) < 0) quit(-1, "%s(%d): dictwid[%d] not in LM\n", __FILE__, __LINE__, w1); if ((lw2 = lm->dictwid_map[w2]) < 0) quit(-1, "%s(%d): dictwid[%d] not in LM\n", __FILE__, __LINE__, w2); b = FIRST_BG(lm, lw1); n = FIRST_BG(lm, lw1+1) - b; bg = lm->bigrams + b; if ((i = find_bg (bg, n, lw2)) >= 0) { score = lm->prob2[bg[i].prob2].l; lm_last_access_type = LM3G_ACCESS_BG; } else { /* lm->n_bg_bo++; */ score = lm->unigrams[lw1].bo_wt1.l + lm->unigrams[lw2].prob1.l; lm_last_access_type = LM3G_ACCESS_UG; } score += lm->inclass_ugscore[w2]; #if 0 printf (" %5d %5d -> %8d (%16s %16s)\n", w1, w2, score, WordDict->dict_list[UG_MAPID(lm,lw1)]->word, WordDict->dict_list[UG_MAPID(lm,lw2)]->word); #endif return (score); } static void load_tginfo (lm_t *lm, int32 lw1, int32 lw2) { int32 i, n, b, t; bigram_t *bg; tginfo_t *tginfo; /* First allocate space for tg information for bg lw1,lw2 */ tginfo = (tginfo_t *) listelem_alloc (sizeof(tginfo_t)); tginfo->w1 = lw1; tginfo->tg = NULL; tginfo->next = lm->tginfo[lw2]; lm->tginfo[lw2] = tginfo; /* Locate bigram lw1,lw2 */ b = lm->unigrams[lw1].bigrams; n = lm->unigrams[lw1+1].bigrams - b; bg = lm->bigrams + b; if ((n > 0) && ((i = find_bg (bg, n, lw2)) >= 0)) { tginfo->bowt = lm->bo_wt2[bg[i].bo_wt2].l; /* Find t = Absolute first trigram index for bigram lw1,lw2 */ b += i; /* b = Absolute index of bigram lw1,lw2 on disk */ t = FIRST_TG(lm, b); tginfo->tg = lm->trigrams + t; /* Find #tg for bigram w1,w2 */ tginfo->n_tg = FIRST_TG(lm, b+1) - t; } else { /* No bigram w1,w2 */ tginfo->bowt = 0; tginfo->n_tg = 0; } } /* Similar to find_bg */ static int32 find_tg (trigram_t *tg, int32 n, int32 w) { int32 i, b, e; b = 0; e = n; while (e-b > BINARY_SEARCH_THRESH) { i = (b+e)>>1; if (tg[i].wid < w) b = i+1; else if (tg[i].wid > w) e = i; else return i; } for (i = b; (i < e) && (tg[i].wid != w); i++); return ((i < e) ? i : -1); } /* w1, w2, w3 are dictionary wids */ int32 lm3g_tg_score (int32 w1, int32 w2, int32 w3) { int32 lw1, lw2, lw3, i, n, score; lm_t *lm; trigram_t *tg; tginfo_t *tginfo, *prev_tginfo; lm = lmp; if ((lm->tcount <= 0) || (w1 < 0)) return (lm3g_bg_score (w2, w3)); /* lm->n_tg_score++; */ if ((lw1 = lm->dictwid_map[w1]) < 0) quit(-1, "%s(%d): dictwid[%d] not in LM\n", __FILE__, __LINE__, w1); if ((lw2 = lm->dictwid_map[w2]) < 0) quit(-1, "%s(%d): dictwid[%d] not in LM\n", __FILE__, __LINE__, w2); if ((lw3 = lm->dictwid_map[w3]) < 0) quit(-1, "%s(%d): dictwid[%d] not in LM\n", __FILE__, __LINE__, w3); prev_tginfo = NULL; for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) { if (tginfo->w1 == lw1) break; prev_tginfo = tginfo; } if (! tginfo) { load_tginfo (lm, lw1, lw2); tginfo = lm->tginfo[lw2]; } else if (prev_tginfo) { prev_tginfo->next = tginfo->next; tginfo->next = lm->tginfo[lw2]; lm->tginfo[lw2] = tginfo; } tginfo->used = 1; /* Trigrams for w1,w2 now pointed to by tginfo */ n = tginfo->n_tg; tg = tginfo->tg; if ((i = find_tg (tg, n, lw3)) >= 0) { score = lm->prob3[tg[i].prob3].l + lm->inclass_ugscore[w3]; lm_last_access_type = LM3G_ACCESS_TG; } else { /* lm->n_tg_bo++; */ score = tginfo->bowt + lm3g_bg_score(w2, w3); } #if 0 printf ("%5d %5d %5d -> %8d (%16s %16s %16s)\n", w1, w2, w3, score, WordDict->dict_list[UG_MAPID(lm,lw1)]->word, WordDict->dict_list[UG_MAPID(lm,lw2)]->word, WordDict->dict_list[UG_MAPID(lm,lw3)]->word); #endif return (score); } void lm3g_cache_reset ( void ) { int32 i; lm_t *lm; tginfo_t *tginfo, *next_tginfo, *prev_tginfo; lm = lmp; for (i = 0; i < lm->ucount; i++) { prev_tginfo = NULL; for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) { next_tginfo = tginfo->next; if (! tginfo->used) { /* lm->n_tg_inmem -= tginfo->n_tg; */ listelem_free ((void *) tginfo, sizeof(tginfo_t)); if (prev_tginfo) prev_tginfo->next = next_tginfo; else lm->tginfo[i] = next_tginfo; /* n_tgfree++; */ } else { tginfo->used = 0; prev_tginfo = tginfo; } } } } void lm3g_cache_stats_dump (FILE *file) { /* FIXME: does nothing! */ } void lm_next_frame ( void ) { } int32 lm3g_raw_score (int32 score) { score -= lmp->log_wip; score *= lmp->invlw; return score; } int32 lm3g_access_type ( void ) { return lm_last_access_type; }