/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /* * cache_lm.c -- Dynamic cache language model based on Roni Rosenfeld's work. * * HISTORY * * 01-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Started, based on earlier FBS6 version. */ #include #include #include #include #include #include "s2types.h" #include "CM_macros.h" #include "basic_types.h" #include "search_const.h" #include "msd.h" #include "log.h" #include "cache_lm.h" #include "linklist.h" #include "list.h" #include "hash.h" #include "dict.h" #include "lmclass.h" #include "lm_3g.h" #include "kb.h" #include "err.h" static int32 log0; static int32 *log_count_tbl = NULL; #define LOG_COUNT_TBLSIZE 4096 #define LOG_COUNT(x) (((x) >= LOG_COUNT_TBLSIZE) ? LOG((double)(x)) : log_count_tbl[(x)]) /* * Cache LM initialize. */ cache_lm_t *cache_lm_init (double ug_thresh, double min_uw, double max_uw, int32 nwd_uwrange, double bw) { cache_lm_t *lm; int32 i; lm = (cache_lm_t *) CM_calloc (1, sizeof (cache_lm_t)); lm->ugprob_thresh = LOG(ug_thresh); lm->min_uw = min_uw; lm->max_uw = max_uw; lm->per_word_uw = (max_uw - min_uw) / (double)nwd_uwrange; lm->uw_ugcount_limit = nwd_uwrange; lm->uw = min_uw; lm->log_uw = LOG(min_uw); lm->bw = bw; lm->log_bw = LOG(bw); lm->log_remwt = LOG(1.0 - min_uw - bw); lm->n_word = dict_maxsize (); lm->clm_ug = (clm_ug_t *) CM_calloc (lm->n_word, sizeof(clm_ug_t)); lm->sum_ugcount = 0; log0 = LOG(0.0); if (! log_count_tbl) { log_count_tbl = (int32 *) CM_calloc (LOG_COUNT_TBLSIZE, sizeof(int32)); for (i = 0; i < LOG_COUNT_TBLSIZE; i++) log_count_tbl[i] = LOG((double) i); } return lm; } void cache_lm_reset (cache_lm_t *lm) { clm_bg_t *bg, *nextbg; int32 i; for (i = 0; i < lm->n_word; i++) { for (bg = lm->clm_ug[i].w2list; bg; bg = nextbg) { nextbg = bg->next; listelem_free (bg, sizeof(clm_bg_t)); } lm->clm_ug[i].w2list = NULL; lm->clm_ug[i].count = 0; lm->clm_ug[i].sum_w2count = 0; } lm->sum_ugcount = 0; lm->log_uw = LOG(lm->min_uw); lm->log_remwt = LOG(1.0 - lm->min_uw - lm->bw); } void cache_lm_add_ug (cache_lm_t *lm, int32 w) { lm->clm_ug[w].count++; lm->sum_ugcount++; if (lm->sum_ugcount > lm->uw_ugcount_limit) return; /* Still within linear uw region; update unigram cache weight */ lm->uw += lm->per_word_uw; lm->log_uw = LOG(lm->uw); lm->log_remwt = LOG(1.0 - lm->uw - lm->bw); } void cache_lm_add_bg (cache_lm_t *lm, int32 w1, int32 w2) { clm_bg_t *bg; /* Find cache LM bigram entry for w1,w2 */ for (bg = lm->clm_ug[w1].w2list; bg && (bg->w2 != w2); bg = bg->next); if (! bg) { /* First encounter of w1,w2 */ bg = (clm_bg_t *) listelem_alloc (sizeof(clm_bg_t)); bg->w2 = w2; bg->count = 1; bg->next = lm->clm_ug[w1].w2list; lm->clm_ug[w1].w2list = bg; } else bg->count++; lm->clm_ug[w1].sum_w2count++; } /* NOTE: Some approximations in the way the relative language weights are applied */ int32 cache_lm_score (cache_lm_t *lm, int32 w1, int32 w2, int32 *remwt) { int32 bgscr, ugscr, clmscr; clm_bg_t *bg; /* Unigram cache component */ if (lm->clm_ug[w2].count > 0) ugscr = LOG_COUNT(lm->clm_ug[w2].count) - LOG_COUNT(lm->sum_ugcount); else ugscr = log0; ugscr += lm->log_uw; /* Bigram cache component */ for (bg = lm->clm_ug[w1].w2list; bg && (bg->w2 != w2); bg = bg->next); if (bg) bgscr = LOG_COUNT(bg->count) - LOG_COUNT(lm->clm_ug[w1].sum_w2count); else bgscr = log0; bgscr += lm->log_bw; /* Combine unigram and bigram cache component scores */ if ((ugscr > log0) || (bgscr > log0)) { FAST_ADD (clmscr, ugscr, bgscr, Addition_Table, Table_Size); } else clmscr = log0; *remwt = lm->log_remwt; return (clmscr); } void cache_lm_dump (cache_lm_t *lm, char *file) { FILE *fp; int32 i; clm_bg_t *bg; if ((fp = fopen(file, "w")) == NULL) { E_ERROR("fopen(%s,w) failed\n", file); return; } fprintf (fp, "#CacheLMDump\n"); fprintf (fp, "#Unigrams\n"); for (i = 0; i < lm->n_word; i++) { if (lm->clm_ug[i].count > 0) fprintf (fp, "%d %s\n", lm->clm_ug[i].count, kb_get_word_str (i)); } fprintf (fp, "#Bigrams\n"); for (i = 0; i < lm->n_word; i++) { for (bg = lm->clm_ug[i].w2list; bg; bg = bg->next) { fprintf (fp, "%d %s %s\n", bg->count, kb_get_word_str (i), kb_get_word_str (bg->w2)); } } fprintf (fp, "#End\n"); fclose (fp); } void cache_lm_load (cache_lm_t *lm, char *file) { FILE *fp; int32 i, n, w, w2; char line[16384], wd[4096], wd2[4096]; if ((fp = fopen(file, "r")) == NULL) { E_ERROR("fopen(%s,r) failed\n", file); return; } if (fgets (line, sizeof(line), fp) == NULL) { E_ERROR("%s: No header\n", file); fclose (fp); return; } if (strcmp (line, "#CacheLMDump\n") != 0) { E_ERROR("%s: Bad header line: %s\n", file, line); fclose (fp); return; } if ((fgets (line, sizeof(line), fp) == NULL) || (strcmp (line, "#Unigrams\n") != 0)) { E_ERROR("%s: Missing #Unigrams keyword\n", file); fclose (fp); return; } while (fgets (line, sizeof(line), fp) != NULL) { if (sscanf (line, "%d %s", &n, wd) != 2) break; w = kb_get_word_id (wd); if ((w < 0) || (w >= lm->n_word)) { E_ERROR("%s: Unknown word(%s); ignored\n", file, wd); continue; } for (i = 0; i < n; i++) cache_lm_add_ug (lm, w); } if (strcmp (line, "#Bigrams\n") != 0) { E_ERROR("%s: Missing #Bigrams keyword: %s\n", file, line); fclose (fp); return; } while (fgets (line, sizeof(line), fp) != NULL) { if (sscanf (line, "%d %s %s", &n, wd, wd2) != 3) break; w = kb_get_word_id (wd); w2 = kb_get_word_id (wd2); if ((w < 0) || (w >= lm->n_word)) { E_ERROR("%s: Unknown word(%s); ignored\n", file, wd); continue; } if ((w2 < 0) || (w2 >= lm->n_word)) { E_ERROR("%s: Unknown word(%s); ignored\n", file, wd2); continue; } for (i = 0; i < n; i++) cache_lm_add_bg (lm, w, w2); } if (strcmp (line, "#End\n") != 0) E_ERROR("%s: Missing #End keyword: %s\n", file, line); fclose (fp); }