/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /* * search.c -- HMM-tree version * * HISTORY * * 30-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Generalized the implementation of pscr based allphone. * Added phone_conf option at the end of FWDTREE search to produce pscr-based * rescoring of fwdtree result. * * 24-Mar-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added phone perplexity measure into search_hyp_t structure hypothesis * for each utterance. * * 08-Mar-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added lattice density measure into search_hyp_t structure generated * as hypothesis for each utterance. * * 04-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added search_remove_context() call in search_postprocess_bptable. * * 03-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Changed lm_cache_reset to lm3g_cache_reset, and lm_cache_stats_dump * to lm3g_cache_stats_dump. * * 08-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Changed search_hyp_t hyp[] result to contain actual frame ids. * instead of post-silence-compression ids. * Added functions search_bptbl_wordlist() and search_bptbl_pred(). * * 12-Jul-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Commented out lm_cache_reset in search_fwdflat_start(). * * 19-Jun-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Changed strings phone_active to npa (phone_active is too generic). * * 19-Jun-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added bestpscr[] and modified compute_phone_active() to use best phone * scores (bestpscr) returned by SCVQ. * * 15-Jun-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Modified to always rebuild search tree when search_set_current_lm called. * * 22-May-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Changed search_result and search_partial_result interfaces to simplify * network client interfaces for these two. * * 09-Dec-94 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added flat forward pass after tree forward pass. * * Revision 8.10 94/10/11 12:39:52 rkm * Print back trace conditionally, depending on -backtrace argument. * * Revision 8.9 94/07/29 12:01:40 rkm * Added code, in building search tree, to take care of growth in * dictionary size owing to dynamic addition of OOVs. * * Revision 8.8 94/05/26 16:49:41 rkm * Rewrote lattice rescoring non-recursively to reduce LM thrashing, * and moved that code to a separate file. Moved some data structure * declaration (BPTBL_T) to search.h. * * Revision 8.7 94/05/19 14:21:36 rkm * Reordered LM accesses in last_phone_transition and word_transition for * greater efficiency. * * Revision 8.6 94/05/10 10:48:43 rkm * Changed various list memory management routines to use the generic * listelem_alloc() and listelem_free() functions. * Added last_ltrans array for caching the best LM score info during * last_phone_transition(). * * Revision 8.5 94/04/22 13:56:30 rkm * Added search_hyp_t for collecting output hypothesis-related info in * one place. Directed output of both passes to this structure, so the * match file will contain the final result. * * Revision 8.4 94/04/14 14:43:52 rkm * Added optional second pass for lattice-rescoring. * Added option to dump forward pass bptable for postprocessing. * Added option to skip inter-channel transitions in alternate frames. * Fixed bug in init_search_tree in allocating first_phone_rchan_map. * * Revision 8.1 94/02/15 15:13:07 rkm * Derived from v7. Includes multiple start symbols for the LISTEN * project. Includes multiple LMs for grammar switching. * * Revision 1.14 94/02/11 13:12:48 rkm * Added multiple start words for the LISTEN project. * Corrected minor error in statistics gathering. * * Revision 1.13 94/02/03 18:38:12 rkm * Fixed debugging and tracing code. * * Revision 1.12 94/02/01 10:46:54 rkm * Mark active senones only if topN=4; otherwise SCVQ computes all senone scores. * * Revision 1.11 94/02/01 10:23:02 rkm * Lookup trigram LM values through trigram LM cache instead of directly. * * Revision 1.10 94/01/31 14:27:21 rkm * Added code to mark the active senones in each frame. * * Revision 1.9 94/01/25 12:36:45 rkm * Look up LM values through bigram cache instead of directly. * * Revision 1.8 94/01/24 10:01:38 rkm * Include LM score when entering last phone of any word, rather than on * exiting word. Special case for handling single-phone words. * * Revision 1.7 94/01/21 15:22:10 rkm * Minor changes. * * Revision 1.6 94/01/21 15:06:45 rkm * Bug fix in word_transition in compacting BPTable. * * Revision 1.5 94/01/21 13:47:48 rkm * Bug fix in alloc_all_rc(). * * Revision 1.4 94/01/19 11:25:48 rkm * Before rescoring last phone with LM scores. * */ /* * NOTE: this module assumes that the dictionary is organized as follows: * Main, real dictionary words * * ... (possibly more than one of these) * * noise-words... * In particular, note that comes before since occurs in the LM, but * not (well, there's no transition to in the LM). */ #include #include #include #include #include "s2types.h" #include "CM_macros.h" #include "basic_types.h" #include "linklist.h" #include "list.h" #include "hash.h" #include "search_const.h" #include "err.h" #include "dict.h" #include "msd.h" #include "lm.h" #include "lmclass.h" #include "lm_3g.h" #include "phone.h" #include "kb.h" #include "log.h" #include "c.h" #include "assert.h" #include "scvq.h" #include "fbs.h" #include "search.h" #include "hmm_tied_r.h" #ifdef USE_ILM #define lm_bg_score ilm_bg_score #define lm_tg_score ilm_tg_score #endif #define ISA_FILLER_WORD(x) ((x) >= SilenceWordId) #define ISA_REAL_WORD(x) ((x) < FinishWordId) #define SEARCH_PROFILE 1 #define SEARCH_TRACE_CHAN 0 #define SEARCH_TRACE_CHAN_DETAILED 0 #define SEARCH_SELFTEST_DETAILED 0 /* * Search structure of HMM instances (channels; see CHAN_T and ROOT_CHAN_T definitions): * The word triphone sequences (HMM instances) are transformed into tree structures, * one tree per unique left triphone in the entire dictionary (actually diphone, since * its left context varies dyamically during the search process). The entire set of * trees of channels is allocated once and for all during initialization (since * dynamic management of active CHANs is time consuming), with one exception: the * last phones of words, that need multiple right context modelling, are not maintained * in this static structure since there are too many of them and few are active at any * time. Instead they are maintained as linked lists of CHANs, one list per word, * and each CHAN in this set is allocated only on demand and freed if inactive. */ static ROOT_CHAN_T *root_chan; /* one per unique root channel */ static int32 n_root_chan_alloc; /* total number of root channels allocated */ static int32 n_root_chan; /* number of root channels valid for a given utt; depends on words in the LM for that utt */ static int32 n_nonroot_chan; /* #non-root channels in search tree */ /* MAX #non-root channels in search tree for allocating active_chan_list[]... */ static int32 max_nonroot_chan = 0; static int32 n_phone_eval; static int32 n_root_chan_eval; static int32 n_nonroot_chan_eval; static int32 n_last_chan_eval; static int32 n_word_lastchan_eval; static int32 n_lastphn_cand_utt; static int32 n_phn_in_topsen; /* * word_chan[w] = separate linked list of channels for each word w, normally used only * to model the last phone of w, with multiple channels representing different right * context phones. */ static CHAN_T **word_chan; /* word_active[w] = 1 if word w active in current frame, 0 otherwise */ static int32 *word_active; /* * Each node in the HMM tree structure may point to a set of words whose last phone * would follow that node in the tree structure (but is not included in the tree * structure for reasons explained above). The channel node points to one word in this * set of words. The remaining words are linked through homophone_set[]. * * Single-phone words are not represented in the HMM tree; they are kept in word_chan. * * Specifically, homophone_set[w] = wid of next word in the same set as w. */ static WORD_ID *homophone_set; /* * In any frame, only some HMM tree nodes are active. active_chan_list[f mod 2] = * list of nonroot channels in the HMM tree active in frame f. * Similarly, active_word_list[f mod 2] = list of word ids for which active channels * exist in word_chan in frame f. */ static CHAN_T **active_chan_list[2] = {NULL, NULL}; static int32 n_active_chan[2]; /* #entries in active_chan_list */ static WORD_ID *active_word_list[2]; static int32 n_active_word[2]; /* #entries in active_word_list */ static int32 NumWords; /* Total #words in dictionary */ static int32 NumMainDictWords; /* #words in main dictionary, excluding fillers (i.e., , , , and noise words). These come first in WordDict. */ static int32 NumHmmModels; static int32 NumCiPhones; static int32 TotalDists; static SMD *Models; static char **PhoneList; static dictT *WordDict; static LM LangModel = NULL; static int32 UsingDarpaLM; static int32 NoLangModel; static int32 AllWordTProb; /* static int32 AllWordMode; */ static int32 StartWordId; static int32 FinishWordId; static int32 SilenceWordId; static int32 SilencePhoneId; static int32 **LeftContextFwd; static int32 **RightContextFwd; static int32 **RightContextFwdPerm; static int32 *RightContextFwdSize; static int32 **LeftContextBwd; static int32 **LeftContextBwdPerm; static int32 *LeftContextBwdSize; static int32 **RightContextBwd; static int32 *distScores; /* SC scores for current frame being searched */ static int32 **sc_scores; /* SC scores for several frames in advance */ static int32 BestScore; /* Best among all phones */ static int32 LastPhoneBestScore; /* Best among last phones only */ static int32 LogBeamWidth; static int32 NewPhoneLogBeamWidth; static int32 NewWordLogBeamWidth; static int32 LastPhoneAloneLogBeamWidth; static int32 LastPhoneLogBeamWidth; static int32 FwdflatLogBeamWidth; static int32 FwdflatLogWordBeamWidth; static int32 FillerWordPenalty = 0; static int32 SilenceWordPenalty = 0; static int32 LogInsertionPenalty = 0; static int32 logPhoneInsertionPenalty = 0; static double fwdtree_lw = 6.5; static double fwdflat_lw = 8.5; static double bestpath_lw = 9.5; static int32 newword_penalty = 0; /* BestScoreTable[CurrentFrame] === BestScore */ static int32 BestScoreTable[MAX_FRAMES]; static int32 ForcedRecMode; static int32 compute_all_senones = TRUE; static int32 ChannelsPerFrameTarget = 0; /* #channels to eval / frame */ static int32 scVqTopN = 4; static int32 CurrentFrame; static int32 LastFrame; extern int32 use_3g_in_fwd_pass; int32 *senone_active; /* list of active senones in current frame */ int32 n_senone_active; char *senone_active_flag; static int32 n_senone_active_utt; static BPTBL_T *BPTable; /* Forward pass lattice */ static int32 BPIdx; /* First free BPTable entry */ static int32 BPTableSize; static int32 *BScoreStack; /* Score stack for all possible right contexts */ static int32 BSSHead; /* First free BScoreStack entry */ static int32 BScoreStackSize; static int32 *BPTableIdx; /* First BPTable entry for each frame */ static int32 *WordLatIdx; /* BPTable index for any word in current frame; cleared before each frame */ static int32 BPTblOflMsg; /* Whether BPtable overflow msg has been printed */ static int32 *lattice_density; /* #words/frame in lattice */ static double *phone_perplexity;/* How sharply phones are discriminated/frame */ static int32 *zeroPermTab; /* Word-id sequence hypothesized by decoder */ static search_hyp_t hyp[HYP_SZ]; /* no , , or filler words */ static char hyp_str[4096]; /* hyp as string of words sep. by blanks */ static int32 hyp_wid[4096]; static int32 n_hyp_wid; static int32 HypTotalScore; static int32 TotalLangScore; static int32 hyp_alternates = FALSE; int32 ForceIds[256]; int32 ForceLen = 0; static WORD_ID *single_phone_wid; /* list of single-phone word ids */ static int32 n_1ph_words; /* #single phone words in dict (total) */ static int32 n_1ph_LMwords; /* #single phone dict words also in LM; these come first in single_phone_wid */ static void seg_back_trace (int32); static void renormalize_scores (int32); static void fwdflat_renormalize_scores (int32); static int32 renormalized; static int32 skip_alt_frm = 0; /* * Two word context prior to utterance, to be used instead of . Faked by entering * and the context into BPTable before starting search (see search_start_fwd). * If no context, both should be -ve; if only one context word, [1] should be -ve. */ static int32 context_word[2]; #if 0 static dump_search_tree_root (); static dump_search_tree (); #endif /* * Declarations for ci-phone prediction based on top senones (applied only if * topsen_window > 1). In each frame, lookahead topsen_window frames and pick the best * senones (i.e. senones within topsen_thresh of best senone in respective frames). * It is the potential set of future active phones. So restrict phone transitions in * the current frame to this set. (But filler phones are always active.) */ static int32 topsen_window = 1; /* Lookahead window of frames over which top senones used to predict ciphones. No prediction if topsen_window == 1 */ static int32 n_topsen_frm; /* #frames evaluated so far (lookahead version of CurrentFrame) */ static int32 topsen_thresh = -60000; /* Threshold for determining active phones from top senone (initial value is a HACK!!) */ static int32 **npa_frm; /* per frame next-phone-active[ciphone] flag, as determined by top senones */ static int32 *npa; /* npa_frm summed over topsen_window; next-phone-active flag for activating phone transitions. */ static int32 *filler_phone; /* filler_phone[p] = 1 iff p is filler */ static int32 *topsen_score; /* Top senone score in each frame */ static int32 *bestpscr; /* Best senone score within each phone in frame */ static uint16 **utt_pscr = NULL; /* bestpscr for entire utt; scaled */ static void topsen_init ( void ); static void compute_phone_active (int32 topsenscr, int32 npa_th); /* FIXME: put this in a header file */ extern void quit (int status, char const *fmt, ...); int32 context_frames ( void ) { if (context_word[0] < 0) return 0; if (context_word[1] < 0) return 2; return 3; } /* * Candidates words for entering their last phones. Cleared and rebuilt in each * frame. * NOTE: candidates can only be multi-phone, real dictionary words. */ typedef struct lastphn_cand_s { int32 wid; int32 score; int32 bp; int32 next; /* next candidate starting at the same frame */ } lastphn_cand_t; static lastphn_cand_t *lastphn_cand; static int32 n_lastphn_cand; extern int32 print_back_trace; #if 0 /* * Evaluate arcprobs of all active HMMs (actually sseqids) in current frame. */ void evaluateModels (int32 fwd) /* True for the forward direction */ { long i, j, k; int32 *ap; int32 *tp; int32 *dist, tmp, *apl; for (j = n_phone_active, apl = phone_active; j > 0; --j, apl++) { i = *apl; ap = Models[i].arcProb; tp = Models[i].tp; dist = Models[i].dist; tmp = distScores[dist[0]]; ap[0] = tp[0] + tmp; ap[1] = tp[1] + tmp; ap[2] = tp[2] + tmp; tmp = distScores[dist[3]]; ap[3] = tp[3] + tmp; ap[4] = tp[4] + tmp; ap[5] = tp[5] + tmp; tmp = distScores[dist[6]]; ap[6] = tp[6] + tmp; ap[7] = tp[7] + tmp; ap[8] = tp[8] + tmp; tmp = distScores[dist[9]]; ap[9] = tp[9] + tmp; ap[10] = tp[10] + tmp; ap[11] = tp[11] + tmp; tmp = distScores[dist[12]]; ap[12] = tp[12] + tmp; ap[13] = tp[13] + tmp; } k = n_phone_active; #if SEARCH_PROFILE n_phone_eval += k; #endif #if SEARCH_TRACE_CHAN E_INFO ("[%4d] %8d models evaluated\n", CurrentFrame, k); #endif } /* * Evaluate arc-probs of a single hmm (actually sseqid). */ void eval_hmm_arcprob (int32 ssid) { int32 *ap; int32 *tp; int32 *dist, tmp; ap = Models[ssid].arcProb; tp = Models[ssid].tp; dist = Models[ssid].dist; tmp = distScores[dist[0]]; ap[0] = tp[0] + tmp; ap[1] = tp[1] + tmp; ap[2] = tp[2] + tmp; tmp = distScores[dist[3]]; ap[3] = tp[3] + tmp; ap[4] = tp[4] + tmp; ap[5] = tp[5] + tmp; tmp = distScores[dist[6]]; ap[6] = tp[6] + tmp; ap[7] = tp[7] + tmp; ap[8] = tp[8] + tmp; tmp = distScores[dist[9]]; ap[9] = tp[9] + tmp; ap[10] = tp[10] + tmp; ap[11] = tp[11] + tmp; tmp = distScores[dist[12]]; ap[12] = tp[12] + tmp; ap[13] = tp[13] + tmp; } #endif /* Node Plus Arc (mpx channel) */ #define NPA(d,s,a) (s + d->tp[a]) void root_chan_v_mpx_eval (ROOT_CHAN_T *chan) { int32 bestScore; int32 s5, s4, s3, s2, s1, s0, t2, t1, t0; SMD *smd0, *smd1, *smd2, *smd3, *smd4; s4 = chan->score[4]; smd4 = &Models[chan->sseqid[4]]; s4 += distScores[smd4->dist[12]]; s3 = chan->score[3]; smd3 = &Models[chan->sseqid[3]]; s3 += distScores[smd3->dist[9]]; t1 = NPA(smd4,s4,13); t2 = NPA(smd3,s3,11); if (t1 > t2) { s5 = t1; chan->path[5] = chan->path[4]; } else { s5 = t2; chan->path[5] = chan->path[3]; } chan->score[5] = s5; bestScore = s5; s2 = chan->score[2]; smd2 = &Models[chan->sseqid[2]]; s2 += distScores[smd2->dist[6]]; t0 = NPA(smd4,s4,12); t1 = NPA(smd3,s3,10); t2 = NPA(smd2,s2,8); if (t0 > t1) { if (t2 > t0) { s4 = t2; chan->path[4] = chan->path[2]; chan->sseqid[4] = chan->sseqid[2]; } else s4 = t0; } else { if (t2 > t1) { s4 = t2; chan->path[4] = chan->path[2]; chan->sseqid[4] = chan->sseqid[2]; } else { s4 = t1; chan->path[4] = chan->path[3]; chan->sseqid[4] = chan->sseqid[3]; } } if (s4 > bestScore) bestScore = s4; chan->score[4] = s4; s1 = chan->score[1]; smd1 = &Models[chan->sseqid[1]]; s1 += distScores[smd1->dist[3]]; t0 = NPA(smd3,s3,9); t1 = NPA(smd2,s2,7); t2 = NPA(smd1,s1,5); if (t0 > t1) { if (t2 > t0) { s3 = t2; chan->path[3] = chan->path[1]; chan->sseqid[3] = chan->sseqid[1]; } else s3 = t0; } else { if (t2 > t1) { s3 = t2; chan->path[3] = chan->path[1]; chan->sseqid[3] = chan->sseqid[1]; } else { s3 = t1; chan->path[3] = chan->path[2]; chan->sseqid[3] = chan->sseqid[2]; } } if (s3 > bestScore) bestScore = s3; chan->score[3] = s3; s0 = chan->score[0]; smd0 = &Models[chan->sseqid[0]]; s0 += distScores[smd0->dist[0]]; t0 = NPA(smd2,s2,6); t1 = NPA(smd1,s1,4); t2 = NPA(smd0,s0,2); if (t0 > t1) { if (t2 > t0) { s2 = t2; chan->path[2] = chan->path[0]; chan->sseqid[2] = chan->sseqid[0]; } else s2 = t0; } else { if (t2 > t1) { s2 = t2; chan->path[2] = chan->path[0]; chan->sseqid[2] = chan->sseqid[0]; } else { s2 = t1; chan->path[2] = chan->path[1]; chan->sseqid[2] = chan->sseqid[1]; } } if (s2 > bestScore) bestScore = s2; chan->score[2] = s2; t0 = NPA(smd1,s1,3); t1 = NPA(smd0,s0,1); if (t0 > t1) { s1 = t0; } else { s1 = t1; chan->path[1] = chan->path[0]; chan->sseqid[1] = chan->sseqid[0]; } if (s1 > bestScore) bestScore = s1; chan->score[1] = s1; s0 = NPA(smd0,s0,0); if (s0 > bestScore) bestScore = s0; chan->score[0] = s0; chan->bestscore = bestScore; } #define CHAN_V_EVAL(chan,smd) { \ int32 bestScore; \ int32 s5, s4, s3, s2, s1, s0, t2, t1, t0; \ \ s4 = chan->score[4]; \ s4 += distScores[smd->dist[12]]; \ s3 = chan->score[3]; \ s3 += distScores[smd->dist[9]]; \ \ t1 = NPA(smd,s4,13); \ t2 = NPA(smd,s3,11); \ if (t1 > t2) { \ s5 = t1; \ chan->path[5] = chan->path[4]; \ } else { \ s5 = t2; \ chan->path[5] = chan->path[3]; \ } \ chan->score[5] = s5; \ bestScore = s5; \ \ s2 = chan->score[2]; \ s2 += distScores[smd->dist[6]]; \ \ t0 = NPA(smd,s4,12); \ t1 = NPA(smd,s3,10); \ t2 = NPA(smd,s2,8); \ if (t0 > t1) { \ if (t2 > t0) { \ s4 = t2; \ chan->path[4] = chan->path[2]; \ } else \ s4 = t0; \ } else { \ if (t2 > t1) { \ s4 = t2; \ chan->path[4] = chan->path[2]; \ } else { \ s4 = t1; \ chan->path[4] = chan->path[3]; \ } \ } \ if (s4 > bestScore) bestScore = s4; \ chan->score[4] = s4; \ \ s1 = chan->score[1]; \ s1 += distScores[smd->dist[3]]; \ \ t0 = NPA(smd,s3,9); \ t1 = NPA(smd,s2,7); \ t2 = NPA(smd,s1,5); \ if (t0 > t1) { \ if (t2 > t0) { \ s3 = t2; \ chan->path[3] = chan->path[1]; \ } else \ s3 = t0; \ } else { \ if (t2 > t1) { \ s3 = t2; \ chan->path[3] = chan->path[1]; \ } else { \ s3 = t1; \ chan->path[3] = chan->path[2]; \ } \ } \ if (s3 > bestScore) bestScore = s3; \ chan->score[3] = s3; \ \ s0 = chan->score[0]; \ s0 += distScores[smd->dist[0]]; \ \ t0 = NPA(smd,s2,6); \ t1 = NPA(smd,s1,4); \ t2 = NPA(smd,s0,2); \ if (t0 > t1) { \ if (t2 > t0) { \ s2 = t2; \ chan->path[2] = chan->path[0]; \ } else \ s2 = t0; \ } else { \ if (t2 > t1) { \ s2 = t2; \ chan->path[2] = chan->path[0]; \ } else { \ s2 = t1; \ chan->path[2] = chan->path[1]; \ } \ } \ if (s2 > bestScore) bestScore = s2; \ chan->score[2] = s2; \ \ t0 = NPA(smd,s1,3); \ t1 = NPA(smd,s0,1); \ if (t0 > t1) { \ s1 = t0; \ } else { \ s1 = t1; \ chan->path[1] = chan->path[0]; \ } \ if (s1 > bestScore) bestScore = s1; \ chan->score[1] = s1; \ \ s0 = NPA(smd,s0,0); \ if (s0 > bestScore) bestScore = s0; \ chan->score[0] = s0; \ \ chan->bestscore = bestScore; \ } \ void root_chan_v_eval (ROOT_CHAN_T *chan) { SMD *smd0; smd0 = &(Models[chan->sseqid[0]]); CHAN_V_EVAL(chan,smd0); } void chan_v_eval (CHAN_T *chan) { SMD *smd0; smd0 = &(Models[chan->sseqid]); CHAN_V_EVAL(chan,smd0); } int32 eval_root_chan (void) { ROOT_CHAN_T *rhmm; int32 i, cf, bestscore, k; cf = CurrentFrame; bestscore = WORST_SCORE; k = 0; for (i = n_root_chan, rhmm = root_chan; i > 0; --i, rhmm++) { if (rhmm->active == cf) { if (rhmm->mpx) { root_chan_v_mpx_eval (rhmm); } else { root_chan_v_eval (rhmm); } if (bestscore < rhmm->bestscore) bestscore = rhmm->bestscore; #if (SEARCH_PROFILE || SEARCH_TRACE_CHAN) k++; #endif } } #if SEARCH_PROFILE n_root_chan_eval += k; #endif #if SEARCH_TRACE_CHAN E_INFO (" %3d #root(%10d)", cf, k, bestscore); #endif return (bestscore); } int32 eval_nonroot_chan (void) { CHAN_T *hmm, **acl; int32 i, cf, bestscore, k; cf = CurrentFrame; i = n_active_chan[cf & 0x1]; acl = active_chan_list[cf & 0x1]; bestscore = WORST_SCORE; k = i; for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { assert(hmm->active==cf); chan_v_eval (hmm); if (bestscore < hmm->bestscore) bestscore = hmm->bestscore; } #if SEARCH_PROFILE n_nonroot_chan_eval += k; #endif #if SEARCH_TRACE_CHAN E_INFO (" %5d #non-root(%10d)", k, bestscore); #endif return (bestscore); } int32 eval_word_chan (void) { ROOT_CHAN_T *rhmm; CHAN_T *hmm; int32 i, w, cf, bestscore, *awl, j, k; k = 0; cf = CurrentFrame; bestscore = WORST_SCORE; awl = active_word_list[cf & 0x1]; i = n_active_word[cf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { assert(word_active[w] != 0); word_active[w] = 0; assert(word_chan[w] != NULL); for (hmm = word_chan[w]; hmm; hmm = hmm->next) { assert(hmm->active == cf); chan_v_eval (hmm); if (bestscore < hmm->bestscore) bestscore = hmm->bestscore; #if (SEARCH_PROFILE || SEARCH_TRACE_CHAN) k++; #endif } } /* Similarly for statically allocated single-phone words */ j = 0; for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active < cf) continue; if (rhmm->mpx) { root_chan_v_mpx_eval (rhmm); } else { root_chan_v_eval (rhmm); } if ((bestscore < rhmm->bestscore) && (w != FinishWordId)) bestscore = rhmm->bestscore; #if (SEARCH_PROFILE || SEARCH_TRACE_CHAN) j++; #endif } #if (SEARCH_PROFILE || SEARCH_TRACE_CHAN) k += j; #endif #if SEARCH_PROFILE n_last_chan_eval += k; n_nonroot_chan_eval += k; n_word_lastchan_eval += n_active_word[cf & 0x1] + j; #endif #if SEARCH_TRACE_CHAN printf (" %5d #leaf(%10d)\n", k, bestscore); #endif return (bestscore); } static void cache_bptable_paths (int32 bp) { int32 w, prev_bp; BPTBL_T *bpe; bpe = &(BPTable[bp]); prev_bp = bp; w = bpe->wid; while (ISA_FILLER_WORD(w)) { prev_bp = BPTable[prev_bp].bp; w = BPTable[prev_bp].wid; } bpe->real_fwid = WordDict->dict_list[w]->fwid; if (use_3g_in_fwd_pass) { prev_bp = BPTable[prev_bp].bp; bpe->prev_real_fwid = (prev_bp != NO_BP) ? BPTable[prev_bp].real_fwid : -1; } else bpe->prev_real_fwid = -1; } void save_bwd_ptr (WORD_ID w, int32 score, int32 path, int32 rc) { int32 _bp_; _bp_ = WordLatIdx[w]; if (_bp_ != NO_BP) { if (BPTable[_bp_].score < score) { if (BPTable[_bp_].bp != path) { BPTable[_bp_].bp = path; cache_bptable_paths (_bp_); } BPTable[_bp_].score = score; } BScoreStack[BPTable[_bp_].s_idx + rc] = score; } else { int32 i, rcsize, *bss; dict_entry_t *de; BPTBL_T *bpe; if ((BPIdx >= BPTableSize) || (BSSHead >= BScoreStackSize-NumCiPhones)) { if (! BPTblOflMsg) { E_WARN("%s(%d): BPTable OVERFLOWED; IGNORING REST OF UTTERANCE!!\n", __FILE__, __LINE__); BPTblOflMsg = 1; } return; } de = WordDict->dict_list[w]; WordLatIdx[w] = BPIdx; bpe = &(BPTable[BPIdx]); bpe->wid = w; bpe->frame = CurrentFrame; bpe->bp = path; bpe->score = score; bpe->s_idx = BSSHead; if ((de->len != 1) && (de->mpx)) { bpe->r_diph = de->phone_ids[de->len - 1]; rcsize = RightContextFwdSize[bpe->r_diph]; } else { bpe->r_diph = -1; rcsize = 1; } for (i = rcsize, bss = BScoreStack+BSSHead; i > 0; --i, bss++) *bss = WORST_SCORE; BScoreStack[BSSHead + rc] = score; cache_bptable_paths (BPIdx); BPIdx++; BSSHead += rcsize; } } /* * Prune currently active root channels for next frame. Also, perform exit * transitions out of them and activate successors. * score[] of pruned root chan set to WORST_SCORE elsewhere. */ void prune_root_chan (void) { ROOT_CHAN_T *rhmm; CHAN_T *hmm; int32 i, cf, nf, w, pip; int32 thresh, newphone_thresh, lastphn_thresh, newphone_score; CHAN_T **nacl; /* next active list */ lastphn_cand_t *candp; dict_entry_t *de; cf = CurrentFrame; nf = cf+1; thresh = BestScore + LogBeamWidth; newphone_thresh = BestScore + NewPhoneLogBeamWidth; lastphn_thresh = BestScore + LastPhoneLogBeamWidth; pip = logPhoneInsertionPenalty; #if 0 printf ("%10d =Th, %10d =NPTh, %10d =LPTh, %10d =bestscore\n", thresh, newphone_thresh, lastphn_thresh, BestScore); #endif nacl = active_chan_list[nf & 0x1]; for (i = 0, rhmm = root_chan; i < n_root_chan; i++, rhmm++) { /* First check if this channel was active in current frame */ if (rhmm->active < cf) continue; if (rhmm->bestscore > thresh) { rhmm->active = nf; /* rhmm will be active in next frame */ if (skip_alt_frm && (! (cf % skip_alt_frm))) continue; /* transitions out of this root channel */ newphone_score = rhmm->score[HMM_LAST_STATE] + pip; if (newphone_score > newphone_thresh) { /* transition to all next-level channels in the HMM tree */ for (hmm = rhmm->next; hmm; hmm = hmm->alt) { if (npa[hmm->ciphone]) { if ((hmm->active < cf) || (hmm->score[0] < newphone_score)) { hmm->score[0] = newphone_score; hmm->path[0] = rhmm->path[HMM_LAST_STATE]; hmm->active = nf; *(nacl++) = hmm; } } } /* * Transition to last phone of all words for which this is the * penultimate phone (the last phones may need multiple right contexts). * Remember to remove the temporary newword_penalty. */ if (newphone_score > lastphn_thresh) { for (w = rhmm->penult_phn_wid; w >= 0; w = homophone_set[w]) { de = WordDict->dict_list[w]; if (npa[de->ci_phone_ids[de->len-1]]) { candp = lastphn_cand + n_lastphn_cand; n_lastphn_cand++; candp->wid = w; candp->score = newphone_score - newword_penalty; candp->bp = rhmm->path[HMM_LAST_STATE]; } } } } } } n_active_chan[nf & 0x1] = nacl - active_chan_list[nf & 0x1]; } /* * Prune currently active nonroot channels in HMM tree for next frame. Also, perform * exit transitions out of such channels and activate successors. */ void prune_nonroot_chan (void) { CHAN_T *hmm, *nexthmm; int32 cf, nf, w, i, pip; int32 thresh, newphone_thresh, lastphn_thresh, newphone_score; CHAN_T **acl, **nacl; /* active list, next active list */ lastphn_cand_t *candp; dict_entry_t *de; cf = CurrentFrame; nf = cf+1; thresh = BestScore + LogBeamWidth; newphone_thresh = BestScore + NewPhoneLogBeamWidth; lastphn_thresh = BestScore + LastPhoneLogBeamWidth; pip = logPhoneInsertionPenalty; acl = active_chan_list[cf & 0x1]; /* currently active HMMs in tree */ nacl = active_chan_list[nf & 0x1] + n_active_chan[nf & 0x1]; for (i = n_active_chan[cf & 0x1], hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { assert(hmm->active >= cf); if (hmm->bestscore > thresh) { /* retain this channel in next frame */ if (hmm->active != nf) { hmm->active = nf; *(nacl++) = hmm; } if (skip_alt_frm && (! (cf % skip_alt_frm))) continue; /* transitions out of this channel */ newphone_score = hmm->score[HMM_LAST_STATE] + pip; if (newphone_score > newphone_thresh) { /* transition to all next-level channel in the HMM tree */ for (nexthmm = hmm->next; nexthmm; nexthmm = nexthmm->alt) { if (npa[nexthmm->ciphone]) { if ((nexthmm->active < cf)||(nexthmm->score[0] < newphone_score)) { nexthmm->score[0] = newphone_score; nexthmm->path[0] = hmm->path[HMM_LAST_STATE]; if (nexthmm->active != nf) { nexthmm->active = nf; *(nacl++) = nexthmm; } } } } /* * Transition to last phone of all words for which this is the * penultimate phone (the last phones may need multiple right contexts). * Remember to remove the temporary newword_penalty. */ if (newphone_score > lastphn_thresh) { for (w = hmm->info.penult_phn_wid; w >= 0; w = homophone_set[w]) { de = WordDict->dict_list[w]; if (npa[de->ci_phone_ids[de->len-1]]) { candp = lastphn_cand + n_lastphn_cand; n_lastphn_cand++; candp->wid = w; candp->score = newphone_score - newword_penalty; candp->bp = hmm->path[HMM_LAST_STATE]; } } } } } else if (hmm->active != nf) { /* hmm->active = -1; */ hmm->bestscore = WORST_SCORE; hmm->score[0] = WORST_SCORE; hmm->score[1] = WORST_SCORE; hmm->score[2] = WORST_SCORE; #if HMM_5_STATE hmm->score[3] = WORST_SCORE; hmm->score[4] = WORST_SCORE; #endif } } n_active_chan[nf & 0x1] = nacl - active_chan_list[nf & 0x1]; } /* * Since the same instance of a word (i.e., ) reaches its last * phone several times, we can compute its best BP and LM transition score info * just the first time and cache it for future occurrences. Structure for such * a cache. */ typedef struct { int32 sf; /* Start frame */ int32 dscr; /* Delta-score upon entering last phone */ int32 bp; /* Best BP */ } last_ltrans_t; static last_ltrans_t *last_ltrans; /* one per word */ #define CAND_SF_ALLOCSIZE 32 typedef struct { int32 bp_ef; int32 cand; } cand_sf_t; static int32 cand_sf_alloc = 0; static cand_sf_t *cand_sf; /* * Execute the transition into the last phone for all candidates words emerging from * the HMM tree. Attach LM scores to such transitions. * (Executed after pruning root and non-root, but before pruning word-chan.) */ void last_phone_transition (void) { int32 i, j, k, cf, nf, bp, bplast, w; lastphn_cand_t *candp; int32 *nawl; int32 fwid2, thresh; int32 *rcpermtab, ciph0; int32 bestscore, dscr; dict_entry_t *de; CHAN_T *hmm; BPTBL_T *bpe; int32 n_cand_sf = 0; cf = CurrentFrame; nf = cf+1; nawl = active_word_list[nf & 0x1]; #if SEARCH_PROFILE n_lastphn_cand_utt += n_lastphn_cand; #endif /* If best LM score and bp for candidate known use it, else sort cands by startfrm */ for (i = 0, candp = lastphn_cand; i < n_lastphn_cand; i++, candp++) { bpe = &(BPTable[candp->bp]); rcpermtab = (bpe->r_diph >= 0) ? RightContextFwdPerm[bpe->r_diph] : zeroPermTab; /* Subtract starting score for candidate, leave it with only word score */ de = WordDict->dict_list[candp->wid]; ciph0 = de->ci_phone_ids[0]; candp->score -= BScoreStack[bpe->s_idx + rcpermtab[ciph0]]; /* * If this candidate not occurred in an earlier frame, prepare for finding * best transition score into last phone; sort by start frame. */ if (last_ltrans[candp->wid].sf != bpe->frame+1) { for (j = 0; j < n_cand_sf; j++) { if (cand_sf[j].bp_ef == bpe->frame) break; } if (j < n_cand_sf) candp->next = cand_sf[j].cand; else { if (n_cand_sf >= cand_sf_alloc) { if (cand_sf_alloc == 0) { cand_sf = (cand_sf_t *) CM_calloc (CAND_SF_ALLOCSIZE, sizeof(cand_sf_t)); cand_sf_alloc = CAND_SF_ALLOCSIZE; } else { cand_sf_alloc += CAND_SF_ALLOCSIZE; cand_sf = (cand_sf_t *) CM_recalloc (cand_sf, cand_sf_alloc, sizeof(cand_sf_t)); E_INFO("%s(%d): cand_sf[] increased to %d entries\n", __FILE__, __LINE__, cand_sf_alloc); } } j = n_cand_sf++; candp->next = -1; cand_sf[j].bp_ef = bpe->frame; } cand_sf[j].cand = i; last_ltrans[candp->wid].dscr = WORST_SCORE; last_ltrans[candp->wid].sf = bpe->frame+1; } } /* Compute best LM score and bp for new cands entered in the sorted lists above */ for (i = 0; i < n_cand_sf; i++) { /* For the i-th unique end frame... */ bp = BPTableIdx[cand_sf[i].bp_ef]; bplast = BPTableIdx[cand_sf[i].bp_ef+1]-1; for (bpe = &(BPTable[bp]); bp <= bplast; bp++, bpe++) { /* For each bp entry in the i-th end frame... */ rcpermtab = (bpe->r_diph >= 0) ? RightContextFwdPerm[bpe->r_diph] : zeroPermTab; /* For each candidate at the start frame find bp->cand transition-score */ for (j = cand_sf[i].cand; j >= 0; j = candp->next) { candp = &(lastphn_cand[j]); de = WordDict->dict_list[candp->wid]; ciph0 = de->ci_phone_ids[0]; fwid2 = de->fwid; dscr = BScoreStack[bpe->s_idx + rcpermtab[ciph0]]; dscr += lm_tg_score(bpe->prev_real_fwid, bpe->real_fwid, fwid2); if (last_ltrans[candp->wid].dscr < dscr) { last_ltrans[candp->wid].dscr = dscr; last_ltrans[candp->wid].bp = bp; } } } } /* Update best transitions for all candidates; also update best lastphone score */ bestscore = LastPhoneBestScore; for (i = 0, candp = lastphn_cand; i < n_lastphn_cand; i++, candp++) { candp->score += last_ltrans[candp->wid].dscr; candp->bp = last_ltrans[candp->wid].bp; if (bestscore < candp->score) bestscore = candp->score; } LastPhoneBestScore = bestscore; /* At this pt, we know the best entry score (with LM component) for all candidates */ thresh = bestscore + LastPhoneAloneLogBeamWidth; for (i = n_lastphn_cand, candp = lastphn_cand; i > 0; --i, candp++) { if (candp->score > thresh) { w = candp->wid; alloc_all_rc (w); k = 0; for (hmm = word_chan[w]; hmm; hmm = hmm->next) { if ((hmm->active < cf) || (hmm->score[0] < candp->score)) { hmm->score[0] = candp->score; hmm->path[0] = candp->bp; assert(hmm->active != nf); hmm->active = nf; k++; } } if (k > 0) { assert(! word_active[w]); *(nawl++) = w; word_active[w] = 1; } } } n_active_word[nf & 0x1] = nawl - active_word_list[nf & 0x1]; } /* * Prune currently active word channels for next frame. Also, perform exit * transitions out of such channels and active successors. */ void prune_word_chan (void) { ROOT_CHAN_T *rhmm; CHAN_T *hmm, *thmm; CHAN_T**phmmp; /* previous HMM-pointer */ int32 cf, nf, w, i, k; int32 newword_thresh, lastphn_thresh; int32 *awl, *nawl; cf = CurrentFrame; nf = cf+1; newword_thresh = LastPhoneBestScore + NewWordLogBeamWidth; lastphn_thresh = LastPhoneBestScore + LastPhoneAloneLogBeamWidth; awl = active_word_list[cf & 0x1]; nawl = active_word_list[nf & 0x1] + n_active_word[nf & 0x1]; /* Dynamically allocated last channels of multi-phone words */ for (i = n_active_word[cf & 0x1], w = *(awl++); i > 0; --i, w = *(awl++)) { k = 0; phmmp = &(word_chan[w]); for (hmm = word_chan[w]; hmm; hmm = thmm) { assert(hmm->active >= cf); thmm = hmm->next; if (hmm->bestscore > lastphn_thresh) { /* retain this channel in next frame */ hmm->active = nf; k++; phmmp = &(hmm->next); /* Could if ((! skip_alt_frm) || (cf & 0x1)) the following */ if (hmm->score[HMM_LAST_STATE] > newword_thresh) { /* can exit channel and recognize word */ save_bwd_ptr (w, hmm->score[HMM_LAST_STATE], hmm->path[HMM_LAST_STATE], hmm->info.rc_id); } } else if (hmm->active == nf) { phmmp = &(hmm->next); } else { listelem_free (hmm, sizeof(CHAN_T)); *phmmp = thmm; } } if ((k > 0) && (! word_active[w])) { *(nawl++) = w; word_active[w] = 1; } } n_active_word[nf & 0x1] = nawl - active_word_list[nf & 0x1]; /* * Prune permanently allocated single-phone channels. * NOTES: score[] of pruned channels set to WORST_SCORE elsewhere. */ for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active < cf) continue; if (rhmm->bestscore > lastphn_thresh) { rhmm->active = nf; /* Could if ((! skip_alt_frm) || (cf & 0x1)) the following */ if (rhmm->score[HMM_LAST_STATE] > newword_thresh) { save_bwd_ptr (w, rhmm->score[HMM_LAST_STATE], rhmm->path[HMM_LAST_STATE], 0); } } } } /* * Allocate last phone channels for all possible right contexts for word w. (Some * may already exist.) * (NOTE: Assume that w uses context!!) */ void alloc_all_rc (int32 w) { dict_entry_t *de; CHAN_T *hmm, *thmm; int32 *sseq_rc; /* list of sseqid for all possible right context for w */ int32 i; de = WordDict->dict_list[w]; assert(de->mpx); sseq_rc = RightContextFwd[de->phone_ids[de->len-1]]; hmm = word_chan[w]; if ((hmm == NULL) || (hmm->sseqid != *sseq_rc)) { hmm = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); hmm->next = word_chan[w]; word_chan[w] = hmm; hmm->info.rc_id = 0; hmm->bestscore = WORST_SCORE; hmm->score[0] = WORST_SCORE; hmm->score[1] = WORST_SCORE; hmm->score[2] = WORST_SCORE; #if HMM_5_STATE hmm->score[3] = WORST_SCORE; hmm->score[4] = WORST_SCORE; #endif hmm->active = -1; hmm->sseqid = *sseq_rc; } for (i = 1, sseq_rc++; *sseq_rc >= 0; sseq_rc++, i++) { if ((hmm->next == NULL) || (hmm->next->sseqid != *sseq_rc)) { thmm = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); thmm->next = hmm->next; hmm->next = thmm; hmm = thmm; hmm->info.rc_id = i; hmm->bestscore = WORST_SCORE; hmm->score[0] = WORST_SCORE; hmm->score[1] = WORST_SCORE; hmm->score[2] = WORST_SCORE; #if HMM_5_STATE hmm->score[3] = WORST_SCORE; hmm->score[4] = WORST_SCORE; #endif hmm->active = -1; hmm->sseqid = *sseq_rc; } else hmm = hmm->next; } } void free_all_rc (int32 w) { CHAN_T *hmm, *thmm; for (hmm = word_chan[w]; hmm; hmm = thmm) { thmm = hmm->next; listelem_free (hmm, sizeof(CHAN_T)); } word_chan[w] = NULL; } /* * Structure for reorganizing the BP table entries in the current frame according * to distinct right context ci-phones. Each entry contains the best BP entry for * a given right context. Each successor word will pick up the correct entry based * on its first ci-phone. */ struct bestbp_rc_s { int32 score; int32 path; /* BP table index corresponding to this entry */ int32 lc; /* right most ci-phone of above BP entry word */ } *bestbp_rc; void word_transition (void) { int32 i, k, bp, w, cf, nf; /* int32 prev_bp, prev_wid, prev_endframe, prev2_bp, prev2_wid; */ int32 /* rcsize, */ rc; int32 *rcss; /* right context score stack */ int32 *rcpermtab; int32 thresh, /* newword_thresh, */ newscore; BPTBL_T *bpe; dict_entry_t *pde, *de; /* previous dict entry, dict entry */ ROOT_CHAN_T *rhmm; /* CHAN_T *hmm; */ struct bestbp_rc_s *bestbp_rc_ptr; int32 last_ciph; int32 /* fwid0, fwid1, */ fwid2; int32 pip; int32 ssid; cf = CurrentFrame; /* * Transition to start of new word instances (HMM tree roots); but only if words * other than finished here. * But, first, find the best starting score for each possible right context phone. */ for (i = NumCiPhones-1; i >= 0; --i) bestbp_rc[i].score = WORST_SCORE; k = 0; for (bp = BPTableIdx[cf]; bp < BPIdx; bp++) { bpe = &(BPTable[bp]); WordLatIdx[bpe->wid] = NO_BP; if (bpe->wid == FinishWordId) continue; k++; de = WordDict->dict_list[bpe->wid]; rcpermtab = (bpe->r_diph >= 0) ? RightContextFwdPerm[bpe->r_diph] : zeroPermTab; last_ciph = de->ci_phone_ids[de->len-1]; rcss = &(BScoreStack[bpe->s_idx]); for (rc = NumCiPhones-1; rc >= 0; --rc) { if (rcss[rcpermtab[rc]] > bestbp_rc[rc].score) { bestbp_rc[rc].score = rcss[rcpermtab[rc]]; bestbp_rc[rc].path = bp; bestbp_rc[rc].lc = last_ciph; } } } if (k == 0) return; nf = cf+1; thresh = BestScore + LogBeamWidth; pip = logPhoneInsertionPenalty; /* * Hypothesize successors to words finished in this frame. * Main dictionary, multi-phone words transition to HMM-trees roots. */ for (i = n_root_chan, rhmm = root_chan; i > 0; --i, rhmm++) { bestbp_rc_ptr = &(bestbp_rc[rhmm->ciphone]); if (npa[rhmm->ciphone]) { newscore = bestbp_rc_ptr->score + newword_penalty + pip; if (newscore > thresh) { if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { ssid = LeftContextFwd[rhmm->diphone][bestbp_rc_ptr->lc]; rhmm->score[0] = newscore; rhmm->path[0] = bestbp_rc_ptr->path; rhmm->active = nf; rhmm->sseqid[0] = ssid; } } } } /* * Single phone words; no right context for these. Cannot use bestbp_rc as * LM scores have to be included. First find best transition to these words. */ for (i = 0; i < n_1ph_LMwords; i++) { w = single_phone_wid[i]; last_ltrans[w].dscr = (int32) 0x80000000; } for (bp = BPTableIdx[cf]; bp < BPIdx; bp++) { bpe = &(BPTable[bp]); rcpermtab = (bpe->r_diph >= 0) ? RightContextFwdPerm[bpe->r_diph] : zeroPermTab; rcss = BScoreStack + bpe->s_idx; for (i = 0; i < n_1ph_LMwords; i++) { w = single_phone_wid[i]; de = WordDict->dict_list[w]; fwid2 = de->fwid; newscore = rcss[rcpermtab[de->ci_phone_ids[0]]]; newscore += lm_tg_score(bpe->prev_real_fwid, bpe->real_fwid, fwid2); if (last_ltrans[w].dscr < newscore) { last_ltrans[w].dscr = newscore; last_ltrans[w].bp = bp; } } } /* Now transition to in-LM single phone words */ for (i = 0; i < n_1ph_LMwords; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if ((w != FinishWordId) && (! npa[rhmm->ciphone])) continue; if ((newscore = last_ltrans[w].dscr + pip) > thresh) { bpe = BPTable + last_ltrans[w].bp; pde = WordDict->dict_list[bpe->wid]; if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { rhmm->score[0] = newscore; rhmm->path[0] = last_ltrans[w].bp; if (rhmm->mpx) rhmm->sseqid[0] = LeftContextFwd[rhmm->diphone][pde->ci_phone_ids[pde->len-1]]; rhmm->active = nf; } } } /* Remaining words: , noise words. No mpx for these! */ bestbp_rc_ptr = &(bestbp_rc[SilencePhoneId]); newscore = bestbp_rc_ptr->score + SilenceWordPenalty + pip; if (newscore > thresh) { w = SilenceWordId; rhmm = (ROOT_CHAN_T *) word_chan[w]; if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { rhmm->score[0] = newscore; rhmm->path[0] = bestbp_rc_ptr->path; rhmm->active = nf; } } newscore = bestbp_rc_ptr->score + FillerWordPenalty + pip; if (newscore > thresh) { for (w = SilenceWordId+1; w < NumWords; w++) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { rhmm->score[0] = newscore; rhmm->path[0] = bestbp_rc_ptr->path; rhmm->active = nf; } } } } #if 0 static void dump_hmm_tp ( void ) { int32 p, ssid, t; for (p = 0; p < NumCiPhones; p++) { ssid = hmm_pid2sid (p); printf ("sseqid(%s)= %d, ", phone_from_id (p), ssid); for (t = 0; t < 14; t++) printf (" %6d", Models[ssid].tp[t]); printf ("\n"); } } #endif void search_initialize (void) { int32 bptable_size = query_lattice_size(); #if SEARCH_TRACE_CHAN_DETAILED static void load_trace_wordlist (); #endif ForcedRecMode = FALSE; NumWords = kb_get_num_words (); NumHmmModels = kb_get_num_models (); TotalDists = kb_get_total_dists (); Models = kb_get_models (); PhoneList = kb_get_phone_list (); WordDict = kb_get_word_dict (); StartWordId = kb_get_word_id (kb_get_lm_start_sym()); FinishWordId = kb_get_word_id (kb_get_lm_end_sym()); SilenceWordId = kb_get_word_id ("SIL"); UsingDarpaLM = kb_get_darpa_lm_flag (); AllWordTProb = kb_get_aw_tprob(); NoLangModel = kb_get_no_lm_flag(); SilencePhoneId = phone_to_id ("SIL", TRUE); NumCiPhones = phoneCiCount(); LeftContextFwd = dict_left_context_fwd (); RightContextFwd = dict_right_context_fwd (); RightContextFwdPerm = dict_right_context_fwd_perm (); RightContextFwdSize = dict_right_context_fwd_size (); LeftContextBwd = dict_left_context_bwd (); LeftContextBwdPerm = dict_left_context_bwd_perm (); LeftContextBwdSize = dict_left_context_bwd_size (); RightContextBwd = dict_right_context_bwd (); NumMainDictWords = dict_get_num_main_words (WordDict); word_chan = (CHAN_T **) CM_calloc (NumWords, sizeof (CHAN_T *)); WordLatIdx = (int32 *) CM_calloc (NumWords, sizeof(int32)); zeroPermTab = (int32 *) CM_calloc (phoneCiCount(), sizeof(int32)); word_active = (int32 *) CM_calloc (NumWords, sizeof(int32)); BPTableSize = MAX (25, NumWords/1000) * MAX_FRAMES; BScoreStackSize = BPTableSize*20; if ((bptable_size > 0) && (bptable_size < 0x7fffffff)) { BPTableSize = bptable_size; BScoreStackSize = BPTableSize*20; /* 20 = ave. rc fanout */ } BPTable = (BPTBL_T *) CM_calloc (BPTableSize, sizeof (BPTBL_T)); BScoreStack = (int32 *) CM_calloc (BScoreStackSize, sizeof(int32)); BPTableIdx = (int32 *) CM_calloc (MAX_FRAMES+2, sizeof(int32)); BPTableIdx++; /* Make BPTableIdx[-1] valid */ lattice_density = (int32 *) CM_calloc (MAX_FRAMES, sizeof(int32)); phone_perplexity = (double *) CM_calloc (MAX_FRAMES, sizeof(double)); init_search_tree (WordDict); active_word_list[0] = (WORD_ID *) CM_calloc (2*(NumWords+1), sizeof (WORD_ID)); active_word_list[1] = active_word_list[0] + NumWords+1; bestbp_rc = (struct bestbp_rc_s *) CM_calloc (NumCiPhones, sizeof (struct bestbp_rc_s)); #if SEARCH_TRACE_CHAN_DETAILED load_trace_wordlist ("_TRACEWORDS_"); #endif lastphn_cand = (lastphn_cand_t *) CM_calloc (NumWords, sizeof(lastphn_cand_t)); senone_active = (int32 *) CM_calloc (TotalDists, sizeof(int32)); senone_active_flag = (char *) CM_calloc (TotalDists, sizeof(char)); last_ltrans = (last_ltrans_t *) CM_calloc (NumWords, sizeof(last_ltrans_t)); search_fwdflat_init (); searchlat_init (); context_word[0] = -1; context_word[1] = -1; /* * Frames window size for predicting phones based on topsen. * If 1, no prediction; use all phones. */ if ((topsen_window = query_topsen_window ()) < 1) quit(-1, "%s(%d): topsen window = %d\n", __FILE__, __LINE__, topsen_window); E_INFO ("%s(%d): topsen-window = %d", __FILE__, __LINE__, topsen_window); topsen_thresh = query_topsen_thresh (); if (topsen_window > 1) E_INFO (", threshold = %d", topsen_thresh); else E_INFO (", no phone-prediction"); E_INFO ("\n"); topsen_init (); sc_scores = (int32 **) CM_2dcalloc (topsen_window, TotalDists, sizeof(int32)); distScores = sc_scores[0]; topsen_score = (int32 *) CM_calloc (MAX_FRAMES, sizeof(int32)); /* Inform SCVQ module of senones/phone and bestscore/phone array */ { bestpscr = (int32 *) CM_calloc (NumCiPhones, sizeof(int32)); utt_pscr = (uint16 **) CM_2dcalloc (MAX_FRAMES, NumCiPhones, sizeof(uint16)); scvq_set_psen (NumCiPhones, hmm_get_psen()); scvq_set_bestpscr (bestpscr); } } int32 *search_get_dist_scores(void) { return distScores; } #if 0 search_init (beam_width, all_word_mode, force_str) float beam_width; /* Width of beam threshold */ int32 all_word_mode; /* True if we're to run in all_word_mode */ char *force_str; /* Non-NULL iff forced recognition mode */ { char *startWord; /* For LISTEN project */ startWord = get_current_startword (); if (*startWord) { StartWordId = kb_get_word_id (startWord); E_INFO("startword %s -> %d (default is %d)\n", startWord, StartWordId, kb_get_word_id (kb_get_lm_start_sym())); if (StartWordId == -1) { StartWordId = kb_get_word_id (kb_get_lm_start_sym()); E_WARN("Using default startwordid %d\n", StartWordId); } } else { StartWordId = kb_get_word_id (kb_get_lm_start_sym()); } LogBeamWidth = 8 * LOG (beam_width); AllWordMode = all_word_mode; /* * If there is reference str we assume we are in forced rec mode */ if (force_str) { quit(-1, "%s(%d): forced recognition not implemented\n", __FILE__, __LINE__); ForcedRecMode = TRUE; ForceLen = parse_ref_str (force_str, ForceIds); } else { ForceLen = 0; ForcedRecMode = FALSE; } return 0; } #endif void search_set_startword (char const *str) { char const *startWord; /* For LISTEN project */ startWord = str; if (*startWord) { if ((StartWordId = kb_get_word_id (startWord)) < 0) { startWord = kb_get_lm_start_sym(); StartWordId = kb_get_word_id (startWord); } } else { startWord = kb_get_lm_start_sym(); StartWordId = kb_get_word_id (startWord); } E_INFO("%s(%d): startword= %s (id= %d)\n", __FILE__, __LINE__, startWord, StartWordId); } /* * Set previous two LM context words for search; ie, the first word decoded by * search will use w1 and w2 as history for trigram scoring. If w2 < 0, only * one-word history available. If both < 0, no history available. */ void search_set_context (int32 w1, int32 w2) { context_word[0] = w1; context_word[1] = w2; } /* * Hyp produced by search includes w1 and w2 from search_set_context() as the first * two words. Remove them, if present, from hyp. */ void search_remove_context (search_hyp_t *hyp) { int32 i, j; j = 0; if (context_word[0] >= 0) j++; if (context_word[1] >= 0) j++; if (j > 0) { for (i = j; hyp[i].wid >= 0; i++) hyp[i-j] = hyp[i]; hyp[i-j].wid = -1; /* hyp_wid has EVERYTHING, including the initial ; remove context after it */ for (i = j+1; i < n_hyp_wid; i++) hyp[i-j] = hyp[i]; n_hyp_wid -= j; } } /* * Mark the active senones for all senones belonging to channels that are active in the * current frame. */ void compute_sen_active (void) { ROOT_CHAN_T *rhmm; CHAN_T *hmm, **acl; int32 i, j, cf, w, *awl, s, *dist, d; char *flagptr; cf = CurrentFrame; memset (senone_active_flag, 0, TotalDists * sizeof(char)); n_senone_active = 0; /* Flag active senones for root channels */ for (i = n_root_chan, rhmm = root_chan; i > 0; --i, rhmm++) { if (rhmm->active == cf) { if (rhmm->mpx) { for (s = 0; s < HMM_LAST_STATE; s++) { dist = Models[rhmm->sseqid[s]].dist; d = dist[s*3]; senone_active_flag[d] = 1; } } else { dist = Models[rhmm->sseqid[0]].dist; for (s = 0; s < TRANS_CNT; s += 3) { d = dist[s]; senone_active_flag[d] = 1; } } } } /* Flag active senones for nonroot channels in HMM tree */ i = n_active_chan[cf & 0x1]; acl = active_chan_list[cf & 0x1]; for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { dist = Models[hmm->sseqid].dist; for (s = 0; s < TRANS_CNT; s += 3) { d = dist[s]; senone_active_flag[d] = 1; } } /* Flag active senones for individual word channels */ i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { for (hmm = word_chan[w]; hmm; hmm = hmm->next) { dist = Models[hmm->sseqid].dist; for (s = 0; s < TRANS_CNT; s += 3) { d = dist[s]; senone_active_flag[d] = 1; } } } for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { if (rhmm->mpx) { for (s = 0; s < HMM_LAST_STATE; s++) { dist = Models[rhmm->sseqid[s]].dist; d = dist[s*3]; senone_active_flag[d] = 1; } } else { dist = Models[rhmm->sseqid[0]].dist; for (s = 0; s < TRANS_CNT; s += 3) { d = dist[s]; senone_active_flag[d] = 1; } } } } /* Form active senone list */ j = 0; for (i = 0, flagptr = senone_active_flag; i < TotalDists; i++, flagptr++) if (*flagptr) senone_active[j++] = i; n_senone_active = j; } /* * Tree-Search one frame forward. */ void search_fwd (float *cep, float *dcep, float *dcep_80ms, float *pcep, float *ddcep) { int32 *newscr; int32 i, cf; /* Rotate senone scores arrays by one frame, recycling the oldest one */ distScores = sc_scores[0]; for (i = 0; i < topsen_window-1; i++) sc_scores[i] = sc_scores[i+1]; sc_scores[topsen_window-1] = distScores; newscr = distScores; /* Compute senone scores */ cf = CurrentFrame; if (! compute_all_senones) { compute_sen_active (); topsen_score[cf] = SCVQScores(newscr, cep, dcep, dcep_80ms, pcep, ddcep); } else { topsen_score[cf] = SCVQScores_all(newscr, cep, dcep, dcep_80ms, pcep, ddcep); } n_senone_active_utt += n_senone_active; if (topsen_window > 1) { /* * Predict next active phones (npa) from top senones within next topsen_window * frames, including the current frame. */ compute_phone_active (topsen_score[cf], topsen_thresh); distScores = sc_scores[0]; n_topsen_frm++; } else distScores = newscr; if ((topsen_window == 1) || (n_topsen_frm >= topsen_window)) search_one_ply_fwd (); } void search_start_fwd (void) { int32 i, rcsize, lscr; ROOT_CHAN_T *rhmm; dict_entry_t *de; n_phone_eval = 0; n_root_chan_eval = 0; n_nonroot_chan_eval = 0; n_last_chan_eval = 0; n_word_lastchan_eval = 0; n_lastphn_cand_utt = 0; n_phn_in_topsen = 0; n_senone_active_utt = 0; BPIdx = 0; BSSHead = 0; BPTblOflMsg = 0; CurrentFrame = 0; for (i = 0; i < NumWords; i++) WordLatIdx[i] = NO_BP; lm3g_cache_reset (); n_active_chan[0] = 0; n_active_word[0] = 0; BestScore = 0; renormalized = 0; for (i = 0; i < NumWords; i++) last_ltrans[i].sf = -1; hyp_str[0] = '\0'; hyp[0].wid = -1; if (context_word[0] < 0) { /* Start search with ; word_chan[] is permanently allocated */ rhmm = (ROOT_CHAN_T *) word_chan[StartWordId]; rhmm->score[0] = 0; rhmm->score[1] = WORST_SCORE; rhmm->score[2] = WORST_SCORE; #if HMM_5_STATE rhmm->score[3] = WORST_SCORE; rhmm->score[4] = WORST_SCORE; #endif rhmm->bestscore = WORST_SCORE; rhmm->path[0] = NO_BP; rhmm->active = 0; /* Frame in which active */ } else { /* Simulate insertion of context words into BPTable; first */ BPTableIdx[0] = 0; save_bwd_ptr (StartWordId, 0, NO_BP, 0); WordLatIdx[StartWordId] = NO_BP; CurrentFrame++; /* Insert first context word */ BPTableIdx[1] = 1; de = WordDict->dict_list[context_word[0]]; rcsize = (de->mpx && (de->len > 1)) ? RightContextFwdSize[de->phone_ids[de->len-1]] : 1; lscr = lm_bg_score (StartWordId, context_word[0]); for (i = 0; i < rcsize; i++) save_bwd_ptr (context_word[0], lscr, 0, i); WordLatIdx[context_word[0]] = NO_BP; CurrentFrame++; /* Insert 2nd context word, if any */ if (context_word[1] >= 0) { BPTableIdx[2] = 2; de = WordDict->dict_list[context_word[1]]; rcsize = (de->mpx && (de->len > 1)) ? RightContextFwdSize[de->phone_ids[de->len-1]] : 1; lscr += lm_tg_score (StartWordId, context_word[0], context_word[1]); for (i = 0; i < rcsize; i++) save_bwd_ptr (context_word[1], lscr, 1, i); WordLatIdx[context_word[0]] = NO_BP; CurrentFrame++; n_active_chan[1] = 0; n_active_word[1] = 0; } /* Search from silence */ rhmm = (ROOT_CHAN_T *) word_chan[SilenceWordId]; rhmm->score[0] = BPTable[BPIdx-1].score; rhmm->score[1] = WORST_SCORE; rhmm->score[2] = WORST_SCORE; #if HMM_5_STATE rhmm->score[3] = WORST_SCORE; rhmm->score[4] = WORST_SCORE; #endif rhmm->bestscore = WORST_SCORE; rhmm->path[0] = BPIdx-1; rhmm->active = CurrentFrame; /* Frame in which active */ } compute_all_senones = query_compute_all_senones() || (topsen_window > 1); if (topsen_window > 1) { /* Initialize next-phone-active flags */ memset (npa, 0, NumCiPhones * sizeof(int32)); for (i = 0; i < topsen_window; i++) memset (npa_frm[i], 0, NumCiPhones * sizeof(int32)); } n_topsen_frm = 0; } void evaluateChannels (void) { int32 bs; BestScore = eval_root_chan (); if ((bs = eval_nonroot_chan ()) > BestScore) BestScore = bs; if ((bs = eval_word_chan ()) > BestScore) BestScore = bs; LastPhoneBestScore = bs; BestScoreTable[CurrentFrame] = BestScore; } void pruneChannels (void) { n_lastphn_cand = 0; prune_root_chan (); prune_nonroot_chan (); last_phone_transition (); prune_word_chan (); } void search_one_ply_fwd (void) { int32 /* bs, */ i, cf, nf, w; /*, *awl; */ ROOT_CHAN_T *rhmm; if (CurrentFrame >= MAX_FRAMES-1) return; BPTableIdx[CurrentFrame] = BPIdx; /* Need to renormalize? */ if ((BestScore + (2 * LogBeamWidth)) < WORST_SCORE) { E_INFO("%s(%d): Renormalizing Scores at frame %d, best score %d\n", __FILE__, __LINE__, CurrentFrame, BestScore); renormalize_scores (BestScore); } BestScore = WORST_SCORE; #if SEARCH_TRACE_CHAN_DETAILED E_INFO ("[%4d] CHAN trace before eval\n", CurrentFrame); dump_traceword_chan (); #endif evaluateChannels(); #if SEARCH_TRACE_CHAN_DETAILED E_INFO ("[%4d] CHAN trace after eval\n", CurrentFrame); dump_traceword_chan (); #endif pruneChannels (); /* Do inter-word transitions */ if (BPTableIdx[CurrentFrame] < BPIdx) word_transition (); /* Clear score[] of pruned root channels (UGLY!) */ cf = CurrentFrame; nf = cf+1; for (i = n_root_chan, rhmm = root_chan; i > 0; --i, rhmm++) { if (rhmm->active == cf) { rhmm->bestscore = WORST_SCORE; rhmm->score[0] = WORST_SCORE; rhmm->score[1] = WORST_SCORE; rhmm->score[2] = WORST_SCORE; #if HMM_5_STATE rhmm->score[3] = WORST_SCORE; rhmm->score[4] = WORST_SCORE; #endif } } /* Clear score[] of pruned single-phone channels (UGLY!) */ for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { rhmm->bestscore = WORST_SCORE; rhmm->score[0] = WORST_SCORE; rhmm->score[1] = WORST_SCORE; rhmm->score[2] = WORST_SCORE; #if HMM_5_STATE rhmm->score[3] = WORST_SCORE; rhmm->score[4] = WORST_SCORE; #endif } } /* This code terminates the loop by updating for the next pass */ CurrentFrame++; if (CurrentFrame >= MAX_FRAMES-1) { E_WARN ("%s(%d): MAX_FRAMES (%d) EXCEEDED; IGNORING REST OF UTTERANCE!!\n", __FILE__, __LINE__, MAX_FRAMES); } lm_next_frame (); } static void compute_phone_perplexity( void ); static search_hyp_t *fwdtree_pscr_path ( void ); void search_finish_fwd (void) { /* register int32 idx; */ int32 i, j, w, cf, nf, /* f, */ *awl; ROOT_CHAN_T *rhmm; CHAN_T *hmm, /* *thmm,*/ **acl; /* int32 bp, bestbp, bestscore; */ /* int32 l_scr; */ if ((CurrentFrame > 0) && (topsen_window > 1)) { /* Wind up remaining frames */ for (i = 1; i < topsen_window; i++) { distScores = sc_scores[i]; search_one_ply_fwd (); } } BPTableIdx[CurrentFrame] = BPIdx; if (CurrentFrame > 0) CurrentFrame--; LastFrame = CurrentFrame; /* Deactivate channels lined up for the next frame */ cf = CurrentFrame; nf = cf+1; /* First, root channels of HMM tree */ for (i = n_root_chan, rhmm = root_chan; i > 0; --i, rhmm++) { rhmm->active = -1; for (j = 0; j < HMM_LAST_STATE; j++) rhmm->score[j] = WORST_SCORE; rhmm->bestscore = WORST_SCORE; } /* nonroot channels of HMM tree */ i = n_active_chan[nf & 0x1]; acl = active_chan_list[nf & 0x1]; for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { hmm->active = -1; for (j = 0; j < HMM_LAST_STATE; j++) hmm->score[j] = WORST_SCORE; hmm->bestscore = WORST_SCORE; } /* word channels */ i = n_active_word[nf & 0x1]; awl = active_word_list[nf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { word_active[w] = 0; free_all_rc (w); } for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; rhmm->active = -1; for (j = 0; j < HMM_LAST_STATE; j++) rhmm->score[j] = WORST_SCORE; rhmm->bestscore = WORST_SCORE; } /* Obtain lattice density and phone perplexity info for this utterance */ bptbl2latdensity(BPIdx, lattice_density); compute_phone_perplexity (); search_postprocess_bptable ((double)1.0, "FWDTREE"); /* Get pscr-score for fwdtree recognition */ { search_hyp_t *pscrpath; if (query_phone_conf ()) { pscrpath = fwdtree_pscr_path (); search_hyp_free (pscrpath); } } #if SEARCH_PROFILE if (LastFrame > 0) { E_INFO("%8d words recognized (%d/fr)\n", BPIdx, (BPIdx+(LastFrame>>1))/(LastFrame+1)); if (topsen_window > 1) E_INFO("%8d phones in topsen (%d/fr)\n", n_phn_in_topsen, n_phn_in_topsen/(LastFrame+1)); E_INFO("%8d senones evaluated (%d/fr)\n", n_senone_active_utt, (n_senone_active_utt + (LastFrame>>1))/(LastFrame+1)); E_INFO("%8d channels searched (%d/fr), %d 1st, %d last\n", n_root_chan_eval + n_nonroot_chan_eval, (n_root_chan_eval + n_nonroot_chan_eval)/(LastFrame+1), n_root_chan_eval, n_last_chan_eval); E_INFO("%8d words for which last channels evaluated (%d/fr)\n", n_word_lastchan_eval, n_word_lastchan_eval/(LastFrame+1)); E_INFO("%8d candidate words for entering last phone (%d/fr)\n", n_lastphn_cand_utt, n_lastphn_cand_utt/(LastFrame+1)); lm3g_cache_stats_dump (stdout); } #endif } void search_postprocess_bptable (double lwf, char const *pass) { /* register int32 idx; */ int32 /* i, j, w,*/ cf, nf, f; /*, *awl; */ /* ROOT_CHAN_T *rhmm; */ /* CHAN_T *hmm, *thmm, **acl; */ int32 bp; int32 l_scr; if (LastFrame < 10) { /* HACK!! Hardwired constant 10 */ E_WARN("%s(%d): UTTERANCE TOO SHORT; IGNORED\n", __FILE__, __LINE__); LastFrame = 0; return; } /* Deactivate channels lined up for the next frame */ cf = CurrentFrame; nf = cf+1; /* * Print final Path */ for (bp = BPTableIdx[cf]; bp < BPIdx; bp++) { if (BPTable[bp].wid == FinishWordId) break; } if (bp >= BPIdx) { int32 bestbp = 0, bestscore = 0; /* FIXME: good defaults? */ E_WARN ("\n%s(%d): **ERROR** Failed to terminate in final state\n\n", __FILE__, __LINE__); /* Find the most recent frame containing the best BP entry */ for (f = cf; (f >= 0) && (BPTableIdx[f] == BPIdx); --f); if (f < 0) { E_WARN ("\n%s(%d): **EMPTY BPTABLE**\n\n", __FILE__, __LINE__); return; } bestscore = WORST_SCORE; for (bp = BPTableIdx[f]; bp < BPTableIdx[f+1]; bp++) { l_scr = lm_tg_score (BPTable[bp].prev_real_fwid, BPTable[bp].real_fwid, FinishWordId); l_scr *= lwf; if (BPTable[bp].score+l_scr > bestscore) { bestscore = BPTable[bp].score + l_scr; bestbp = bp; } } /* Force into bptable */ CurrentFrame++; LastFrame++; save_bwd_ptr (FinishWordId, bestscore, bestbp, 0); BPTableIdx[CurrentFrame+1] = BPIdx; bp = BPIdx-1; } HypTotalScore = BPTable[bp].score; compute_seg_scores (lwf); seg_back_trace (bp); search_remove_context (hyp); search_hyp_to_str(); E_INFO ("%s: %s (%s %d (A=%d L=%d))\n", pass, hyp_str, uttproc_get_uttid(), HypTotalScore, HypTotalScore - TotalLangScore, TotalLangScore); } void bestpath_search ( void ) { if (! renormalized) { lattice_rescore (bestpath_lw / fwdtree_lw); lm3g_cache_stats_dump (stdout); } else { E_INFO ("Renormalized in fwd pass; cannot rescore lattice\n"); } } /* * Convert search hypothesis (word-id sequence) to a single string. */ void search_hyp_to_str ( void ) { int32 i, k, l; char const *wd; hyp_str[0] = '\0'; k = 0; for (i = 0; hyp[i].wid >= 0; i++) { wd = WordIdToStr (WordDict, hyp[i].wid); l = strlen (wd); if (k+l > 4090) quit(-1, "%s(%d): **ERROR** Increase hyp_str[] size\n", __FILE__, __LINE__); strcpy (hyp_str+k, wd); k += l; hyp_str[k] = ' '; k++; hyp_str[k] = '\0'; } } int32 seg_topsen_score (int32 sf, int32 ef) { int32 f, sum; sum = 0; for (f = sf; f <= ef; f++) sum += topsen_score[f]; return (sum); } /* SEG_BACK_TRACE *-------------------------------------------------------------* * Print32 out the backtrace */ static void seg_back_trace (int32 bpidx) { static int32 last_score; static int32 last_time; static int32 seg; /* int32 *probs; */ int32 l_scr; int32 a_scr; int32 a_scr_norm; /* time normalized acoustic score */ int32 raw_scr; int32 seg_len; int32 altpron; int32 topsenscr_norm; int32 f, latden; double perp; altpron = query_report_altpron() || ForcedRecMode; if (bpidx != NO_BP) { seg_back_trace (BPTable[bpidx].bp); l_scr = BPTable[bpidx].lscr; raw_scr = BPTable[bpidx].score - last_score; a_scr = raw_scr - l_scr; seg_len = BPTable[bpidx].frame - last_time; a_scr_norm = ((seg_len == 0) ? 0 : (a_scr / seg_len)); topsenscr_norm = ((seg_len == 0) ? 0 : seg_topsen_score(last_time+1, BPTable[bpidx].frame) / seg_len); TotalLangScore += l_scr; /* Fill in lattice density and perplexity information for this word */ latden = 0; perp = 0.0; for (f = last_time+1; f <= BPTable[bpidx].frame; f++) { latden += lattice_density[f]; perp += phone_perplexity[f]; } if (seg_len > 0) { latden /= seg_len; perp /= seg_len; } if (print_back_trace) printf("%16s (%4d %4d) %7d %10d %8d %8d %6d %6.2f\n", WordIdToStr(WordDict, BPTable[bpidx].wid), last_time + 1, BPTable[bpidx].frame, a_scr_norm, a_scr, l_scr, /* BestScoreTable[BPTable[bpidx].frame] - BPTable[bpidx].score */ topsenscr_norm, latden, perp); hyp_wid[n_hyp_wid++] = BPTable[bpidx].wid; /* Store hypothesis word sequence and segmentation */ if (ISA_REAL_WORD(BPTable[bpidx].wid)) { if (seg >= HYP_SZ-1) quit(-1, "%s(%d): **ERROR** Increase HYP_SZ\n", __FILE__, __LINE__); hyp[seg].wid = altpron ? BPTable[bpidx].wid : WordDict->dict_list[BPTable[bpidx].wid]->fwid; hyp[seg].sf = uttproc_feat2rawfr(last_time + 1); hyp[seg].ef = uttproc_feat2rawfr(BPTable[bpidx].frame); hyp[seg].ascr = a_scr; hyp[seg].lscr = l_scr; hyp[seg].latden = latden; hyp[seg].phone_perp = perp; seg++; hyp[seg].wid = -1; } last_score = BPTable[bpidx].score; last_time = BPTable[bpidx].frame; } else { if (print_back_trace) printf("%16s (%4s %4s) %7s %10s %8s %8s %6s %6s\n\n", "WORD", "SFrm", "Efrm", "AS/Len", "AS_Score", "LM_Scr", "BSDiff", "LatDen", "PhPerp"); TotalLangScore = 0; last_score = 0; last_time = -1; /* Use -1 to count frame 0 */ seg = 0; hyp[0].wid = -1; n_hyp_wid = 0; } } /* PARTIAL_SEG_BACK_TRACE *-------------------------------------------------------------* * Like seg_back_trace, but not as detailed. */ static void partial_seg_back_trace (int32 bpidx) { static int32 seg; static int32 last_time; int32 altpron; altpron = query_report_altpron() || ForcedRecMode; if (bpidx != NO_BP) { partial_seg_back_trace (BPTable[bpidx].bp); /* Store hypothesis word sequence and segmentation */ if (ISA_REAL_WORD(BPTable[bpidx].wid)) { if (seg >= HYP_SZ-1) quit(-1, "%s(%d): **ERROR** Increase HYP_SZ\n", __FILE__, __LINE__); hyp[seg].wid = altpron ? BPTable[bpidx].wid : WordDict->dict_list[BPTable[bpidx].wid]->fwid; hyp[seg].sf = uttproc_feat2rawfr(last_time + 1); hyp[seg].ef = uttproc_feat2rawfr(BPTable[bpidx].frame); seg++; hyp[seg].wid = -1; } last_time = BPTable[bpidx].frame; } else { last_time = -1; /* Use -1 to count frame 0 */ seg = 0; hyp[0].wid = -1; } } static void renormalize_scores (int32 norm) { ROOT_CHAN_T *rhmm; CHAN_T *hmm, **acl; int32 i, j, cf, w, *awl; cf = CurrentFrame; /* Renormalize root channels */ for (i = n_root_chan, rhmm = root_chan; i > 0; --i, rhmm++) { if (rhmm->active == cf) { for (j = 0; j < NODE_CNT; j++) if (rhmm->score[j] > WORST_SCORE) rhmm->score[j] -= norm; } } /* Renormalize nonroot channels in HMM tree */ i = n_active_chan[cf & 0x1]; acl = active_chan_list[cf & 0x1]; for (hmm = *(acl++); i > 0; --i, hmm = *(acl++)) { for (j = 0; j < NODE_CNT; j++) if (hmm->score[j] > WORST_SCORE) hmm->score[j] -= norm; } /* Renormalize individual word channels */ i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { for (hmm = word_chan[w]; hmm; hmm = hmm->next) { for (j = 0; j < NODE_CNT; j++) if (hmm->score[j] > WORST_SCORE) hmm->score[j] -= norm; } } for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { for (j = 0; j < NODE_CNT; j++) if (rhmm->score[j] > WORST_SCORE) rhmm->score[j] -= norm; } } renormalized = 1; } int32 search_get_score (void) /*---------------------*/ { return HypTotalScore; } void search_set_beam_width (double beam) { LogBeamWidth = 8 * LOG (beam); } /* SEARCH_SET_NEW_WORD_BEAM *-------------------------------------------------------------* */ void search_set_new_word_beam_width (float beam) { NewWordLogBeamWidth = 8 * LOG (beam); E_INFO ("%8d = new word beam width\n", NewWordLogBeamWidth); } /* SEARCH_SET_LASTPHONE_ALONE_BEAM_WIDTH *-------------------------------------------------------------* */ void search_set_lastphone_alone_beam_width (float beam) { LastPhoneAloneLogBeamWidth = 8 * LOG (beam); E_INFO ("%8d = Last phone alone beam width\n", LastPhoneAloneLogBeamWidth); } /* SEARCH_SET_NEW_PHONE_BEAM *-------------------------------------------------------------* */ void search_set_new_phone_beam_width (float beam) { NewPhoneLogBeamWidth = 8 * LOG (beam); E_INFO ("%8d = new phone beam width\n", NewPhoneLogBeamWidth); } /* SEARCH_SET_LAST_PHONE_BEAM *-------------------------------------------------------------* */ void search_set_last_phone_beam_width (float beam) { LastPhoneLogBeamWidth = 8 * LOG (beam); E_INFO ("%8d = last phone beam width\n", LastPhoneLogBeamWidth); } /* SEARCH_SET_CHANNELS_PER_FRAME_TARGET *-------------------------------------------------------------* */ void search_set_channels_per_frame_target (int32 cpf) { ChannelsPerFrameTarget = cpf; } void searchSetScVqTopN (int32 topN) { scVqTopN = topN; } int32 searchFrame (void) { return LastFrame; } int32 searchCurrentFrame (void) { return CurrentFrame; } void search_set_newword_penalty (double nw_pen) { newword_penalty = LOG (nw_pen); E_INFO ("%8d = newword penalty\n", newword_penalty); } void search_set_silence_word_penalty (float pen, float pip) /* Phone insertion penalty */ { logPhoneInsertionPenalty = LOG(pip); SilenceWordPenalty = LOG (pen) + LOG (pip); E_INFO ("%8d = LOG (Silence Word Penalty) + LOG (Phone Penalty)\n", SilenceWordPenalty); } void search_set_filler_word_penalty (float pen, float pip) { FillerWordPenalty = LOG (pen) + LOG (pip);; E_INFO ("%8d = LOG (Filler Word Penalty) + LOG (Phone Penalty)\n", FillerWordPenalty); } void search_set_lw (double p1lw, double p2lw, double p3lw) { fwdtree_lw = p1lw; fwdflat_lw = p2lw; bestpath_lw = p3lw; E_INFO ("%s(%d): LW = fwdtree: %.1f, fwdflat: %.1f, bestpath: %.1f\n", __FILE__, __LINE__, fwdtree_lw, fwdflat_lw, bestpath_lw); } void search_set_ip (float ip) { LogInsertionPenalty = LOG(ip); } void search_set_hyp_alternates (int32 arg) { hyp_alternates = arg; if (hyp_alternates) E_INFO ("Will report alternate hypotheses\n"); else E_INFO ("Will NOT report alternate hypotheses\n"); } void search_set_skip_alt_frm (int32 flag) { skip_alt_frm = flag; } /* * SEARCH_FINISH_DOCUMENT: clean up at the end of a "document". */ void search_finish_document (void) { } void searchBestN (void) { quit(-1, "%s(%d): searchBestN() not implemented\n", __FILE__,__LINE__); } void search_hyp_write (void) { quit(-1, "%s(%d): search_hyp_write() not implemented\n", __FILE__,__LINE__); } void parse_ref_str (void) { quit(-1, "%s(%d): parse_ref_str() not implemented\n", __FILE__,__LINE__); } /* * Return the best path in the bptable until now. */ int32 search_partial_result (int32 *fr, char **res) { int32 bp, bestscore = 0, bestbp = 0, f; /* FIXME: good defaults? */ bestscore = WORST_SCORE; f = CurrentFrame-1; for (; f >= 0; --f) { for (bp = BPTableIdx[f]; bp < BPIdx; bp++) { if (BPTable[bp].score > bestscore) { bestscore = BPTable[bp].score; bestbp = bp; } } if (bestscore > WORST_SCORE) break; } if (f >= 0) { partial_seg_back_trace (bestbp); search_hyp_to_str(); } else hyp_str[0] = '\0'; *fr = uttproc_feat2rawfr(CurrentFrame); *res = hyp_str; return 0; } /* SEARCH_RESULT *-------------------------------------------------------------* * Return the result of the search. */ int32 search_result (int32 *fr, char **res) { *fr = uttproc_feat2rawfr(LastFrame); *res = hyp_str; return 0; } /* * Return recognized word-id sequence; Note that the user cannot clobber it, * and must make a copy of it, if needed over the long term. */ search_hyp_t *search_get_hyp (void) { return (hyp); } char *search_get_wordlist (int *len, char sep_char) { dict_entry_t **dents = WordDict->dict_list; int32 dent_cnt = WordDict->dict_entry_count; int32 i, p; static char *fwrdl = NULL; static int32 flen = 0; /* malloc memory for all word strings in one shot; do it only once */ if (fwrdl == NULL) { for (i = 0, flen = 0; i < dent_cnt; i++) flen += strlen(dents[i]->word) + 1; /* +1 for sep_char */ ++flen; /* for the terminal '\0' */ fwrdl = (char *) CM_calloc(flen, sizeof(char)); for (i = 0, p = 0; i < dent_cnt; i++) { strcpy(&fwrdl[p], dents[i]->word); p += strlen(dents[i]->word); fwrdl[p++] = sep_char; } fwrdl[p++] = '\0'; } *len = flen; return fwrdl; } void search_filtered_endpts (void) { quit(-1, "%s(%d): search_filtered_endpts() not implemented\n", __FILE__,__LINE__); } /* * Temporary structure for BPTable binary dump. */ struct bptbl_entry_s { uint16 sf, ef; int32 score; int32 ascr, lscr; uint16 bp; uint16 wid; } bptbl_entry; void search_dump_lattice (char const *file) { int32 i; FILE *fp; if ((fp = fopen (file, "w")) == NULL) { E_ERROR("%s(%d): fopen(%s,w) failed\n", __FILE__, __LINE__, file); return; } for (i = 0; i < BPIdx; i++) { bptbl_entry.sf = (BPTable[i].bp >= 0) ? BPTable[BPTable[i].bp].frame+1 : 0; bptbl_entry.ef = BPTable[i].frame; bptbl_entry.score = BPTable[i].score; bptbl_entry.ascr = BPTable[i].ascr; bptbl_entry.lscr = BPTable[i].lscr; bptbl_entry.bp = BPTable[i].bp; bptbl_entry.wid = BPTable[i].wid; fwrite (&bptbl_entry, sizeof(struct bptbl_entry_s), 1, fp); } fclose (fp); } void search_dump_lattice_ascii (char const *file) { int32 i, sf; FILE *fp; if ((fp = fopen (file, "w")) == NULL) { E_ERROR("%s(%d): fopen(%s,w) failed\n", __FILE__, __LINE__, file); return; } fprintf (fp, "%6s %4s %4s %11s %9s %9s %8s %6s %5s %s\n\n", "ID", "SF", "EF", "TOTSCR", "ASCR", "TOPSENSCR", "LSCR", "BPID", "WID", "WORD"); for (i = 0; i < BPIdx; i++) { sf = (BPTable[i].bp >= 0) ? BPTable[BPTable[i].bp].frame+1 : 0; fprintf (fp, "%6d %4d %4d %11d %9d %9d %8d %6d %5d %s\n", i, sf, BPTable[i].frame, BPTable[i].score, BPTable[i].ascr, seg_topsen_score (sf, BPTable[i].frame), BPTable[i].lscr, BPTable[i].bp, BPTable[i].wid, WordDict->dict_list[BPTable[i].wid]->word); } fclose (fp); } #if SEARCH_TRACE_CHAN_DETAILED char *trace_wid; static void load_trace_wordlist (char const *file) { FILE *fp; char wd[1000]; int32 wid; trace_wid = (char *) CM_calloc (NumWords, sizeof(char)); E_INFO("%s(%d): Looking for file trace-wordlist file %s\n", __FILE__, __LINE__, file); if ((fp = fopen (file, "r")) == NULL) { E_ERROR("%s(%d): fopen(%s,r) failed\n", __FILE__, __LINE__, file); return; } while (fscanf (fp, "%s", wd) == 1) { wid = dictStrToWordId (WordDict, wd, TRUE); trace_wid[wid] = 1; } fclose (fp); } void dump_traceword_chan (void) { int32 w, len, i, j, k, cf; dict_entry_t *de; ROOT_CHAN_T *rhmm; CHAN_T *hmm; int32 *sseq_rc; cf = CurrentFrame; for (w = 0; w < NumMainDictWords; w++) { if (! trace_wid[w]) continue; de = WordDict->dict_list[w]; if (de->len == 1) continue; /* Find root channel for first phone of w */ for (i = 0, rhmm = root_chan; i < n_root_chan; i++, rhmm++) { if (rhmm->diphone == de->phone_ids[0]) break; } if (i >= n_root_chan) quit(-1, "%s(%d): Cannot locate rhmm for %s\n", __FILE__, __LINE__, de->word); printf ("[%4d] Trace %5d=wid %4d=active[root] %s\n", cf, w, rhmm->active, de->word); /* If root chan active print details */ if (rhmm->active == cf) { printf (" "); printf (" %10d %10d %10d %10d %10d %10d", rhmm->score[0], rhmm->score[1], rhmm->score[2], rhmm->score[3], rhmm->score[4], rhmm->score[5]); printf (" %4d %4d %4d %4d %4d %4d\n", rhmm->path[0], rhmm->path[1], rhmm->path[2], rhmm->path[3], rhmm->path[4], rhmm->path[5]); printf (" "); for (j = 0; j < 5; j++) { printf (" %d(", rhmm->sseqid[j]); for (k = 0; k < NumCiPhones; k++) { if (LeftContextFwd[rhmm->diphone][k] == rhmm->sseqid[j]) printf ("%s,", phone_from_id(k)); } printf (")\n"); } } /* Track down other phones (except the last) for w in tree */ for (len = 1; len < de->len-1; len++) { hmm = (len == 1) ? rhmm->next : hmm->next; for (; hmm && (de->phone_ids[len] != hmm->sseqid); hmm = hmm->alt); if (! hmm) quit(-1, "%s(%d): Cannot locate %d hmm for %s\n", __FILE__, __LINE__, len, de->word); printf (" %4d=A %5d=ss", hmm->active, hmm->sseqid); if (hmm->active == cf) { printf (" %10d %10d %10d %10d %10d %10d", hmm->score[0], hmm->score[1], hmm->score[2], hmm->score[3], hmm->score[4], hmm->score[5]); printf (" %4d %4d %4d %4d %4d %4d\n", hmm->path[0], hmm->path[1], hmm->path[2], hmm->path[3], hmm->path[4], hmm->path[5]); } else printf ("\n"); } /* Track down last phones (with multiple right contexts) */ for (hmm = word_chan[w]; hmm; hmm = hmm->next) { printf (" %4d=A ", hmm->active); printf (" %10d %10d %10d %10d %10d %10d", hmm->score[0], hmm->score[1], hmm->score[2], hmm->score[3], hmm->score[4], hmm->score[5]); printf (" %4d %4d %4d %4d %4d %4d", hmm->path[0], hmm->path[1], hmm->path[2], hmm->path[3], hmm->path[4], hmm->path[5]); printf (" %d =ss(", hmm->sseqid); j = de->phone_ids[de->len-1]; for (k = 0; k < NumCiPhones; k++) { if (RightContextFwd[j][RightContextFwdPerm[j][k]] == hmm->sseqid) printf ("%s,", phone_from_id(k)); } printf (")\n"); } } #if 0 for (i = 0; i < n_1ph_words; i++) { w = single_phone_wid[i]; if (! trace_wid[w]) continue; de = WordDict->dict_list[w]; printf ("[%4d] Trace %5d=wid %s\n", cf, w, de->word); for (hmm = word_chan[w]; hmm; hmm = hmm->next) { printf (" %4d=A %d=ss", hmm->active, hmm->sseqid); printf (" %10d %10d %10d %10d %10d %10d", hmm->score[0], hmm->score[1], hmm->score[2], hmm->score[3], hmm->score[4], hmm->score[5]); printf (" %4d %4d %4d %4d %4d %4d\n", hmm->path[0], hmm->path[1], hmm->path[2], hmm->path[3], hmm->path[4], hmm->path[5]); } } #endif } #endif static int32 *first_phone_rchan_map; /* map 1st (left) diphone to root-chan index */ /* * Allocate that part of the search channel tree structure that is independent of the * LM in use. */ void init_search_tree (dictT *dict) { int32 w, mpx, max_ph0, i, s; dict_entry_t *de; ROOT_CHAN_T *rhmm; homophone_set = (WORD_ID *) CM_calloc (NumMainDictWords, sizeof (WORD_ID)); /* Find #single phone words, and #unique first diphones (#root channels) in dict. */ max_ph0 = -1; n_1ph_words = 0; mpx = dict->dict_list[0]->mpx; for (w = 0; w < NumMainDictWords; w++) { de = dict->dict_list[w]; if (de->mpx != mpx) quit(-1, "%s(%d): HMM tree words not all mpx or all non-mpx\n", __FILE__, __LINE__); if (de->len == 1) n_1ph_words++; else { if (max_ph0 < de->phone_ids[0]) max_ph0 = de->phone_ids[0]; } } /* Add remaining dict words (, , , noise words) to single-phone words */ n_1ph_words += (NumWords - NumMainDictWords); n_root_chan_alloc = max_ph0+1; /* Allocate and initialize root channels */ root_chan = (ROOT_CHAN_T *) CM_calloc (n_root_chan_alloc, sizeof (ROOT_CHAN_T)); for (i = 0; i < n_root_chan_alloc; i++) { root_chan[i].mpx = mpx; root_chan[i].penult_phn_wid = -1; root_chan[i].active = -1; for (s = 0; s < NODE_CNT; s++) root_chan[i].score[s] = WORST_SCORE; root_chan[i].bestscore = WORST_SCORE; root_chan[i].next = NULL; } /* Allocate space for left-diphone -> root-chan map */ first_phone_rchan_map = (int32 *) CM_calloc (n_root_chan_alloc, sizeof(int32)); /* Permanently allocate channels for single-phone words (1/word) */ rhmm = (ROOT_CHAN_T *) CM_calloc (n_1ph_words, sizeof (ROOT_CHAN_T)); i = 0; for (w = 0; w < NumWords; w++) { de = WordDict->dict_list[w]; if (de->len != 1) continue; rhmm[i].sseqid[0] = de->phone_ids[0]; rhmm[i].diphone = de->phone_ids[0]; rhmm[i].ciphone = de->ci_phone_ids[0]; rhmm[i].mpx = de->mpx; rhmm[i].active = -1; for (s = 0; s < NODE_CNT; s++) rhmm[i].score[s] = WORST_SCORE; rhmm[i].bestscore = WORST_SCORE; rhmm[i].next = NULL; word_chan[w] = (CHAN_T *) &(rhmm[i]); i++; } single_phone_wid = (WORD_ID *) CM_calloc (n_1ph_words, sizeof(WORD_ID)); /* * Create search tree once, without using LM, to know the max #nonroot chans. * search_initialize() needs this to allocate active-channel lists. */ create_search_tree (dict, 0); /* arg2=0 => tree for entire dictionary; not just words in the LMs. */ delete_search_tree (); } /* * One-time initialization of internal channels in HMM tree. */ static void init_nonroot_chan (CHAN_T *hmm, int32 ph, int32 ci) { int32 s; hmm->next = NULL; hmm->alt = NULL; for (s = 0; s < NODE_CNT; s++) hmm->score[s] = WORST_SCORE; hmm->bestscore = WORST_SCORE; hmm->info.penult_phn_wid = -1; hmm->active = -1; hmm->sseqid = ph; hmm->ciphone = ci; } /* * Allocate and initialize search channel-tree structure. If (use_lm) do so wrt the * currently active LM for the next utterance (i.e., ignore words not in LM). * At this point, all the root-channels have been allocated and partly initialized * (as per init_search_tree()), and channels for all the single-phone words have been * allocated and initialized. None of the interior channels of search-trees have * been allocated. * This routine may be called on every utterance, after delete_search_tree() clears * the search tree created for the previous utterance. Meant for reconfiguring the * search tree to suit the currently active LM. */ void create_search_tree (dictT *dict, int32 use_lm) { dict_entry_t *de; CHAN_T *hmm; ROOT_CHAN_T *rhmm; int32 w, i, j, p, ph; if (use_lm) E_INFO("%s(%d): Creating search tree\n", __FILE__, __LINE__); else E_INFO("%s(%d): Estimating maximal search tree\n", __FILE__, __LINE__); for (w = 0; w < NumMainDictWords; w++) homophone_set[w] = -1; for (i = 0; i < n_root_chan_alloc; i++) first_phone_rchan_map[i] = -1; n_1ph_LMwords = 0; n_root_chan = 0; n_nonroot_chan = 0; for (w = 0; w < NumMainDictWords; w++) { de = dict->dict_list[w]; /* Ignore dictionary words not in LM */ if (use_lm && (! dictwd_in_lm (de->fwid))) continue; /* Handle single-phone words individually; not in channel tree */ if (de->len == 1) { single_phone_wid[n_1ph_LMwords++] = w; continue; } /* Insert w into channel tree; first find or allocate root channel */ if (first_phone_rchan_map[de->phone_ids[0]] < 0) { first_phone_rchan_map[de->phone_ids[0]] = n_root_chan; rhmm = &(root_chan[n_root_chan]); rhmm->sseqid[0] = de->phone_ids[0]; rhmm->diphone = de->phone_ids[0]; rhmm->ciphone = de->ci_phone_ids[0]; n_root_chan++; } else rhmm = &(root_chan[first_phone_rchan_map[de->phone_ids[0]]]); /* Now, rhmm = root channel for w. Go on to remaining phones */ if (de->len == 2) { /* Next phone is the last; not kept in tree; add w to penult_phn_wid set */ if ((j = rhmm->penult_phn_wid) < 0) rhmm->penult_phn_wid = w; else { for (; homophone_set[j] >= 0; j = homophone_set[j]); homophone_set[j] = w; } } else { /* Add remaining phones, except the last, to tree */ ph = de->phone_ids[1]; hmm = rhmm->next; if (hmm == NULL) { rhmm->next = hmm = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); init_nonroot_chan (hmm, ph, de->ci_phone_ids[1]); n_nonroot_chan++; } else { CHAN_T *prev_hmm = NULL; for (; hmm && (hmm->sseqid != ph); hmm = hmm->alt) prev_hmm = hmm; if (! hmm) { /* thanks, rkm! */ prev_hmm->alt = hmm = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); init_nonroot_chan (hmm, ph, de->ci_phone_ids[1]); n_nonroot_chan++; } } /* de->phone_ids[1] now in tree; pointed to by hmm */ for (p = 2; p < de->len-1; p++) { ph = de->phone_ids[p]; if (! hmm->next) { hmm->next = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); hmm = hmm->next; init_nonroot_chan (hmm, ph, de->ci_phone_ids[p]); n_nonroot_chan++; } else { CHAN_T * prev_hmm = NULL; for (hmm = hmm->next; hmm && (hmm->sseqid != ph); hmm = hmm->alt) prev_hmm = hmm; if (! hmm) { /* thanks, rkm! */ prev_hmm->alt = hmm = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); init_nonroot_chan (hmm, ph, de->ci_phone_ids[p]); n_nonroot_chan++; } } } /* All but last phone of w in tree; add w to hmm->info.penult_phn_wid set */ if ((j = hmm->info.penult_phn_wid) < 0) hmm->info.penult_phn_wid = w; else { for (; homophone_set[j] >= 0; j = homophone_set[j]); homophone_set[j] = w; } } } n_1ph_words = n_1ph_LMwords; n_1ph_LMwords++; /* including */ for (w = FinishWordId; w < NumWords; w++) { de = dict->dict_list[w]; if (use_lm && (! (ISA_FILLER_WORD(w))) && (! dictwd_in_lm (de->fwid))) continue; single_phone_wid[n_1ph_words++] = w; } if (max_nonroot_chan < n_nonroot_chan+1) { /* Give some room for channels for new words added dynamically at run time */ max_nonroot_chan = n_nonroot_chan+128; E_INFO ("%s(%d): max nonroot chan increased to %d\n", __FILE__, __LINE__, max_nonroot_chan); /* Free old active channel list array if any and allocate new one */ if (active_chan_list[0] != NULL) free (active_chan_list[0]); active_chan_list[0] = (CHAN_T **) CM_calloc (max_nonroot_chan*2, sizeof(CHAN_T *)); active_chan_list[1] = active_chan_list[0] + max_nonroot_chan; } E_INFO("%s(%d): %d root, %d non-root channels, %d single-phone words\n", __FILE__, __LINE__, n_root_chan, n_nonroot_chan, n_1ph_words); #if 0 printf ("%s(%d): Main Dictionary:\n", __FILE__, __LINE__); for (w = 0; w < n_wd; w++) { de = dict->dict_list[w]; printf ("%s", de->word); for (i = 0; i < de->len; i++) printf (" %d", de->phone_ids[i]); printf ("\n"); } printf ("\n"); printf ("%s(%d): Single-phone words:\n", __FILE__, __LINE__); for (i = 0; i < n_1ph_words; i++) { printf (" %5d %s\n", single_phone_wid[i], dict->dict_list[single_phone_wid[i]]->word); } printf ("\n"); printf ("%s(%d): HMM tree:\n", __FILE__, __LINE__); for (i = 0; i < n_root_chan; i++) dump_search_tree_root (dict, root_chan+i); printf ("\n"); #endif } #if 0 int32 mid_stk[100]; static dump_search_tree_root (dictT *dict, ROOT_CHAN_T *hmm) { int32 i; CHAN_T *t; dict_entry_t *de; printf (" %d(%d):", hmm->diphone, hmm->mpx); for (i = hmm->penult_phn_wid; i >= 0; i = homophone_set[i]) { de = dict->dict_list[i]; printf (" %d: %s\n", de->phone_ids[de->len-1], de->word); } printf (" "); for (t = hmm->next; t; t = t->alt) printf (" %d", t->sseqid); printf ("\n"); mid_stk[0] = hmm->diphone; if (hmm->mpx) mid_stk[0] |= 0x80000000; for (t = hmm->next; t; t = t->alt) dump_search_tree (dict, t, 1); } static dump_search_tree (dictT *dict, CHAN_T *hmm, int32 level) { int32 i; CHAN_T *t; dict_entry_t *de; printf (" %d(%d)", mid_stk[0] & 0x7fffffff, (mid_stk[0] < 0)); for (i = 1; i < level; i++) printf (" %d", mid_stk[i]); printf (" %d:\n", hmm->sseqid); for (i = hmm->info.penult_phn_wid; i >= 0; i = homophone_set[i]) { de = dict->dict_list[i]; printf (" %d: %s\n", de->phone_ids[de->len-1], de->word); } printf (" "); for (t = hmm->next; t; t = t->alt) printf (" %d", t->sseqid); printf ("\n"); mid_stk[level] = hmm->sseqid; for (t = hmm->next; t; t = t->alt) dump_search_tree (dict, t, level+1); } #endif /* * Delete search tree by freeing all interior channels within search tree and * restoring root channel state to the init state (i.e., just after init_search_tree()). */ void delete_search_tree (void) { int32 i; CHAN_T *hmm, *sibling; for (i = 0; i < n_root_chan; i++) { hmm = root_chan[i].next; while (hmm) { sibling = hmm->alt; delete_search_subtree (hmm); hmm = sibling; } root_chan[i].penult_phn_wid = -1; root_chan[i].next = NULL; } } void delete_search_subtree (CHAN_T *hmm) { CHAN_T *child, *sibling; /* First free all children under hmm */ for (child = hmm->next; child; child = sibling) { sibling = child->alt; delete_search_subtree (child); } /* Now free hmm */ listelem_free (hmm, sizeof(CHAN_T)); } /* * Switch search module to new LM. NOTE: The LM module should have been switched first. */ void search_set_current_lm (void) { lm_t *lm; lm = lm_get_current (); if (LangModel) delete_search_tree (); create_search_tree (WordDict, 1); LangModel = lm; } /* * Compute acoustic and LM scores for each BPTable entry (segment). */ void compute_seg_scores (double lwf) { int32 bp, start_score; BPTBL_T *bpe, *p_bpe; int32 *rcpermtab; dict_entry_t *de; for (bp = 0; bp < BPIdx; bp++) { bpe = &(BPTable[bp]); if (bpe->bp == NO_BP) { bpe->ascr = bpe->score; bpe->lscr = 0; continue; } de = WordDict->dict_list[bpe->wid]; p_bpe = &(BPTable[bpe->bp]); rcpermtab = (p_bpe->r_diph >= 0) ? RightContextFwdPerm[p_bpe->r_diph] : zeroPermTab; start_score = BScoreStack[p_bpe->s_idx + rcpermtab[de->ci_phone_ids[0]]]; if (bpe->wid == SilenceWordId) { bpe->lscr = SilenceWordPenalty; } else if (ISA_FILLER_WORD(bpe->wid)) { bpe->lscr = FillerWordPenalty; } else { bpe->lscr = lm_tg_score (p_bpe->prev_real_fwid, p_bpe->real_fwid, de->fwid); bpe->lscr *= lwf; } bpe->ascr = bpe->score - start_score - bpe->lscr; } } int32 search_get_bptable_size (void) { return (BPIdx); } int32 *search_get_lattice_density ( void ) { return lattice_density; } double *search_get_phone_perplexity ( void ) { return phone_perplexity; } void search_set_hyp_total_score (int32 score) { HypTotalScore = score; } int32 search_get_sil_penalty (void) { return (SilenceWordPenalty); } int32 search_get_filler_penalty ( void ) { return (FillerWordPenalty); } BPTBL_T *search_get_bptable ( void ) { return (BPTable); } int32 *search_get_bscorestack ( void ) { return (BScoreStack); } double search_get_lw ( void ) { return (fwdtree_lw); } /* --------------- Re-process lattice a la fbs6 ----------------- */ static latnode_t *frm_wordlist[MAX_FRAMES]; static int32 *fwdflat_wordlist = NULL; static int32 n_fwdflat_chan; static int32 n_fwdflat_words; static int32 n_fwdflat_word_transition; static char *expand_word_flag = NULL; static int32 *expand_word_list = NULL; #define MIN_EF_WIDTH 4 #define MAX_SF_WIN 25 static void build_fwdflat_wordlist ( void ) { int32 i, f, sf, ef, wid, nwd; BPTBL_T *bp; latnode_t *node, *prevnode, *nextnode; dict_entry_t *de; if (! query_fwdtree_flag()) { /* No tree-search run; include all words in expansion list */ for (i = 0; i < StartWordId; i++) fwdflat_wordlist[i] = i; fwdflat_wordlist[i] = -1; return; } memset (frm_wordlist, 0, MAX_FRAMES * sizeof(latnode_t *)); for (i = 0, bp = BPTable; i < BPIdx; i++, bp++) { sf = (bp->bp < 0) ? 0 : BPTable[bp->bp].frame+1; ef = bp->frame; wid = bp->wid; if ((wid >= SilenceWordId) || (wid == StartWordId)) continue; de = WordDict->dict_list[wid]; for (node = frm_wordlist[sf]; node && (node->wid != wid); node = node->next); if (node) node->lef = ef; else { /* New node; link to head of list */ node = (latnode_t *) listelem_alloc (sizeof(latnode_t)); node->wid = wid; node->fef = node->lef = ef; node->next = frm_wordlist[sf]; frm_wordlist[sf] = node; } } /* Eliminate "unlikely" words, for which there are too few end points */ for (f = 0; f <= LastFrame; f++) { prevnode = NULL; for (node = frm_wordlist[f]; node; node = nextnode) { nextnode = node->next; if ((node->lef - node->fef < MIN_EF_WIDTH) || ((node->wid == FinishWordId) && (node->lef < LastFrame-1))) { if (! prevnode) frm_wordlist[f] = nextnode; else prevnode->next = nextnode; listelem_free (node, sizeof(latnode_t)); } else prevnode = node; } } /* Form overall wordlist for 2nd pass */ nwd = 0; memset (word_active, 0, NumWords * sizeof(int32)); for (f = 0; f <= LastFrame; f++) { for (node = frm_wordlist[f]; node; node = node->next) { if (! word_active[node->wid]) { word_active[node->wid] = 1; fwdflat_wordlist[nwd++] = node->wid; } } } fwdflat_wordlist[nwd] = -1; /* * NOTE: fwdflat_wordlist excludes , and noise words; it includes . * That is, it includes anything to which a transition can be made in the LM. */ } void destroy_frm_wordlist ( void ) { latnode_t *node, *tnode; int32 f; if (! query_fwdtree_flag()) return; for (f = 0; f <= LastFrame; f++) { for (node = frm_wordlist[f]; node; node = tnode) { tnode = node->next; listelem_free (node, sizeof(latnode_t)); } } } void build_fwdflat_chan ( void ) { int32 i, s, wid, p; dict_entry_t *de; ROOT_CHAN_T *rhmm; CHAN_T *hmm, *prevhmm; for (i = 0; fwdflat_wordlist[i] >= 0; i++) { wid = fwdflat_wordlist[i]; de = WordDict->dict_list[wid]; if (de->len == 1) continue; assert (de->mpx); assert (word_chan[wid] == NULL); rhmm = (ROOT_CHAN_T *) listelem_alloc (sizeof(ROOT_CHAN_T)); rhmm->diphone = de->phone_ids[0]; rhmm->ciphone = de->ci_phone_ids[0]; rhmm->mpx = 1; rhmm->active = -1; rhmm->bestscore = WORST_SCORE; for (s = 0; s < HMM_LAST_STATE; s++) { rhmm->sseqid[s] = 0; rhmm->score[s] = WORST_SCORE; } rhmm->sseqid[0] = rhmm->diphone; prevhmm = NULL; for (p = 1; p < de->len-1; p++) { hmm = (CHAN_T *) listelem_alloc (sizeof(CHAN_T)); hmm->sseqid = de->phone_ids[p]; hmm->info.rc_id = p+1 - de->len; hmm->active = -1; hmm->bestscore = WORST_SCORE; for (s = 0; s < HMM_LAST_STATE; s++) hmm->score[s] = WORST_SCORE; if (prevhmm) prevhmm->next = hmm; else rhmm->next = hmm; prevhmm = hmm; } alloc_all_rc (wid); if (prevhmm) prevhmm->next = word_chan[wid]; else rhmm->next = word_chan[wid]; word_chan[wid] = (CHAN_T *) rhmm; } } void destroy_fwdflat_chan ( void ) { int32 i, wid; /*, p; */ dict_entry_t *de; ROOT_CHAN_T *rhmm; CHAN_T *hmm, *nexthmm; for (i = 0; fwdflat_wordlist[i] >= 0; i++) { wid = fwdflat_wordlist[i]; de = WordDict->dict_list[wid]; if (de->len == 1) continue; assert (de->mpx); assert (word_chan[wid] != NULL); rhmm = (ROOT_CHAN_T *) word_chan[wid]; hmm = rhmm->next; listelem_free (rhmm, sizeof(ROOT_CHAN_T)); for (; hmm; hmm = nexthmm) { nexthmm = hmm->next; listelem_free (hmm, sizeof(CHAN_T)); } word_chan[wid] = NULL; } } void search_set_fwdflat_bw (double bw, double nwbw) { FwdflatLogBeamWidth = 8*LOG(bw); FwdflatLogWordBeamWidth = 8*LOG(nwbw); E_INFO ("%s(%d): Flat-pass bw = %.1e (%d), nwbw = %.1e (%d)\n", __FILE__, __LINE__, bw, FwdflatLogBeamWidth, nwbw, FwdflatLogWordBeamWidth); } void search_fwdflat_start ( void ) { int32 i, j, s; ROOT_CHAN_T *rhmm; build_fwdflat_wordlist (); build_fwdflat_chan (); BPIdx = 0; BSSHead = 0; BPTblOflMsg = 0; CurrentFrame = 0; for (i = 0; i < NumWords; i++) WordLatIdx[i] = NO_BP; /* lm3g_cache_reset (); */ /* Start search with ; word_chan[] is permanently allocated */ rhmm = (ROOT_CHAN_T *) word_chan[StartWordId]; rhmm->score[0] = 0; for (s = 1; s < HMM_LAST_STATE; s++) rhmm->score[s] = WORST_SCORE; rhmm->bestscore = WORST_SCORE; rhmm->path[0] = NO_BP; rhmm->active = 0; /* Frame in which active */ active_word_list[0][0] = StartWordId; n_active_word[0] = 1; BestScore = 0; renormalized = 0; for (i = 0; i < NumWords; i++) last_ltrans[i].sf = -1; hyp_str[0] = '\0'; hyp[0].wid = -1; n_fwdflat_chan = 0; n_fwdflat_words = 0; n_fwdflat_word_transition = 0; n_senone_active_utt = 0; compute_all_senones = query_compute_all_senones(); distScores = sc_scores[0]; if (! query_fwdtree_flag()) { /* No tree-search run; include all words (upto ) in expansion list */ j = 0; for (i = 0; i < StartWordId; i++) { if (dictwd_in_lm (WordDict->dict_list[i]->fwid)) { expand_word_list[j] = i; expand_word_flag[i] = 1; j++; } else expand_word_flag[i] = 0; } expand_word_list[j] = -1; } } void search_fwdflat_frame (float *cep, float *dcep, float *dcep_80ms, float *pcep, float *ddcep) { int32 nf, i, j; int32 *nawl; if (! compute_all_senones) { compute_fwdflat_senone_active (); SCVQScores(distScores, cep, dcep, dcep_80ms, pcep, ddcep); } else SCVQScores_all(distScores, cep, dcep, dcep_80ms, pcep, ddcep); n_senone_active_utt += n_senone_active; if (CurrentFrame >= MAX_FRAMES-1) return; BPTableIdx[CurrentFrame] = BPIdx; /* Need to renormalize? */ if ((BestScore + (2 * LogBeamWidth)) < WORST_SCORE) { E_INFO("Renormalizing Scores at frame %d, best score %d\n", CurrentFrame, BestScore); fwdflat_renormalize_scores (BestScore); } BestScore = WORST_SCORE; fwdflat_eval_chan (); fwdflat_prune_chan (); fwdflat_word_transition (); /* Create next active word list */ nf = CurrentFrame+1; nawl = active_word_list[nf & 0x1]; for (i = 0, j = 0; fwdflat_wordlist[i] >= 0; i++) { if (word_active[fwdflat_wordlist[i]]) { *(nawl++) = fwdflat_wordlist[i]; j++; } } for (i = StartWordId; i < NumWords; i++) { if (word_active[i]) { *(nawl++) = i; j++; } } n_active_word[nf & 0x1] = j; /* This code terminates the loop by updating for the next pass */ CurrentFrame = nf; if (CurrentFrame >= MAX_FRAMES-1) { E_WARN ("%s(%d): MAX_FRAMES (%d) EXCEEDED; IGNORING REST OF UTTERANCE!!\n", __FILE__, __LINE__, MAX_FRAMES); } lm_next_frame (); } void compute_fwdflat_senone_active ( void ) { int32 i, cf, w, s, d; int32 *awl, *dist; ROOT_CHAN_T *rhmm; CHAN_T *hmm; memset (senone_active_flag, 0, TotalDists * sizeof(char)); n_senone_active = 0; cf = CurrentFrame; i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { if (rhmm->mpx) { for (s = 0; s < HMM_LAST_STATE; s++) { dist = Models[rhmm->sseqid[s]].dist; d = dist[s*3]; if (! senone_active_flag[d]) { senone_active_flag[d] = 1; senone_active[n_senone_active++] = d; } } } else { dist = Models[rhmm->sseqid[0]].dist; for (s = 0; s < TRANS_CNT; s += 3) { d = dist[s]; if (! senone_active_flag[d]) { senone_active_flag[d] = 1; senone_active[n_senone_active++] = d; } } } } for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm->active == cf) { dist = Models[hmm->sseqid].dist; for (s = 0; s < TRANS_CNT; s += 3) { d = dist[s]; if (! senone_active_flag[d]) { senone_active_flag[d] = 1; senone_active[n_senone_active++] = d; } } } } } } void fwdflat_eval_chan ( void ) { int32 i, cf, w, bestscore; int32 *awl; ROOT_CHAN_T *rhmm; CHAN_T *hmm; cf = CurrentFrame; i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; bestscore = WORST_SCORE; n_fwdflat_words += i; for (w = *(awl++); i > 0; --i, w = *(awl++)) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { if (rhmm->mpx) root_chan_v_mpx_eval (rhmm); else root_chan_v_eval (rhmm); n_fwdflat_chan++; } if ((bestscore < rhmm->bestscore) && (w != FinishWordId)) bestscore = rhmm->bestscore; for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm->active == cf) { chan_v_eval (hmm); if (bestscore < hmm->bestscore) bestscore = hmm->bestscore; n_fwdflat_chan++; } } } BestScoreTable[cf] = BestScore = bestscore; } void fwdflat_prune_chan ( void ) { int32 i, cf, nf, w, s, pip, newscore, thresh, wordthresh; int32 *awl; ROOT_CHAN_T *rhmm; CHAN_T *hmm, *nexthmm; dict_entry_t *de; cf = CurrentFrame; nf = cf+1; i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; memset (word_active, 0, NumWords * sizeof(int32)); thresh = BestScore + FwdflatLogBeamWidth; wordthresh = BestScore + FwdflatLogWordBeamWidth; pip = logPhoneInsertionPenalty; for (w = *(awl++); i > 0; --i, w = *(awl++)) { de = WordDict->dict_list[w]; rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { if (rhmm->bestscore > thresh) { rhmm->active = nf; word_active[w] = 1; /* Transitions out of root channel */ newscore = rhmm->score[HMM_LAST_STATE]; if (rhmm->next) { assert (de->len > 1); newscore += + pip; if (newscore > thresh) { hmm = rhmm->next; if (hmm->info.rc_id >= 0) { for (; hmm; hmm = hmm->next) { if ((hmm->active < cf) || (hmm->score[0] < newscore)) { hmm->score[0] = newscore; hmm->path[0] = rhmm->path[HMM_LAST_STATE]; hmm->active = nf; } } } else { if ((hmm->active < cf) || (hmm->score[0] < newscore)) { hmm->score[0] = newscore; hmm->path[0] = rhmm->path[HMM_LAST_STATE]; hmm->active = nf; } } } } else { assert (de->len == 1); if (newscore > wordthresh) { save_bwd_ptr (w, newscore, rhmm->path[HMM_LAST_STATE], 0); } } } } for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm->active >= cf) { if (hmm->bestscore > thresh) { hmm->active = nf; word_active[w] = 1; newscore = hmm->score[HMM_LAST_STATE]; if (hmm->info.rc_id < 0) { newscore += pip; if (newscore > thresh) { nexthmm = hmm->next; if (nexthmm->info.rc_id >= 0) { for (; nexthmm; nexthmm = nexthmm->next) { if ((nexthmm->active < cf) || (nexthmm->score[0] < newscore)) { nexthmm->score[0] = newscore; nexthmm->path[0] = hmm->path[HMM_LAST_STATE]; nexthmm->active = nf; } } } else { if ((nexthmm->active < cf) || (nexthmm->score[0] < newscore)) { nexthmm->score[0] = newscore; nexthmm->path[0] = hmm->path[HMM_LAST_STATE]; nexthmm->active = nf; } } } } else { if (newscore > wordthresh) { save_bwd_ptr (w, newscore, hmm->path[HMM_LAST_STATE], hmm->info.rc_id); } } } else if (hmm->active != nf) { hmm->bestscore = WORST_SCORE; for (s = 0; s < HMM_LAST_STATE; s++) hmm->score[s] = WORST_SCORE; } } } } } void fwdflat_word_transition ( void ) { int32 cf, nf, b, thresh, pip, i, w, s, newscore; int32 best_silrc_score = 0, best_silrc_bp = 0; /* FIXME: good defaults? */ BPTBL_T *bp; dict_entry_t *de, *newde; int32 *rcpermtab, *rcss; ROOT_CHAN_T *rhmm; int32 *awl; double lwf; cf = CurrentFrame; nf = cf+1; thresh = BestScore + FwdflatLogBeamWidth; pip = logPhoneInsertionPenalty; best_silrc_score = WORST_SCORE; lwf = fwdflat_lw / fwdtree_lw; get_expand_wordlist (cf, MAX_SF_WIN); for (b = BPTableIdx[cf]; b < BPIdx; b++) { bp = BPTable+b; WordLatIdx[bp->wid] = NO_BP; if (bp->wid == FinishWordId) continue; de = WordDict->dict_list[bp->wid]; rcpermtab = (bp->r_diph >= 0) ? RightContextFwdPerm[bp->r_diph] : zeroPermTab; rcss = BScoreStack + bp->s_idx; for (i = 0; expand_word_list[i] >= 0; i++) { w = expand_word_list[i]; newde = WordDict->dict_list[w]; newscore = rcss[rcpermtab[newde->ci_phone_ids[0]]]; newscore += lm_tg_score(bp->prev_real_fwid, bp->real_fwid, newde->fwid) * lwf; newscore += pip; if (newscore > thresh) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { rhmm->score[0] = newscore; rhmm->path[0] = b; if (rhmm->mpx) rhmm->sseqid[0] = LeftContextFwd[rhmm->diphone][de->ci_phone_ids[de->len-1]]; rhmm->active = nf; word_active[w] = 1; } } } if (best_silrc_score < rcss[rcpermtab[SilencePhoneId]]) { best_silrc_score = rcss[rcpermtab[SilencePhoneId]]; best_silrc_bp = b; } } /* Transition to */ newscore = best_silrc_score + SilenceWordPenalty + pip; if ((newscore > thresh) && (newscore > WORST_SCORE)) { w = SilenceWordId; rhmm = (ROOT_CHAN_T *) word_chan[w]; if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { rhmm->score[0] = newscore; rhmm->path[0] = best_silrc_bp; rhmm->active = nf; word_active[w] = 1; } } /* Transition to noise words */ newscore = best_silrc_score + FillerWordPenalty + pip; if ((newscore > thresh) && (newscore > WORST_SCORE)) { for (w = SilenceWordId+1; w < NumWords; w++) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if ((rhmm->active < cf) || (rhmm->score[0] < newscore)) { rhmm->score[0] = newscore; rhmm->path[0] = best_silrc_bp; rhmm->active = nf; word_active[w] = 1; } } } /* Reset initial channels of words that have become inactive even after word trans. */ i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { rhmm->bestscore = WORST_SCORE; for (s = 0; s < HMM_LAST_STATE; s++) rhmm->score[s] = WORST_SCORE; } } } void search_fwdflat_finish ( void ) { destroy_fwdflat_chan (); destroy_frm_wordlist (); memset (word_active, 0, NumWords * sizeof(int32)); BPTableIdx[CurrentFrame] = BPIdx; CurrentFrame--; /* Backup to the last Active Frame */ LastFrame = CurrentFrame; search_postprocess_bptable ((double) fwdflat_lw/fwdtree_lw, "FWDFLAT"); #if SEARCH_PROFILE E_INFO("%8d words recognized (%d/fr)\n", BPIdx, (BPIdx+(LastFrame>>1))/(LastFrame+1)); E_INFO("%8d senones evaluated (%d/fr)\n", n_senone_active_utt, (n_senone_active_utt + (LastFrame>>1))/(LastFrame+1)); E_INFO("%8d channels searched (%d/fr)\n", n_fwdflat_chan, n_fwdflat_chan/(LastFrame+1)); E_INFO("%8d words searched (%d/fr)\n", n_fwdflat_words, n_fwdflat_words/(LastFrame+1)); E_INFO("%8d word transitions (%d/fr)\n", n_fwdflat_word_transition, n_fwdflat_word_transition/(LastFrame+1)); lm3g_cache_stats_dump (stdout); #endif } static void fwdflat_renormalize_scores (int32 norm) { ROOT_CHAN_T *rhmm; CHAN_T *hmm; int32 i, j, cf, w, *awl; cf = CurrentFrame; /* Renormalize individual word channels */ i = n_active_word[cf & 0x1]; awl = active_word_list[cf & 0x1]; for (w = *(awl++); i > 0; --i, w = *(awl++)) { rhmm = (ROOT_CHAN_T *) word_chan[w]; if (rhmm->active == cf) { for (j = 0; j < NODE_CNT; j++) if (rhmm->score[j] > WORST_SCORE) rhmm->score[j] -= norm; } for (hmm = rhmm->next; hmm; hmm = hmm->next) { if (hmm->active == cf) { for (j = 0; j < NODE_CNT; j++) if (hmm->score[j] > WORST_SCORE) hmm->score[j] -= norm; } } } renormalized = 1; } void get_expand_wordlist (int32 frm, int32 win) { int32 f, sf, ef, nwd; latnode_t *node; if (! query_fwdtree_flag()) { n_fwdflat_word_transition += StartWordId; return; } sf = frm-win; if (sf < 0) sf = 0; ef = frm+win; if (ef > MAX_FRAMES) ef = MAX_FRAMES; memset (expand_word_flag, 0, NumWords); nwd = 0; for (f = sf; f < ef; f++) { for (node = frm_wordlist[f]; node; node = node->next) { if (! expand_word_flag[node->wid]) { expand_word_list[nwd++] = node->wid; expand_word_flag[node->wid] = 1; } } } expand_word_list[nwd] = -1; n_fwdflat_word_transition += nwd; } void search_fwdflat_init ( void ) { fwdflat_wordlist = (int32 *) CM_calloc (NumWords+1, sizeof(int32)); expand_word_flag = (char *) CM_calloc (NumWords, 1); expand_word_list = (int32 *) CM_calloc (NumWords+1, sizeof(int32)); #if 0 E_INFO ("%s(%d): MIN_EF_WIDTH = %d, MAX_SF_WIN = %d\n", __FILE__, __LINE__, MIN_EF_WIDTH, MAX_SF_WIN); #endif } /* ------------------ CODE FOR TOP SENONES BASED SEARCH PRUNING ----------------- */ #define DUMP_PHN_TOPSEN_SCR 0 static void compute_phone_perplexity( void ) { int32 f, nf, p, sum, prob; double pp, sumpp; /* phone probs */ double logpp; /* log phone probs */ double perp; /* Perplexity */ register int32 ts = Table_Size; register int16 *at = Addition_Table; nf = searchFrame(); for (f = 0; f < nf - topsen_window; f++) { /* Find Sum(pscr[p]) over p */ sum = - (utt_pscr[f][0] << 4); for (p = 1; p < NumCiPhones; p++) { prob = - (utt_pscr[f][p] << 4); FAST_ADD(sum, sum, prob, at, ts); } perp = 0.0; sumpp = 0.0; for (p = 0; p < NumCiPhones; p++) { logpp = (utt_pscr[f][p] << 4); logpp = -logpp; logpp -= sum; logpp *= LOG_BASE; pp = exp(logpp); perp -= pp * logpp; sumpp += pp; } phone_perplexity[f] = perp; } for (; f <= nf; f++) phone_perplexity[f] = 1.0; } static void topsen_init ( void ) { int32 p; /* ,s; */ char const *phn_name; npa = (int32 *) CM_calloc (NumCiPhones, sizeof(int32)); npa_frm = (int32 **) CM_2dcalloc (topsen_window, NumCiPhones, sizeof(int32)); if (topsen_window > 1) { filler_phone = (int32 *) CM_calloc (NumCiPhones, sizeof(int32)); for (p = 0; p < NumCiPhones; p++) { phn_name = phone_from_id(p); filler_phone[p] = (phn_name[0] == '+'); } } else { /* All phones always potentially active */ for (p = 0; p < NumCiPhones; p++) npa[p] = 1; } } static void compute_phone_active (int32 topsenscr, int32 npa_th) { int32 *tmp, i, p, *newlist, thresh; uint16 *uttpscrp; thresh = topsenscr + npa_th; /* Remove oldest frame from active list and reclaim space */ for (i = 0; i < NumCiPhones; i++) npa[i] -= npa_frm[0][i]; tmp = npa_frm[0]; for (i = 0; i < topsen_window-1; i++) npa_frm[i] = npa_frm[i+1]; npa_frm[topsen_window-1] = tmp; newlist = tmp; /* Compute phones predicted by top senones in current frame */ memset (newlist, 0, NumCiPhones * sizeof(int32)); uttpscrp = utt_pscr[n_topsen_frm]; for (i = 0; i < NumCiPhones; i++) { if (bestpscr[i] > thresh) newlist[i] = 1; uttpscrp[i] = (-bestpscr[i]) >> 4; } /* Add phones active in current frame to cumulative active phone list */ for (i = 0; i < NumCiPhones; i++) { npa[i] += newlist[i]; if ((! filler_phone[i]) && npa[i]) n_phn_in_topsen++; } if (DUMP_PHN_TOPSEN_SCR) { static int16 *pscr = NULL; static FILE *pscrfp = NULL; char pscrfile[1024]; int32 scaled_scr; if (! pscr) pscr = (int16 *) CM_calloc (NumCiPhones, sizeof(int32)); for (i = 0; i < NumCiPhones; i++) pscr[i] = (int16) 0x8000; for (p = 0; p < NumCiPhones; p++) { scaled_scr = bestpscr[p] >> 5; if (pscr[p] < scaled_scr) pscr[p] = scaled_scr; } if (n_topsen_frm == 0) { if (pscrfp) fclose (pscrfp); sprintf (pscrfile, "%s.pscr", uttproc_get_uttid()); if ((pscrfp = fopen (pscrfile, "w")) == NULL) printf ("%s(%d): fopen(%s,w) failed\n", __FILE__, __LINE__, pscrfile); } if (pscrfp) fwrite (pscr, sizeof(uint16), NumCiPhones, pscrfp); } } uint16 **search_get_uttpscr ( void ) { return utt_pscr; } typedef struct { int32 score; int16 sf; int16 pred; } vithist_t; /* Min frames for each phone in allphone decoding */ #define MIN_ALLPHONE_SEG 3 #define PHONE_TRANS_PROB 0.0001 int32 search_uttpscr2phlat_print ( void ) { int32 *pval; int32 f, i, p, nf, maxp, best, np; int32 *pid; if (topsen_window == 1) return -1; /* No lattice available */ pval = (int32 *) CM_calloc (NumCiPhones, sizeof(int32)); pid = (int32 *) CM_calloc (NumCiPhones, sizeof(int32)); E_INFO ("Phone lattice:\n"); nf = n_topsen_frm; for (f = 0; f < nf; f++) { for (p = 0; p < NumCiPhones; p++) pval[p] = -(utt_pscr[f][p] << 4); best = (int32)0x80000000; np = 0; for (i = 0; i < NumCiPhones; i++) { maxp = 0; for (p = 1; p < NumCiPhones; p++) { if (pval[p] > pval[maxp]) maxp = p; } if (pval[maxp] - (topsen_thresh >> 1) >= best) /* Why >>1? */ pid[np++] = maxp; else break; if (best < pval[maxp]) best = pval[maxp]; pval[maxp] = (int32)0x80000000; } printf ("%5d %3d", f, np); for (i = 0; i < np; i++) printf (" %s", phone_from_id(pid[i])); printf("\n"); } free (pval); return 0; } #ifdef DUMP_VITHIST static void vithist_dump (FILE *fp, vithist_t **vithist, int32 *pid, int32 nfr, int32 n_state) { int32 i, j; for (i = 0; i < nfr; i++) { fprintf (fp, "Frame %4d\n", i); for (j = 0; j < n_state; j++) { fprintf (fp, "\t%3d %4d %10d %3d %s\n", j, vithist[i][j].sf, vithist[i][j].score, vithist[i][j].pred, phone_from_id(pid[j])); } fprintf (fp, "\n"); } fflush (fp); } #endif /* DUMP_VITHIST */ /* * Search a given CI-phone state-graph using utt_pscr scores for the best path. * Called with a completely initialized vithist matrix; in particular the start state. * (Not the most efficient, but can be used to search arbitrary phone-state graphs!) */ static search_hyp_t * search_pscr_path (vithist_t **vithist, /* properly initialized */ char **tmat, /* State-adjacency matrix [from][to] */ int32 *pid, /* CI-phoneid[0..n_state] */ int32 n_state, /* #States in graph being searched */ int32 minseg, /* Min #frames per phone segment */ double tprob, /* State transition (exit) probability */ int32 final_state) /* Nominally, where search should exit */ { int32 i, j, f, tp, newscore, bestscore, bestp, pred_bestp, nseg; search_hyp_t *head, *tmp; if (topsen_window <= 1) { E_ERROR("Must use -topsen prediction to use this feature\n"); return NULL; } tp = LOG(tprob); /* Search */ for (f = 0; f < n_topsen_frm; f++) { /* Update path scores for current frame state scores */ for (i = 0; i < n_state; i++) { vithist[f][i].score -= (utt_pscr[f][pid[i]] << 4); /* Propagate to next frame */ if (vithist[f][i].sf <= f-minseg) { newscore = vithist[f][i].score + tp; for (j = 0; j < n_state; j++) { if ((i == j) || (! tmat[i][j])) continue; if (vithist[f+1][j].score < newscore) { vithist[f+1][j].score = newscore; vithist[f+1][j].pred = i; vithist[f+1][j].sf = f+1; } } } if (vithist[f+1][i].score <= vithist[f][i].score) { vithist[f+1][i].score = vithist[f][i].score; vithist[f+1][i].pred = i; vithist[f+1][i].sf = vithist[f][i].sf; } } } /* Find proper final state to use */ if (vithist[n_topsen_frm-1][final_state].pred < 0) { E_ERROR("%s: search_pscr_path() didn't end in final state\n", uttproc_get_uttid()); #ifdef DUMP_VITHIST vithist_dump (stdout, vithist, pid, n_topsen_frm, n_state); #endif bestscore = (int32)0x80000000; bestp = -1; for (i = 0; i < n_state; i++) { if (vithist[n_topsen_frm-1][i].score > bestscore) { bestscore = vithist[n_topsen_frm-1][i].score; bestp = i; } } if ((bestp < 0) || (vithist[n_topsen_frm-1][bestp].score <= WORST_SCORE)) { E_ERROR("%s: search_pscr_path() failed\n", uttproc_get_uttid()); return NULL; } final_state = bestp; } /* Backtrace. (Hack!! Misuse of search_hyp_t to store phones, rather than words) */ head = (search_hyp_t *) listelem_alloc (sizeof(search_hyp_t)); head->wid = pid[final_state]; head->ef = n_topsen_frm - 1; head->next = NULL; head->ascr = vithist[n_topsen_frm-1][final_state].score; /* Fixed later */ nseg = 1; bestp = final_state; for (f = n_topsen_frm-2; f >= 0; --f) { pred_bestp = vithist[f+1][bestp].pred; if (pred_bestp != bestp) { head->sf = f+1; head->ascr -= (vithist[f][pred_bestp].score + tp); tmp = (search_hyp_t *) listelem_alloc (sizeof(search_hyp_t)); tmp->wid = pid[pred_bestp]; tmp->ef = f; tmp->next = head; head = tmp; head->ascr = vithist[f][pred_bestp].score; nseg++; } bestp = pred_bestp; } head->sf = 0; return head; } static void print_pscr_path (FILE *fp, search_hyp_t *hyp, char const *caption) { search_hyp_t *h; int32 pathscore, nf; if (! hyp) { E_ERROR("%s(%s): none\n", caption, uttproc_get_uttid()); return; } fprintf (fp, "%s(%s):\n", caption, uttproc_get_uttid()); pathscore = nf = 0; for (h = hyp; h; h = h->next) { fprintf (fp, "%5d %5d %10d %s\n", h->sf, h->ef, h->ascr, phone_from_id(h->wid)); pathscore += h->ascr; nf = h->ef; } nf++; fprintf (fp, "Pathscore(%s (%s)): %d /frame: %d\n", caption, uttproc_get_uttid(), pathscore, (pathscore+(nf>>1))/nf); fprintf (fp, "\n"); fflush (fp); } search_hyp_t *search_uttpscr2allphone ( void ) { static vithist_t **allphone_vithist = NULL; static char **allphone_tmat; static int32 *allphone_pid; search_hyp_t *allp; int32 i, j; if (allphone_vithist == NULL) { allphone_vithist = (vithist_t **) CM_2dcalloc (MAX_FRAMES, NumCiPhones, sizeof (vithist_t)); allphone_pid = (int32 *) CM_calloc (NumCiPhones, sizeof(int32)); for (i = 0; i < NumCiPhones; i++) allphone_pid[i] = i; allphone_tmat = (char **) CM_2dcalloc (NumCiPhones, NumCiPhones, sizeof(char)); for (i = 0; i < NumCiPhones; i++) { for (j = 0; j < NumCiPhones; j++) allphone_tmat[i][j] = 1; allphone_tmat[i][i] = 0; } } /* Start search with silencephoneid; all others inactive */ for (i = 0; i < n_topsen_frm; i++) { for (j = 0; j < NumCiPhones; j++) { allphone_vithist[i][j].score = WORST_SCORE; allphone_vithist[i][j].sf = 0; allphone_vithist[i][j].pred = -1; } } allphone_vithist[0][SilencePhoneId].score = 0; allp = search_pscr_path (allphone_vithist, allphone_tmat, allphone_pid, NumCiPhones, MIN_ALLPHONE_SEG, PHONE_TRANS_PROB, SilencePhoneId); print_pscr_path (stdout, allp, "Allphone-PSCR"); return allp; } static search_hyp_t *fwdtree_pscr_path ( void ) { int32 n_state; int32 i, j, s; dict_entry_t *de; vithist_t **pscr_vithist; char **pscr_tmat; int32 *pscr_pid; search_hyp_t *hyp; n_state = 0; for (i = 0; i < n_hyp_wid; i++) { de = WordDict->dict_list[hyp_wid[i]]; n_state += de->len; } pscr_vithist = (vithist_t **) CM_2dcalloc (MAX_FRAMES, n_state, sizeof(vithist_t)); pscr_pid = (int32 *) CM_calloc (n_state, sizeof(int32)); s = 0; for (i = 0; i < n_hyp_wid; i++) { de = WordDict->dict_list[hyp_wid[i]]; for (j = 0; j < de->len; j++) pscr_pid[s++] = de->ci_phone_ids[j]; } pscr_tmat = (char **) CM_2dcalloc (n_state, n_state, sizeof(char)); for (i = 1; i < n_state; i++) pscr_tmat[i-1][i] = 1; /* Start search with silencephoneid; all others inactive */ for (i = 0; i < n_topsen_frm; i++) { for (j = 0; j < n_state; j++) { pscr_vithist[i][j].score = WORST_SCORE; pscr_vithist[i][j].sf = 0; pscr_vithist[i][j].pred = -1; } } pscr_vithist[0][0].score = 0; hyp = search_pscr_path (pscr_vithist, pscr_tmat, pscr_pid, n_state, 1, PHONE_TRANS_PROB, n_state-1); free (pscr_vithist); free (pscr_pid); free (pscr_tmat); print_pscr_path (stdout, hyp, "FwdTree-PSCR"); return hyp; } /* * Search bptable for word wid and return its BPTable index. * Start search from the given frame frm. * Return value: BPTable index that matched. If none, -1. */ int32 search_bptbl_wordlist (int32 wid, int32 frm) { int32 b, first; first = BPTableIdx[frm]; for (b = BPIdx-1; b >= first; --b) { if (wid == WordDict->dict_list[BPTable[b].wid]->fwid) return b; } return -1; } int32 search_bptbl_pred (int32 b) { for (b = BPTable[b].bp; ISA_FILLER_WORD(BPTable[b].wid); b = BPTable[b].bp); return (WordDict->dict_list[BPTable[b].wid]->fwid); }