/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /* * fbs.h -- Interface exported by the decoder module * * ********************************************** * CMU ARPA Speech Project * * Copyright (c) 1996 Carnegie Mellon University. * ALL RIGHTS RESERVED. * ********************************************** * * HISTORY * * $Log: fbs.h,v $ * Revision 1.6 2001/12/07 17:30:00 lenzo * Clean up and remove extra lines. * * Revision 1.5 2001/12/07 05:14:19 lenzo * License 1.2. * * Revision 1.4 2001/11/20 21:22:31 lenzo * Win32 re-compatibility fixes. * * Revision 1.3 2000/12/05 01:45:11 lenzo * Restructuring, hear rationalization, warning removal, ANSIfy * * Revision 1.2 2000/02/08 20:44:32 lenzo * Changed uttproc_allphone_cepfile() to uttproc_allphone_file. * * Revision 1.1.1.1 2000/01/28 22:09:07 lenzo * Initial import of sphinx2 * * * * 05-Jan-99 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added cdcn.h and uttproc_get_cdcn_ptr(). * * 04-Nov-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added conf field to search_hyp_t. * * 30-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added ascr, lscr fields to search_hyp_t. * * 19-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added uttproc_set_logfile(). * * 10-Sep-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added uttproc_allphone_cepfile(). * * 20-Aug-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added functions uttproc_agcemax_get() and set(). * * 20-Apr-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added uttproc_set_auto_uttid_prefix(). * * 24-Mar-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added additional phone_perp field to search_hyp_t for confidence measure * based on phone perplexity. * * 08-Mar-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added additional latden field to search_hyp_t for confidence measure * based on lattice density. * * 07-Aug-96 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added uttproc_result_seg and uttproc_partial_result_seg. * Changed search_hyp_t to support linked list and include word string. * * 17-Jun-96 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added uttproc_set_context(). * * 04-Jun-96 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Added BLOCKING option to uttproc_rawdata, uttproc_cepdata, uttproc_result. * Removed uttproc_set_uttid and added id argument to uttproc_begin_utt. * * 24-May-96 M K Ravishankar (rkm@cs) at Carnegie Mellon University * Substantially modified to be driven with externally provided data, rather * than explicitly reading an A/D source. * Added uttproc_abort_utt() and uttproc_partial_result(). * Added raw and mfc logging function. * * 01-May-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added functions uttproc_cepmean_set, uttproc_cepmean_get, * uttproc_agcmax_set, uttproc_agcmax_get. * * 07-Aug-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added uttproc_rawdata(). * * 05-Aug-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added uttproc_beginutt(), uttproc_cepdata(), and uttproc_endutt(). * * 13-Jun-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Simplified the uttproc interface by combining functions and redefining * others. * * 01-Jun-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Added uttproc_set_lm() and uttproc_set_startword(). * * 01-May-95 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University * Created. */ #ifndef _FBS_H_ #define _FBS_H_ #include "s2types.h" /* * The decoder is set up to process one finite-duration utterance at a time. The * maximum duration of an utterance is about 60sec, though other resource limits, * such as the back pointer table size, could constrain the duration further. */ /* * Recognition result (hypothesis) with word segmentation information. * * FIXME: should this be in search.h? */ typedef struct search_hyp_s { char const *word; /* READ-ONLY */ int32 wid; /* For internal use of decoder */ int32 sf, ef; /* Start, end frames within utterance for this word */ int32 ascr, lscr; /* Acoustic, LM scores (not always used!) */ float conf; /* Confidence measure (roughly prob(correct)) for this word; NOT FILLED IN BY THE RECOGNIZER at the moment!! */ struct search_hyp_s *next; /* Next word segment in the hypothesis; NULL if none */ int32 latden; /* Average lattice density in segment. Larger values imply more confusion and less certainty about the result. To use it for rejection, cutoffs must be found independently */ double phone_perp; /* Average phone perplexity in segment. Larger values imply more confusion and less certainty. To use it for rejection, cutoffs must be found independently. */ } search_hyp_t; /* * Called once at initialization with the list of arguments to initialize to initialize * the decoder. If the -ctlfn argument is given, it will process the argument file in * batch mode and exit the application. * Return value: 0 if successful, -1 otherwise. */ int32 fbs_init (int32 argc, char **argv); /* Arguments for initialization */ /* * Called before quitting the application to tie up loose ends in the decoder. * Return value: 0 if successful, -1 otherwise. */ int32 fbs_end ( void ); /* * Called at the beginning of each utterance. uttid is an input string identifying the * utterance; utterance data (raw or mfc files, if any) logged under this name. The * recognition result in the "match" file also identified with this id. If uttid is * NULL, an automatically generated running sequence number (of the form %08d) is used * instead. * Return value: 0 if successful, else -1. */ int32 uttproc_begin_utt (char const *uttid); /* * Decode the next block of input samples in the current utterance. The "block" argument * specifies whether the decoder should block until all pending data have been processed. * If 0, it is "non-blocking". That is, the decoder steps through only a few pending * frames (at least 1), and the remaining input data is queued up internally for later * processing. In particular, this function can be called with 0-length data to simply * process internally queued up frames. * * NOTE: The decoder will not actually process the input data if any of the processing * depends on the entire utterance. (For example, if CMN/AGC is based on entire current * utterance.) The data are queued up internally for processing after uttproc_end_utt is * called. Also, one cannot combine uttproc_rawdata and uttproc_cepdata within the same * utterance. * * Return value: #frames internally queued up and remaining to be decoded; -1 if any * error occurs. */ int32 uttproc_rawdata (int16 *raw, /* In: Block of int16 samples */ int32 nsample, /* In: #Samples in above block; can be 0!! */ int32 block); /* In: if !0, process all data before returning */ /* * Like uttproc_rawdata, but the input data are cepstrum vectors rather than raw samples. * One cannot combine uttproc_cepdata and uttproc_rawdata within the same utterance. * Return value: #frames internally queued up and remaining to be decoded; -1 if any * error occurs. */ int32 uttproc_cepdata (float **cep, /* In: cep[i] = i-th frame of cepstrum data */ int32 nfrm, /* In: #frames of cep data; can be 0!! */ int32 block); /* In: if !0, process all data before returning */ /* * For bookkeeping purposes, marking that no more data is forthcoming in the current * utterance. It should be followed by uttproc_result to obtain the final recognition * result. * Return value: 0 if successful, else -1. */ int32 uttproc_end_utt ( void ); /* * Obtain recognition result for utterance after uttproc_end_utt has been called. In * the blocking form, all queued up data is processed and final result returned. In the * non-blocking version, only a few pending frames (at least 1) are processed. In the * latter case, the function can be called repeatedly to allow the decoding to complete. * * Return value: #frames remaining to be processed. If non-zero (non-blocking mode) the * final result is not yet available. If 0, frm and hyp contain the final recognition * result. If there is any error, the function returns -1. */ int32 uttproc_result (int32 *frm, /* Out: *frm = #frames in current utterance */ char **hyp, /* Out: *hyp = recognition string; READ-ONLY. Contents clobbered by the next uttproc_result or uttproc_partial_result call */ int32 block); /* In: If !0, process all data and return final result */ /* * Like uttproc_result, but returns a list of word segmentations instead of the full * recognition string. The list of word segmentations is READ-ONLY, and clobbered by * the next call to any of the result functions. * Use uttproc_result or uttproc_result_seg to obtain the final result, but not both! */ int32 uttproc_result_seg (int32 *frm, /* Out: *frm = #frames in utterance */ search_hyp_t **hyp, /* Out: *hyp = first element in NULL terminated linked list of word segmentations */ int32 block); /* * Obtain a partial recognition result in the middle of an utterance. This function can * be called anytime after uttproc_begin_utt and before the final uttproc_result. * Return value: 0 if successful, else -1. */ int32 uttproc_partial_result (int32 *frm, /* Out: *frm = #frames processed corresponding to the partial result */ char **hyp); /* Out: *hyp = partial recognition string, READ-ONLY. Contents clobbered by the next uttproc_result or uttproc_partial_result call. */ /* * Like uttproc_partial_result, but returns a list of word segmentations instead of * the partial recognition string. The list of word segmentations is READ-ONLY, and * clobbered by the next call to any of the result functions. */ int32 uttproc_partial_result_seg (int32 *frm, search_hyp_t **hyp); /* Out: *hyp = first element in NULL terminated linked list of word segmentations */ /* * Called instead of uttproc_end_utt to abort the current utterance immediately. No * final recognition result is available. Note that one cannot abort an utterance after * uttproc_end_utt. * Return value: 0 if successful, else -1. */ int32 uttproc_abort_utt ( void ); /* * The sequence uttproc_stop_utt()...uttproc_restart_utt() can be used to re-recognize * the current utterance. It is typically used to switch to a new language model in the * middle of an utterance, for example, based on a partial recognition result; the * switch occurs in the middle of the two calls. uttproc_stop_utt must eventually be * followed by uttproc_restart_utt. There can be no other intervening calls relating to * the current utterance; i.e., no uttproc_begin_utt, uttproc_rawdata, uttproc_cepdata, * uttproc_end_utt, uttproc_result, uttproc_partial_result, or uttproc_abort_utt. * This operation cannot be performed after uttproc_end_utt. * Return value: 0 if successful, else -1. */ int32 uttproc_stop_utt ( void ); int32 uttproc_restart_utt ( void ); /* * Perform allphone recognition on the given cepstrum file and return a linked list of * phones and segmentation. The filename should NOT contain the (.mfc) file extension. * Return value: pointer to head of linked list of search_hyp_t entries for the phone * segments; it may be NULL. It is a READ-ONLY list. It will be clobbered by the next * call to this function. */ search_hyp_t *uttproc_allphone_file (char const *file); /* Without filename extension */ /* * Obtain the uttid for the most recent utterance (in progress or just finished) * Return value: pointer to READ-ONLY string that is the utterance id. */ char const *uttproc_get_uttid ( void ); /* * For automatically generated uttid's (see uttproc_begin_utt), also use the prefix * given below. (So the uttid is formatted "%s%08d", prefix, sequence_no.) */ int32 uttproc_set_auto_uttid_prefix (char const *prefix); /* * Set the currently active LM to the given named LM. Multiple LMs can be loaded initially * (during fbs_init) or at run time using lm_read (see below). * Return value: 0 if successful, else -1. */ int32 uttproc_set_lm (char const *lmname); /* * Indicate to the decoder that the named LM has been updated (e.g., with the addition of * a new unigram). * Return value: 0 if successful, else -1. */ int32 uttproc_lmupdate (char const *lmname); /* * Set trigram history context for the next utterance. Instead of the next utterance * beginning with a clean slate, it is begun as if the two words wd1 and wd2 have just * been recognized. They are used as the (trigram) language model history for the * utterance. wd1 can be NULL if there is only a one word history wd2, or both wd1 and * wd2 can be NULL to clear any history information. * Return value: 0 if successful, else -1. */ int32 uttproc_set_context (char const *wd1, /* In: First word of history (possibly NULL) */ char const *wd2);/* In: Last (most recent) history (maybe NULL) */ /* * Set the current logging directory for per utterance raw sample files and cepstrum * files. The file names are .raw and .mfc respectively, where is * the utterance id associated with the current utterance (see uttproc_begin_utt). * Return value: 0 if successful, else -1. */ int32 uttproc_set_rawlogdir (char const *dir); int32 uttproc_set_mfclogdir (char const *dir); /* Logfile can be changed in between utterances. Return value: 0 if ok, else -1 */ int32 uttproc_set_logfile (char const *file); /* * Set and get the current cepstral means for CMN. * Return value: 0 if successful, else -1. */ int32 uttproc_cepmean_set (float *cep); /* Cepstral mean set to cep[0-12] */ int32 uttproc_cepmean_get (float *cep); /* Current cepstral mean copied into cep[0-12] */ /* Similarly, AGC-Estimated-Max */ int32 uttproc_agcemax_set (float c0max); double uttproc_agcemax_get ( void ); /* * For LISTEN project use only. (okay, but other stuff uses it anyway) */ int32 uttproc_set_startword (char const *startword); /* * Read in a new LM file (lmfile), and associate it with name lmname. If there is * already an LM with the same name, it is automatically deleted. The current LM is * undefined at this point; use uttproc_set_lm(lmname) immediately afterwards. * Return value: 0 if successful, else -1. */ int32 lm_read (char const *lmfile, /* In: LM file name */ char const *lmname, /* In: LM name associated with this model */ double lw, /* In: Language weight; typically 6.5-9.5 */ double uw, /* In: Unigram weight; typically 0.5 */ double wip); /* In: Word insertion penalty; typically 0.65 */ /* * Delete the named LM from the LM collection. The current LM is undefined at this * point. Use uttproc_set_lm(...) immediately afterwards. * Return value: 0 if successful, else -1. */ int32 lm_delete (char const *lmname); /* Read utterance data from a file (instead of from an audio device) - passed to uttproc for batch-mode processing. */ int32 adc_file_read(int16 *buf, int32 max); /* Of course you have to know how to open that file (which was cheerfully omitted from this header file in the past) */ int uttfile_open(char const *utt); /* Misc. undocumented functions. FIXME: These don't belong here! */ char const *get_current_startword(void); char const *get_ref_sent(void); char const *query_ctlfile_name ( void ); char const *query_match_file_name (void); char const *query_matchseg_file_name (void); char const *query_dumplat_dir (void); char const *query_cdcn_file (void); int32 query_lattice_size ( void ); int32 query_topsen_window ( void ); int32 query_topsen_thresh ( void ); int32 query_report_altpron ( void ); int32 query_fwdtree_flag ( void ); int32 query_fwdflat_flag ( void ); int32 query_bestpath_flag ( void ); int32 query_sampling_rate ( void ); int32 query_phone_conf ( void ); int32 query_compute_all_senones (void); int32 uttproc_init(void); int32 uttproc_end(void); int32 uttproc_feat2rawfr (int32 fr); int32 uttproc_raw2featfr (int32 fr); void uttproc_align(char *sent); /* Really should be const */ int32 uttproc_nosearch(int32 flag); int32 uttproc_get_featbuf (float **cep, float **dcep, float **dcep_80ms, float **pcep, float **ddcep); void uttprocSetcomp2rawfr(int32 num, int32 const *ptr); int32 uttprocGetcomp2rawfr(int16 **ptr); void time_align_utterance (char const *utt, FILE *out_sent_fp, char const *left_word, int32 begin_frame, char *pe_words, /* FIXME: should be const */ int32 end_frame, char const *right_word); void utt_seghyp_free(search_hyp_t *h); void run_ctl_file (char const *ctl_file_name); void run_time_align_ctl_file (char const *utt_ctl_file_name, char const *pe_ctl_file_name, char const *out_sent_file_name); void agc_set_threshold (float threshold); int32 cep_read_bin (float32 **buf, int32 *len, char const *file); int32 cep_write_bin(char const *file, float32 *buf, int32 len); /* * Obtain N-best list for current utterance: * NOTE: Should be preceded by search_save_lattice (). * NOTE: Clobbers any previously returned N-best hypotheses in *alt_out. * Arguments: * sf, ef: Start and end frame range within utterance for generating N-best list. * w1, w2: Two-word context preceding utterance; w2 is the later one. w1 may be -1 * (i.e., non-existent). w2 must be valid; it can be the word-id for . * On return, alt_out[i] = i-th hypothesis generated. * Return value: #alternative hypotheses returned; -1 if error. */ int32 search_get_alt (int32 n, /* In: No. of alternatives to produce */ int32 sf, int32 ef, /* In: Start/End frame */ int32 w1, int32 w2, /* In: context words */ search_hyp_t ***alt_out); /* Out: array of alternatives */ /* Should be called before search_get_alt */ void search_save_lattice ( void ); /* Function used internally to decode each utt in ctlfile */ search_hyp_t *run_sc_utterance (char *mfcfile, int32 sf, int32 ef, char *idspec); #endif