/* ==================================================================== * Copyright (c) 1999-2001 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * This work was supported in part by funding from the Defense Advanced * Research Projects Agency and the National Science Foundation of the * United States of America, and the CMU Sphinx Speech Consortium. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * ==================================================================== * */ /* * demo.c -- An example SphinxII program using continuous listening/silence filtering * to segment speech into utterances that are then decoded. * * HISTORY * * 15-Jun-99 Kevin A. Lenzo (lenzo@cs.cmu.edu) at Carnegie Mellon University * Added i386_linux and used ad_open_sps instead of ad_open * * 14-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University. * Created. */ /* * This is a simple, tty-based example of a SphinxII client that uses continuous listening * with silence filtering to automatically segment a continuous stream of audio input * into utterances that are then decoded. * * Remarks: * - Each utterance is ended when a silence segment of at least 1 sec is recognized. * - Single-threaded implementation for portability. * - Uses fbs8 audio library; can be replaced with an equivalent custom library. */ #include #include #include #include #include "s2types.h" #include "err.h" #include "ad.h" #include "cont_ad.h" #include "fbs.h" #ifdef WIN32 #include #else #include #include #endif #define SAMPLE_RATE 16000 static ad_rec_t *ad; /* Sleep for specified msec */ static void sleep_msec (int32 ms) { #ifdef WIN32 Sleep(ms); #else /* ------------------- Unix ------------------ */ struct timeval tmo; tmo.tv_sec = 0; tmo.tv_usec = ms*1000; select(0, NULL, NULL, NULL, &tmo); #endif } /* * Main utterance processing loop: * for (;;) { * wait for start of next utterance; * decode utterance until silence of at least 1 sec observed; * print utterance result; * } */ static void utterance_loop() { int16 adbuf[4096]; int32 k, fr, ts, rem; char *hyp; cont_ad_t *cont; char word[256]; /* Initialize continuous listening module */ if ((cont = cont_ad_init (ad, ad_read)) == NULL) E_FATAL("cont_ad_init failed\n"); if (ad_start_rec (ad) < 0) E_FATAL("ad_start_rec failed\n"); if (cont_ad_calib (cont) < 0) E_FATAL("cont_ad_calib failed\n"); for (;;) { /* Indicate listening for next utterance */ printf ("READY....\n"); fflush (stdout); fflush (stderr); /* Await data for next utterance */ while ((k = cont_ad_read (cont, adbuf, 4096)) == 0) sleep_msec(200); if (k < 0) E_FATAL("cont_ad_read failed\n"); /* * Non-zero amount of data received; start recognition of new utterance. * NULL argument to uttproc_begin_utt => automatic generation of utterance-id. */ if (uttproc_begin_utt (NULL) < 0) E_FATAL("uttproc_begin_utt() failed\n"); uttproc_rawdata (adbuf, k, 0); printf ("Listening...\n"); fflush (stdout); /* Note timestamp for this first block of data */ ts = cont->read_ts; /* Decode utterance until end (marked by a "long" silence, >1sec) */ for (;;) { /* Read non-silence audio data, if any, from continuous listening module */ if ((k = cont_ad_read (cont, adbuf, 4096)) < 0) E_FATAL("cont_ad_read failed\n"); if (k == 0) { /* * No speech data available; check current timestamp with most recent * speech to see if more than 1 sec elapsed. If so, end of utterance. */ if ((cont->read_ts - ts) > DEFAULT_SAMPLES_PER_SEC) break; } else { /* New speech data received; note current timestamp */ ts = cont->read_ts; } /* * Decode whatever data was read above. NOTE: Non-blocking mode!! * rem = #frames remaining to be decoded upon return from the function. */ rem = uttproc_rawdata (adbuf, k, 0); /* If no work to be done, sleep a bit */ if ((rem == 0) && (k == 0)) sleep_msec (20); } /* * Utterance ended; flush any accumulated, unprocessed A/D data and stop * listening until current utterance completely decoded */ ad_stop_rec (ad); cont_ad_reset (cont); printf ("Stopped listening, please wait...\n"); fflush (stdout); #if 0 /* Power histogram dump (FYI) */ cont_ad_powhist_dump (stdout, cont); #endif /* Finish decoding, obtain and print result */ uttproc_end_utt (); if (uttproc_result (&fr, &hyp, 1) < 0) E_FATAL("uttproc_result failed\n"); printf ("%d: %s\n", fr, hyp); fflush (stdout); /* Exit if the first word spoken was GOODBYE */ sscanf (hyp, "%s", word); if (strcmp (word, "goodbye") == 0) break; /* Resume A/D recording for next utterance */ if (ad_start_rec (ad) < 0) E_FATAL("ad_start_rec failed\n"); } cont_ad_close (cont); } static jmp_buf jbuf; static void sighandler(int signo) { longjmp(jbuf, 1); } int main (int argc, char *argv[]) { /* Make sure we exit cleanly (needed for profiling among other things) */ signal(SIGINT, &sighandler); fbs_init (argc, argv); if ((ad = ad_open_sps (SAMPLE_RATE)) == NULL) E_FATAL("ad_open_sps failed\n"); E_INFO("%s COMPILED ON: %s, AT: %s\n\n", argv[0], __DATE__, __TIME__); if (setjmp(jbuf) == 0) { utterance_loop (); } fbs_end (); ad_close (ad); return 0; }