//  ports//mail/crm114/work/crm114-20070810-BlameTheSegfault.src/crm_osb

//  crm_osb_winnow.c  - Controllable Regex Mutilator,  version v1.0
//  Copyright 2001-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact the 
//  author for details.  
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//    the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//    the auxiliary input buffer (for WINDOW input)
extern char *newinputbuf;

//    the globals used when we need a big buffer  - allocated once, used 
//    wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;

////////////////////////////////////////////////////////////////////
//
//     hctable - the hash coefficient table.
//
//     Entries are used as multipliers when two token hashes from the
//     hashpipe are combined into a feature hash.  Ideally the values
//     are relatively prime and superincreasing, but neither property
//     is strictly required.  The values must not be changed casually:
//     the feature hashes stored in existing statistics files were
//     computed with them.
//
static long hctable[] =
  {
    /*  [0]..[3]  */     1,    7,    3,   13,
    /*  [4]..[7]  */     5,   29,   11,   51,
    /*  [8]..[11] */    23,  101,   47,  203,
    /* [12]..[15] */    97,  407,  197,  817,
    /* [16]..[19] */   397, 1637,  797, 3277
  };
      


//    Index of the first bucket that may hold feature data.  It is
//    loaded from the .value field of bucket 0 of a statistics file;
//    bucket indices below it are never used for features.
static long spectra_start = 0;



//    How to learn Osb_Winnow style  - in this case, we'll include the single
//    word terms that may not strictly be necessary, but their weight will
//    be set to 0 in the evaluation.
//

int crm_expr_osb_winnow_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
                               char *txtptr, long txtstart, long txtlen)
{
  //     Learn the osb_winnow transform spectrum of this input window as
  //     belonging to a particular type.
  //         learn <flags> (classname) /word/
  //
  //     csl      - not used by this routine.
  //     apb      - parsed statement arguments: p1 is the statistics
  //                (.cow) file name, s1 is the optional "this is a word"
  //                regex, and sflags carries CRM_NOCASE, CRM_REFUTE,
  //                CRM_MICROGROOM and CRM_UNIGRAM.
  //     txtptr, txtstart, txtlen - the text to learn is the txtlen bytes
  //                of txtptr starting at offset txtstart.
  //
  //     Returns 0 once the learn has run (including when the word regex
  //     failed to compile or the file filled up; both are reported with
  //     nonfatalerror), or the value returned by fatalerror when the
  //     statistics file is unusable.  If a missing statistics file
  //     cannot be created, the process exits.
  //
  long i, j, k;
  long h;                   //  h is our counter in the hashpipe;
  char ptext[MAX_PATTERN];  //  the regex pattern
  long plen;
  char htext[MAX_PATTERN];  //  the hash name
  long hlen;
  long cflags;
  struct stat statbuf;      //  for statting the hash file
  long hfsize;              //  size of the hash file (bytes, then buckets)
  char *fname;              //  heap copy of the file name; freed on exit
  WINNOW_FEATUREBUCKET_STRUCT *hashes;  //  the mmapped hash file
  unsigned char *xhashes;               //  and the mask of what we've seen
  unsigned long hashpipe[OSB_WINNOW_WINDOW_LEN+1];
  regex_t regcb;
  regmatch_t match[5];      //  we only care about the outermost match
  long textoffset;
  long textmaxoffset;
  float sense;
  long microgroom;
  long use_unigrams;
  long fev;
  long made_new_file;
  long regex_compiled;      //  nonzero only after crm_regcomp succeeded

  if (internal_trace)
    fprintf (stderr, "executing an OSB-WINNOW LEARN\n");

  //           extract the hash file name
  crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
  hlen = apb->p1len;
  hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN);

  //     get the "this is a word" regex
  crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
  plen = apb->s1len;
  plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);

  //            set our cflags, if needed.  The defaults are
  //            "case" and "affirm", (both zero valued).
  //            and "microgroom" disabled.
  cflags = REG_EXTENDED;
  if (apb->sflags & CRM_NOCASE)
    {
      cflags = cflags | REG_ICASE;
      if (user_trace)
        fprintf (stderr, "turning oncase-insensitive match\n");
    }

  sense = OSB_WINNOW_PROMOTION;
  if (apb->sflags & CRM_REFUTE)
    {
      //  GROT GROT GROT Learning would be symmetrical if this were
      //       sense = 1.0 / sense;
      //  but that's inferior, because then the weights are
      //  limited to the values of sense^n.
      sense = OSB_WINNOW_DEMOTION;
      if (user_trace)
        fprintf (stderr, " refuting learning\n");
    }

  microgroom = 0;
  if (apb->sflags & CRM_MICROGROOM)
    {
      microgroom = 1;
      if (user_trace)
        fprintf (stderr, " enabling microgrooming.\n");
    }

  use_unigrams = 0;
  if (apb->sflags & CRM_UNIGRAM)
    {
      use_unigrams = 1;
      if (user_trace)
        fprintf (stderr, " enabling unigram-only operation.\n");
    }

  //
  //             grab the filename, and stat the file
  //      note that neither "stat", "fopen", nor "open" are
  //      fully 8-bit or wchar clean...
  //
  //      The leading-junk skip must stop at the terminating NUL;
  //      otherwise an empty or all-blank name walks off the end of
  //      the string.  (Assumes htext is NUL-terminated by the argument
  //      helpers above, as the second scan always has.)
  i = 0;
  while (htext[i] != '\000' && htext[i] < 0x021) i++;
  j = i;
  while (htext[j] >= 0x021) j++;

  //             filename starts at i,  ends at j. null terminate it.
  htext[j] = '\000';

  fname = strdup (&htext[i]);
  if ( !fname )
    untrappableerror(
                     "Couldn't malloc fname\n",
                     "We need that part.  Sorry.\n");

  //             and stat it to get its length
  k = stat (fname, &statbuf);

  made_new_file = 0;

  //             quick check- does the file even exist?
  if (k != 0)
    {
      //      file didn't exist... create it
      FILE *f;
      if (user_trace)
        fprintf (stderr, "\n Opening new COW file %s for write\n", fname);
      f = fopen (fname, "wb");
      if (!f)
        {
          fprintf (stderr,
                "\n Couldn't open your new COW file %s for writing; errno=%d .\n",
                 fname, errno);
          if (engine_exit_base != 0)
            {
              exit (engine_exit_base + 21);
            }
          else
            exit (EXIT_FAILURE);
        }
      //       do we have a user-specified file size?
      if (sparse_spectrum_file_length == 0 )
        {
          sparse_spectrum_file_length =
            DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH;
        }

      //       put in sparse_spectrum_file_length entries of NULL
      for (j = 0;
           j < sparse_spectrum_file_length
             * sizeof ( WINNOW_FEATUREBUCKET_STRUCT);
           j++)
        fputc ('\000', f);
      made_new_file = 1;
      fclose (f);
      //    and reset the statbuf to be correct
      k = stat (fname, &statbuf);
    }

  hfsize = statbuf.st_size;
  if (user_trace)
    fprintf (stderr, "Sparse spectra file %s has length %ld bins\n",
             fname,
             (long) (hfsize / sizeof (WINNOW_FEATUREBUCKET_STRUCT)));

  //    Bucket 0 is the header and features start at bucket 1, so a file
  //    with fewer than two buckets cannot hold anything.  Refuse it here
  //    rather than divide by a zero bucket count or index past the map.
  if (hfsize < (long) (2 * sizeof (WINNOW_FEATUREBUCKET_STRUCT)))
    {
      fev = fatalerror ("The .cow file is too small to hold any features.  "
                        "The filename is: ",
                        fname);
      free (fname);
      return (fev);
    }

  //
  //         open the .cow hash file into memory so we can bitwhack it
  //
  hashes = (WINNOW_FEATUREBUCKET_STRUCT *)
    crm_mmap_file (
                   fname,
                   0, hfsize,
                   PROT_READ | PROT_WRITE,
                   MAP_SHARED,
                   NULL);
  if (hashes == MAP_FAILED)
    {
      fev = fatalerror ("Couldn't memory-map the .cow file named: ",
                        fname);
      free (fname);
      return (fev);
    }

  //          if this is a new file, set the proper version number.
  if (made_new_file)
    {
      hashes[0].hash  = 1;
      hashes[0].key   = 0;
      hashes[0].value = 1;
    }

  //        check the version of the file
  //
#ifdef CSS_VERSION_CHECK
  if (hashes[0].hash != 1 ||
      hashes[0].key  != 0 )
    {
      fprintf (stderr, "Hash was: %ld, key was %ld\n", hashes[0].hash, hashes[0].key);
      fev =fatalerror ("The .cow file is the wrong type!  We're expecting "
                       "a Osb_Winnow-spectrum file.  The filename is: ",
                       fname);
      //   don't leave the mapping or the name behind on this exit
      crm_munmap_file ((void *) hashes);
      free (fname);
      return (fev);
    }
#endif

  //
  //         In this format, bucket 0.value contains the start of the spectra.
  //
  hashes[0].value = 1;
  spectra_start = hashes[0].value;

  //
  //   now set the hfsize to the number of entries, not the number
  //   of bytes total
  hfsize = hfsize / sizeof ( WINNOW_FEATUREBUCKET_STRUCT );

  //    and allocate the mask-off flags for this file
  //    so we only use each feature at most once
  //
  xhashes = calloc ( hfsize, (sizeof (unsigned char)) );
  if ( !xhashes )
    untrappableerror(
                     "Couldn't malloc xhashes\n",
                     "We need that part.  Sorry.\n");

  //   compile the word regex
  //
  regex_compiled = 0;
  if ( internal_trace)
    fprintf (stderr, "\nWordmatch pattern is %s", ptext);
  i = crm_regcomp (&regcb, ptext, plen, cflags);
  if ( i > 0)
    {
      crm_regerror ( i, &regcb, tempbuf, data_window_size);
      nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
      goto regcomp_failed;
    }
  regex_compiled = 1;

  //   Start by priming the pipe... we will shift to the left next.
  //     sliding, hashing, xoring, moduloing, and incrementing the
  //     hashes till there are no more.
  k = 0;
  j = 0;
  i = 0;

  textoffset = txtstart;
  textmaxoffset = txtstart + txtlen;

  //   init the hashpipe with 0xDEADBEEF
  for (h = 0; h < OSB_WINNOW_WINDOW_LEN; h++)
    {
      hashpipe[h] = 0xDEADBEEF;
    }

  //    and the big loop...  one pass per token found in the text.
  i = 0;
  while (k == 0 && textoffset <= textmaxoffset)
    {
      long wlen;
      long slen;
      unsigned long hindex;
      unsigned long h1, h2;
      unsigned long incrs;
      long poly;

      //  slen = endpoint (= start + len) - startpoint (= curr textoffset)
      slen = textmaxoffset - textoffset;

      // if pattern is empty, extract non graph delimited tokens
      // directly ([[graph]]+) instead of calling regexec
      if (ptext[0] != '\0')
        {
          k = crm_regexec (&regcb, &(txtptr[textoffset]),
                           slen, 5, match, 0, NULL);
        }
      else
        {
          k = 0;
          //   skip non-graphical characters.  The bounds test comes
          //   first so we never read the byte at textmaxoffset, and the
          //   byte is passed to isgraph as unsigned char because a
          //   negative char value is undefined there.
          match[0].rm_so = 0;
          while (textoffset + match[0].rm_so < textmaxoffset
                 && !isgraph ((unsigned char)
                              txtptr[textoffset + match[0].rm_so]))
            match[0].rm_so ++;
          match[0].rm_eo = match[0].rm_so;
          while (textoffset + match[0].rm_eo < textmaxoffset
                 && isgraph ((unsigned char)
                             txtptr [textoffset + match[0].rm_eo]))
            match[0].rm_eo ++;
          if ( match[0].rm_so == match[0].rm_eo)
            k = 1;
        }

      //   no more tokens: we're done with the text.
      if (k != 0 || textoffset > textmaxoffset)
        break;

      wlen = match[0].rm_eo - match[0].rm_so;
      memmove (tempbuf,
               &(txtptr[textoffset + match[0].rm_so]),
               wlen);
      tempbuf[wlen] = '\000';

      if (internal_trace)
        {
          fprintf (stderr,
                   "  Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
                   i,
                   textoffset,
                   (long) match[0].rm_so,
                   (long) match[0].rm_eo,
                   wlen,
                   tempbuf);
        }
      if (match[0].rm_eo == 0)
        {
          nonfatalerror ( "The LEARN pattern matched zero length! ",
                          "\n Forcing an increment to avoid an infinite loop.");
          match[0].rm_eo = 1;
        }

      //      Shift the hash pipe down one
      //
      for (h = OSB_WINNOW_WINDOW_LEN-1; h > 0; h--)
        {
          hashpipe [h] = hashpipe [h-1];
        }

      //  and put new hash into pipeline
      hashpipe[0] = strnhash (tempbuf, wlen);

      if (internal_trace)
        {
          fprintf (stderr, "  Hashpipe contents: ");
          for (h = 0; h < OSB_WINNOW_WINDOW_LEN; h++)
            fprintf (stderr, " %ld", hashpipe[h]);
          fprintf (stderr, "\n");
        }

      //  and account for the text used up.
      textoffset = textoffset + match[0].rm_eo;
      i++;

      //
      //     We always run the hashpipe now, even if it's just full of
      //     0xDEADBEEF.  Note that we start at poly==1 here, so that
      //     the pair features never include the bare unigram; in
      //     unigram mode a single feature is built from hashpipe[0]
      //     alone and the loop is then forced to end.
      //
      for (poly = 1;
           poly < OSB_WINNOW_WINDOW_LEN;
           poly++)
        {
          if (use_unigrams)
            {
              h1 = hashpipe[0]*hctable[0];
              if (h1 < spectra_start)
                h1 = spectra_start;
              h2 = hashpipe[0]*hctable[1];
              if (h2 == 0) h2 = 0xdeadbeef;
              poly = OSB_WINNOW_WINDOW_LEN;
            }
          else
            {
              h1 = hashpipe[0]*hctable[0]
                + hashpipe[poly] * hctable[poly<<1];
              if (h1 < spectra_start)
                h1 = spectra_start;
              h2 = hashpipe[0]*hctable[1]
                + hashpipe[poly] * hctable[(poly<<1)-1];
              if (h2 == 0) h2 = 0xdeadbeef;
            }
          hindex = h1 % hfsize;
          if (hindex < spectra_start ) hindex = spectra_start;

          if (internal_trace)
            fprintf (stderr, "Polynomial %ld has h1:%ld  h2: %ld\n",
                     poly, h1, h2);

          //
          //   we now look at both the primary (h1) and
          //   crosscut (h2) indexes to see if we've got
          //   the right bucket or if we need to look further.
          //   The probe stops at a matching bucket or at the first
          //   bucket whose value is zero.
          //
          incrs = 0;
          while((!((hashes[hindex].hash==h1)&&(hashes[hindex].key==h2)))
                && (hashes[hindex].value != 0))
            {
              //
              //       If microgrooming is enabled, and we've found a
              //       chain that's too long, we groom it down.
              //
              if (microgroom && (incrs > MICROGROOM_CHAIN_LENGTH))
                {
                  //     set the random number generator up...
                  //     note that this is repeatable for a
                  //     particular test set, yet dynamic.  That
                  //     way, we don't always autogroom away the
                  //     same feature; we depend on the previous
                  //     feature's key.
                  srand ( (unsigned int) h2);

                  //   reset our hindex to where we started...
                  hindex = h1 % hfsize;
                  if (hindex < spectra_start ) hindex = spectra_start;

                  //    and microgroom.
                  crm_winnow_microgroom
                    ( hashes, xhashes, hfsize, hindex);
                  incrs = 0;
                }
              //      check to see if we've incremented ourself all the
              //      way around the .cow file.  If so, we're full, and
              //      can hold no more features (this is unrecoverable).
              //      The compare is done signed so that a very small
              //      table (hfsize < 3) still trips it instead of
              //      wrapping to a huge unsigned limit.
              if ((long) incrs > hfsize - 3)
                {
                  nonfatalerror ("Your program is stuffing too many "
                                 "features into this size .cow file.  "
                                 "Adding any more features is "
                                 "impossible in this file.",
                                 "You are advised to build a larger "
                                 ".cow file and merge your data into "
                                 "it.");
                  goto learn_end_regex_loop;
                }
              //
              //    This isn't the hash bucket we're looking for.  Move
              //    along, move along....
              incrs++;
              hindex++;
              if (hindex >= hfsize) hindex = spectra_start;
            }

          if (internal_trace)
            {
              if (hashes[hindex].value == 0)
                {
                  fprintf (stderr,"New feature at %ld\n", hindex);
                }
              else
                {
                  fprintf (stderr, "Old feature at %ld\n", hindex);
                }
            }

          //      With _winnow_, we just multiply by the sense factor.
          //      xhashes makes sure each bucket is adjusted at most
          //      once per learn.
          //
          if (xhashes[hindex] == 0)
            {
              hashes[hindex].hash = h1;
              hashes[hindex].key  = h2;
              xhashes[hindex] = 1;
              if (hashes[hindex].value > 0.0)
                {
                  hashes[hindex].value = hashes[hindex].value * sense;
                }
              else
                {
                  hashes[hindex].value = sense;
                }
            }
        }
    }
  //   end the while k==0

 learn_end_regex_loop:
 regcomp_failed:

  //  and remember to let go of the mmap and the pattern buffer
  // (and force a cache purge)
  crm_munmap_file ((void *) hashes);

  free (xhashes);

#ifdef POSIX
  //    Because mmap/alter/munmap doesn't set atime, nor set the "modified"
  //    flag, some network filesystems will fail to mark the file as
  //    modified and so their cacheing will make a mistake.
  //
  //    The fix is to do a trivial read/write on the .cow file, to force
  //    the filesystem to repropagate its caches.
  //
  //    Only bytes that were actually read are written back, so a failed
  //    open or a failed/short read can never scribble junk on the header.
  //
  {
    int hfd;                  //  hashfile fd
    long nread;
    FEATURE_HEADER_STRUCT foo;
    hfd = open (fname, O_RDWR);
    if (hfd >= 0)
      {
        nread = read (hfd, &foo, sizeof(foo));
        if (nread > 0)
          {
            lseek (hfd, 0, SEEK_SET);
            if (write (hfd, &foo, nread) != nread && user_trace)
              fprintf (stderr,
                       " Couldn't rewrite the header of %s; errno=%d\n",
                       fname, errno);
          }
        close (hfd);
      }
  }
#endif

  //   free the regex only if the compile actually succeeded
  if (regex_compiled) crm_regfree (&regcb);
  free (fname);
  return (0);
}



//      How to do an Osb_Winnow CLASSIFY of some text.
//
int crm_expr_osb_winnow_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
				  char *txtptr, long txtstart, long txtlen)
{
  //      classify the sparse spectrum of this input window 
  //      as belonging to a particular type.
  //
  //       This code should look very familiar- it's cribbed from
  //       the code for LEARN
  //
  long i, j, k;
  long h;          //  we use h for our hashpipe counter, as needed.
  char ptext[MAX_PATTERN];  //  the regex pattern
  long plen;
  //  the hash file names
  char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN];  
  long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN;
  long hlen;
  //  the match statistics variable inbuf
  char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; 
  long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); 
  long slen;
  char svrbl[MAX_PATTERN];  //  the match statistics text buffer
  long svlen;
  long fnameoffset;
  char fname[MAX_FILE_NAME_LEN];
  long eflags;
  long cflags;
  long not_microgroom = 1;
  long use_unigrams;

  struct stat statbuf;      //  for statting the hash file
  unsigned long hashpipe[OSB_WINNOW_WINDOW_LEN+1]; 
  regex_t regcb;
  regmatch_t match[5];      //  we only care about the outermost match

  double fcounts [MAX_CLASSIFIERS]; // total counts for feature normalize
  unsigned long totalcount = 0;

  double cpcorr[MAX_CLASSIFIERS];  // corpus correction factors
  double hits[MAX_CLASSIFIERS];  // actual hits per feature per classifier
  long totalhits[MAX_CLASSIFIERS];  // actual total hits per classifier
  double totalweights[MAX_CLASSIFIERS];  //  total of hits * weights
  double unseens[MAX_CLASSIFIERS]; //  total unseen features.
  double classifierprs[MAX_CLASSIFIERS]; //  pR's of each class
  long totalfeatures;   //  total features
  double htf;             // hits this feature got.
  double tprob = 0;         //  total probability in the "success" domain.

  //double textlen;    //  text length  - rougly corresponds to
                        //  information content of the text to classify

  WINNOW_FEATUREBUCKET_STRUCT *hashes[MAX_CLASSIFIERS];
  unsigned char *xhashes[MAX_CLASSIFIERS];
  long hashlens[MAX_CLASSIFIERS];
  char *hashname[MAX_CLASSIFIERS];
  long succhash;
  long vbar_seen;	// did we see '|' in classify's args?
  long maxhash;
  long fnstart, fnlen;
  long fn_start_here;
  long textoffset;
  long textmaxoffset;
  long bestseen;
  long thistotal;

  double top10scores[10];
  long top10polys[10];
  char top10texts[10][MAX_PATTERN];


  if (internal_trace)
    fprintf (stderr, "executing an OSB-WINNOW CLASSIFY\n");
  
  // 
  //      We get the to-be-classified text from the caller now.
  //
  //  crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len);
  // llen = apb->b1len;
  // llen = crm_nexpandvar (ltext, llen, MAX_PATTERN);
  
  //           extract the hash file names
  crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len);
  hlen = apb->p1len;
  hlen = crm_nexpandvar (htext, hlen, htext_maxlen);
  
  //           extract the "this is a word" regex
  //
  crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
  plen = apb->s1len;
  plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
  
  //            extract the optional "match statistics" variable
  //
  crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
  svlen = apb->p2len;
  svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN);
  { 
    long vstart, vlen;
    crm_nextword (svrbl, svlen, 0, &vstart, &vlen);
    memmove (svrbl, &svrbl[vstart], vlen);
    svlen = vlen;
    svrbl[vlen] = '\000';
  };
  
  //     status variable's text (used for output stats)
  //    
  stext[0] = '\000';
  slen = 0;
  
  //            set our flags, if needed.  The defaults are
  //            "case" 
  cflags = REG_EXTENDED;
  eflags = 0;

  if (apb->sflags & CRM_NOCASE)
    {
      cflags += REG_ICASE;
      eflags = 1;
    };

  not_microgroom = 1;
  if (apb->sflags & CRM_MICROGROOM)
    {
      not_microgroom = 0;
      if (user_trace)
	fprintf (stderr, " disabling fast-skip optimization.\n");
    };

  use_unigrams = 0;
  if (apb->sflags & CRM_UNIGRAM)
    {
      use_unigrams = 1;
      if (user_trace)
	fprintf (stderr, " enabling unigram-only operation.\n");
    };

  //   compile the word regex
  if ( internal_trace)
    fprintf (stderr, "\nWordmatch pattern is %s", ptext);
  i = crm_regcomp (&regcb, ptext, plen, cflags);
  if ( i > 0)
    {
      crm_regerror ( i, &regcb, tempbuf, data_window_size);
      nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
      goto regcomp_failed;
    };
  

  
  //       Now, the loop to open the files.  
  bestseen = 0;
  thistotal = 0;
  //  goodcount = evilcount = 1;   // prevents a divide-by-zero error.
  //cpgood = cpevil = 0.0;
  //ghits = ehits = 0.0 ;
  //psucc = 0.5;
  //pfail = (1.0 - psucc);
  //pic = 0.5;
  //pnic = 0.5;
  

  //      initialize our arrays for N .css files
  for (i = 0; i < MAX_CLASSIFIERS; i++) 
    {
      fcounts[i] = 0.0;    // check later to prevent a divide-by-zero 
    			 // error on empty .css file
      cpcorr[i] = 0.0;   // corpus correction factors
      hits[i] = 0.0;     // absolute hit counts 
      totalhits[i] = 0.0;        // absolute hit counts 
      totalweights[i] = 0.0;     // hit_i * weight*i count
      unseens[i] = 0.0;       // text features not seen in statistics files
    };

  for (i = 0; i < 10; i++)
    {
      top10scores[i] = 0;
      top10polys[i] = 0;
      strcpy (top10texts[i], "");
    };
  //
  //     --  The Winnow evaluator --
  //
  //    Winnow is NOT a bayesian evaluator.  Instead, it generates
  //    a set of positive-only weights for each feature.  If a 
  //    feature is present, it's weight is added to the total.
  //    The feature file with the greater total wins.  Simple, eh?
  //
  //    Initial weights (set when a feature is first seen in learning)
  //    is 1.0.  Whenever a feature is "learned" as true, it's weight
  //    is multiplied by the OSB_PROMOTION factor.  When it's learned
  //    as incorrect, it's multiplied by the OSB_DEMOTION factor.
  //
  //
  
  vbar_seen = 0;
  maxhash = 0;
  succhash = 0;
  fnameoffset = 0;
  //    now, get the file names and mmap each file
  //     get the file name (grody and non-8-bit-safe, but doesn't matter
  //     because the result is used for open() and nothing else.
  //   GROT GROT GROT  this isn't NULL-clean on filenames.  But then
  //    again, stdio.h itself isn't NULL-clean on filenames.
  if (user_trace)
    fprintf (stderr, "Classify list: -%s- \n", htext);
  fn_start_here = 0;
  fnlen = 1;
  while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1)))
    {
      crm_nextword (htext,
		    hlen, fn_start_here,
		    &fnstart, &fnlen); 
      if (fnlen > 0)
	{
	  strncpy (fname, &htext[fnstart], fnlen);
	  fn_start_here = fnstart + fnlen + 1;
	  fname[fnlen] = '\000';
	  if (user_trace)
	    fprintf (stderr, "Classifying with file -%s- "\
			     "succhash=%ld, maxhash=%ld\n",
			     fname, succhash, maxhash);
	  if ( fname[0] == '|' && fname[1] == '\000')
	    {
	      if (vbar_seen)
		{
		  nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n" ,
				 "We'll ignore it for now.");
		}
	      else
		{
		  succhash = maxhash;
		};
	      vbar_seen ++;
	    }
	  else
	    {
	      //  be sure the file exists 
	      //             stat the file to get it's length
	      k = stat (fname, &statbuf);
	      //             quick check- does the file even exist?
	      if (k != 0)
		{
		  nonfatalerror ("Nonexistent Classify table named: ",
				 fname);
		}
	      else
		{
		  //  file exists - do the open/process/close      
		  //    
		  hashlens[maxhash] = statbuf.st_size;
		  //  mmap the hash file into memory so we can bitwhack it
		  hashes[maxhash] = (WINNOW_FEATUREBUCKET_STRUCT *)
		    crm_mmap_file ( fname, 
				    0, hashlens[maxhash],
				    PROT_READ,
				    MAP_SHARED,
				    NULL);

		  if (hashes[maxhash] == MAP_FAILED )
		    {
		      nonfatalerror ("Couldn't memory-map the table file",
				     fname);
		    }
		  else
		    {
		      //
#ifdef CSS_VERSION_CHECK
		      //     Check to see if this file is the right version
		      //
		      long fev;
		      if (hashes[maxhash][0].hash != 1 ||
			  hashes[maxhash][0].key  != 0) 
			{
			  fev =fatalerror ("The .css file is the wrong type!  We're expecting "
					   "a Osb_Winnow-spectrum file.  The filename is: ",
					   &htext[i]);
			  return (fev);
			};
#endif
		      //     grab the start of the actual spectrum data.
		      //
		      spectra_start = hashes[maxhash][0].value;
		      
		      
		      //  set this hashlens to the length in features instead
		      //  of the length in bytes.
		      hashlens[maxhash] = hashlens[maxhash] / sizeof (WINNOW_FEATUREBUCKET_STRUCT);
		      hashname[maxhash] = (char *) malloc (fnlen+10);
		      if (!hashname[maxhash])
			untrappableerror(
					 "Couldn't malloc hashname[maxhash]\n","We need that part later, so we're stuck.  Sorry.");
		      strncpy(hashname[maxhash],fname,fnlen);
		      hashname[maxhash][fnlen]='\000';
		      
		      //    and allocate the mask-off flags for this file
		      //    so we only use each feature at most once
		      //
		      xhashes[maxhash] = calloc (hashlens[maxhash], 
						 sizeof (unsigned char));
		      if (!xhashes[maxhash])
			untrappableerror(
					 "Couldn't malloc xhashes[maxhash]\n",
					 "We need that part.  Sorry.\n");
		      
		      maxhash++;
		    };
		};
	    };
	  if (maxhash > MAX_CLASSIFIERS-1)
	    nonfatalerror ("Too many classifier files.",
			   "Some may have been disregarded");
	};
    };
  
  //
  //    If there is no '|', then all files are "success" files.  
  if (succhash == 0) 
    succhash = maxhash;
  
  //    a CLASSIFY with no arguments is always a "success".
  if (maxhash == 0)
    return (0);

  if (user_trace)
    fprintf (stderr, "Running with %ld files for success out of %ld files\n",
	     succhash, maxhash );

  // sanity checks...  Uncomment for super-strict CLASSIFY.
  // 
  //	do we have at least 1 valid .css files?
  if (maxhash == 0)
    {
      fatalerror ("Couldn't open at least 1 .cow files for classify().", "");
    };
  //	do we have at least 1 valid .cow file at both sides of '|'?
  //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2))
  //  {
  //    nonfatalerror (
  //      "Couldn't open at least 1 .css file per SUCC | FAIL classes "
  //	" for classify().\n","Hope you know what are you doing.");
  //  };

  {
    long ifile;
    long k;
    //      count up the total first
    for (ifile = 0; ifile < maxhash; ifile++)
      {
	fcounts[ifile] = 0.0 ;
	for (k = 1; k < hashlens[ifile]; k++)
	  fcounts [ifile] = fcounts[ifile] + hashes[ifile][k].value;
	if (fcounts[ifile] == 0.0) fcounts[ifile] = 1.0 ;
	totalcount = totalcount + fcounts[ifile];
      };
    //
    //     calculate cpcorr (count compensation correction)
    //

    for (ifile = 0; ifile < maxhash; ifile++)
      {
	//  cpcorr [ifile] = ( totalcount / (fcounts[ifile] * (maxhash-1)));
	//
	//   disable cpcorr for now... unclear that it's useful.
	cpcorr[ifile] = 1.0;
      };
  };
  
  //
  //   now all of the files are mmapped into memory,
  //   and we can do the polynomials and add up points.
  i = 0;
  j = 0;
  k = 0;
  thistotal = 0;

#ifdef OLD_STUPID_VAR_RESTRICTION
  if (llen > 0)
    {
      vhtindex = crm_vht_lookup (vht, ltext, llen );
    }
  else
    {
      vhtindex = crm_vht_lookup (vht, ":_dw:", 5);
    }
  if (vht[vhtindex] == NULL)
    {
      return (fatalerror (" Attempt to CLASSIFY from a nonexistent variable ",
			  ltext));
    };
  mdw = NULL;
  if (tdw->filetext == vht[vhtindex]->valtxt)
    mdw = tdw;
  if (cdw->filetext == vht[vhtindex]->valtxt)
    mdw = cdw;
  if (mdw == NULL)
    return ( fatalerror (" Bogus text block containing variable ", ltext));  
  textoffset = vht[vhtindex]->vstart;
  textmaxoffset = textoffset + vht[vhtindex]->vlen;

  textlen = (vht[vhtindex]->vlen);
  if (textlen < 1.0) textlen = 1.0; 
#endif
  textoffset = txtstart;
  textmaxoffset = txtstart + txtlen;


  //   init the hashpipe with 0xDEADBEEF 
  for (h = 0; h < OSB_WINNOW_WINDOW_LEN; h++)
    {
      hashpipe[h] = 0xDEADBEEF;
    };

  totalfeatures = 0;
  
  //  stop when we no longer get any regex matches
  //   possible edge effect here- last character must be matchable, yet
  //    it's also the "end of buffer".
  while (k == 0 && textoffset <= textmaxoffset)
    {
      long wlen;
      long slen;
      //      unsigned char *ptok = &(mdw->filetext[textoffset]);
      //  unsigned char *ptok_max = &(mdw->filetext[textmaxoffset]);

      //  do the regex  
      //      slen = txtlen - textoffset;
      slen = textmaxoffset - textoffset;

      // if pattern is empty, extract non graph delimited tokens
      // directly ([[graph]]+) instead of calling regexec
      if (ptext[0] != '\0')
	{
	  k = crm_regexec (&regcb, &(txtptr[textoffset]),
			   slen, 5, match, 0, NULL);
	}
      else
	{
	  k = 0;
	  //         skip non-graphical characters
	  match[0].rm_so = 0;
	  while (!isgraph (txtptr[textoffset + match[0].rm_so])
		 && textoffset + match[0].rm_so < textmaxoffset)
	    match[0].rm_so ++;
	  match[0].rm_eo = match[0].rm_so;
	  while (isgraph (txtptr [textoffset + match[0].rm_eo])
		 && textoffset + match[0].rm_eo < textmaxoffset)
	    match[0].rm_eo ++;
	  if ( match[0].rm_so == match[0].rm_eo)
	    k = 1;
	}

      if (k != 0 || textoffset > textmaxoffset)
	goto classify_end_regex_loop;
      
      wlen = match[0].rm_eo - match[0].rm_so;
      memmove (tempbuf, 
	       &(txtptr[textoffset + match[0].rm_so]),
	       wlen);
      tempbuf[wlen] = '\000';
      
      if (internal_trace)
	{		   
	  fprintf (stderr, 
		   "  Classify #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n", 
		   i, 
		   textoffset,
		   (long) match[0].rm_so, 
		   (long) match[0].rm_eo,
		   wlen,
		   tempbuf);
	  
	};
      if (match[0].rm_eo == 0)
	{
	  nonfatalerror ( "The CLASSIFY pattern matched zero length! ",
			  "\n Forcing an increment to avoid an infinite loop.");
	  match[0].rm_eo = 1;
	};
      //  slide previous hashes up 1
      for (h = OSB_WINNOW_WINDOW_LEN-1; h >= 1; h--)
	{
	  hashpipe [h] = hashpipe [h-1];
	};

      
      //  and put new hash into pipeline
      hashpipe[0] = strnhash ( tempbuf, wlen);
      
      if (0) 
	  {
	    fprintf (stderr, "  Hashpipe contents: ");
	    for (h = 0; h < OSB_WINNOW_WINDOW_LEN; h++)
	      fprintf (stderr, " %ld", hashpipe[h]);
	    fprintf (stderr, "\n");
	  };
      
      //   account for the text we used up...
      textoffset = textoffset + match[0].rm_eo;
      i++;
      
      //        is the pipe full enough to do the hashing?
      if (1)   //  we init with 0xDEADBEEF, so the pipe is always full (i >=5)
	{
	  int j, k;
	  unsigned th=0;          //  a counter used only in TSS hashing
	  unsigned long hindex;
	  unsigned long h1, h2;
	  //unsigned long good, evil;
	  //
	  //
	  th = 0;

	  //
	  //     Note that we start at j==1 here, so that we do NOT
	  //     ever calculate (or save) the unigrams.
	  //
	  for (j = 1; 
	       j < OSB_WINNOW_WINDOW_LEN;
	       j++)
	    {
	      if (use_unigrams)
		{
		  h1 = hashpipe[0]*hctable[0];
		  if (h1 < spectra_start)
		    h1 = spectra_start;
		  h2 = hashpipe[0]*hctable[1];
		  if (h2 == 0) h2 = 0xdeadbeef;
		  j = OSB_WINNOW_WINDOW_LEN;
		}
	      else
		{
		  h1 = hashpipe[0]*hctable[0] + hashpipe[j] * hctable[j<<1];
		  if (h1 < spectra_start)
		    h1 = spectra_start;
		  h2 = hashpipe[0]*hctable[1] + hashpipe[j] * hctable[(j<<1)-1];
		  if (h2 == 0) h2 = 0xdeadbeef;
		};

	      hindex = h1;
	      if (internal_trace)
		fprintf (stderr, "Polynomial %d has h1:%ld  h2: %ld\n",
			 j, h1, h2);

	      //    Now, for each of the feature files, what are
	      //    the statistics (found, not found, whatever)
	      //
	      htf = 0;
	      totalfeatures++;
	      for (k = 0; k < maxhash; k++)
		{
		  long lh, lh0;
		  float z;
		  lh = hindex % (hashlens[k]);
		  if (lh < spectra_start ) lh = spectra_start;
		  lh0 = lh;
		  hits[k] = 0;
		  while ( hashes[k][lh].key != 0 
			  && ( hashes[k][lh].hash != h1    
			       || hashes[k][lh].key  != h2 ))
		    {
		      lh++;
		      if (lh >= hashlens[k]) lh = spectra_start;
		      if (lh == lh0) break; // wraparound
		    };

		  //   Did we find the feature?  Or did we hit end-of-chain?
		  //
		  if (hashes[k][lh].hash == h1 && hashes[k][lh].key == h2)
		    {
		      //    found the feature
		      //
		      //    Have we seen it before?
		      if (xhashes[k][lh] == 0)
			{
			  // remember totalhits
			  htf = htf + 1;            // and hits-this-feature
			  hits[k] ++;               // increment hits.
			  z = hashes[k][lh].value; 
			  //		      fprintf (stdout, "L: %f  ", z);
			  // and weight sum
			  totalweights[k] = totalweights[k] + z; 
			  totalhits[k] = totalhits[k] + 1;
			  //
			  //  and mark the feature as seen.
			  xhashes[k][lh] = 1;
			};
		    }
		  else
		    {
		      // unseens score 1.0, which is totally ambivalent; seen
		      //  and accepted score more, seen and refuted score less
		      //
		      unseens[k] = unseens[k] + 1.0 ;
		      totalweights[k] = totalweights[k] + 1.0 ;
		    };
		};

	      if (internal_trace) 
		{
		  for (k = 0; k < maxhash; k++)
		    {
		      // fprintf (stderr, "ZZZ\n");
		      fprintf (stderr, 
		       " poly: %d  filenum: %d, HTF: %7.0f, hits: %7.0f, th: %10ld, tw: %6.4e\n",
			      j, k, htf, hits[k], totalhits[k], totalweights[k]);
		    };
		};
	      //    
	      //    avoid the fencepost error for window=1
	      if ( OSB_WINNOW_WINDOW_LEN == 1) 
		{
		  j = 99999;
		};
	    };
	};
    };      //  end of repeat-the-regex loop
 classify_end_regex_loop:
  
  //  cleanup time!
  //  remember to let go of the fd's and mmaps and mallocs
  for (k = 0; k < maxhash; k++)
    {
      crm_munmap_file ( (void *) hashes[k]);
      free (xhashes[k]);
    };
  //  and let go of the regex buffery      
  crm_regfree (&regcb);

  if (user_trace) 
    {
      for (k = 0; k < maxhash; k++)
	fprintf (stderr, "Match for file %ld:  hits: %ld  weight: %f\n", 
		 k, totalhits[k], totalweights[k]);
    };
  //
  //      Do the calculations and format some output, which we may or may
  //      not use... but we need the calculated result anyway.
  //
  if (1)
    {
      char buf[1024];
      double accumulator;
      double remainder;
      double overall_pR;
      long m;
      buf [0] = '\000';
      accumulator = 10 * DBL_MIN;

      for (m = 0; m < maxhash; m++)
	{
	  if (totalweights[m] < 1)
	    totalweights[m] = 1;
	  if (totalhits[m] < 1)
	    totalhits[m] = 1;
	  classifierprs[m] = 10*(log10 (totalweights[m])-log10(totalhits[m]));
	};
      for (m = 0; m < succhash; m++)
	{
	  accumulator = accumulator + totalweights[m];
	};
      remainder = 10 * DBL_MIN;
      for (m = succhash; m < maxhash; m++)
	{
	  remainder = remainder + totalweights[m];
	};
      
      tprob = (accumulator) / (accumulator + remainder);

      //     *******************************************
      //   
      //        Note - we use 10 as the normalization for pR here.
      //        it's because we don't have an actual probability
      //        but we want this to scale similarly with the other 
      //        recognizers.
      // 
      overall_pR = 10 * (log10 (accumulator) - log10 (remainder));

      //   note also that strcat _accumulates_ in stext.
      //  There would be a possible buffer overflow except that _we_ control
      //   what gets written here.  So it's no biggie.

      if (tprob > 0.5000)
	{
	  sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f  pR: %6.4f\n", tprob, overall_pR );
	}
      else
	{
	  sprintf (buf, "CLASSIFY fails; success probability: %6.4f  pR: %6.4f\n", tprob, overall_pR );
	};
      if (strlen (stext) + strlen(buf) <= stext_maxlen)
	strcat (stext, buf);

      //   find best single matching file
      //
      bestseen = 0;
      for (k = 0; k < maxhash; k++)
	if (classifierprs[k] > classifierprs[bestseen] ) bestseen = k;

      remainder = 10 * DBL_MIN;
      for (m = 0; m < maxhash; m++)
	if (bestseen != m)
	  {
	    remainder = remainder + totalweights[m];
	  };

      //   ... and format some output of best single matching file
      //    
      sprintf (buf, "Best match to file #%ld (%s) "\
    		    "weight: %6.4f  pR: %6.4f  \n",
	       bestseen,
	       hashname[bestseen],
	       totalweights[bestseen],
	       classifierprs[bestseen]);
      if (strlen (stext) + strlen(buf) <= stext_maxlen)
	strcat (stext, buf);
      sprintf (buf, "Total features in input file: %ld\n", totalfeatures); 
      if (strlen (stext) + strlen(buf) <= stext_maxlen)
	strcat (stext, buf);

      //     Now do the per-file breakdowns:
      //
      for (k = 0; k < maxhash; k++)
	{
	  long m;
	  remainder = 10 * DBL_MIN;
	  for (m = 0; m < maxhash; m++)
	      if (k != m)
		{
		  remainder = remainder + totalweights[m];
		};
	  sprintf (buf, 
		   "#%ld (%s):"\
		   " features: %.2f, unseen: %3.2e, weight: %3.2e, pR: %6.2f \n", 
		   k,
		   hashname[k],
		   fcounts[k],
		   unseens[k],
		   totalweights[k], 
		   classifierprs[k]);
	  // strcat (stext, buf);
         if (strlen(stext)+strlen(buf) <= stext_maxlen)
           strcat (stext, buf);
	};
      // check here if we got enough room in stext to stuff everything
      // perhaps we'd better rise a nonfatalerror, instead of just
      // whining on stderr
      if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0)
        {
          nonfatalerror( "WARNING: not enough room in the buffer to create "
			 "the statistics text.  Perhaps you could try bigger "
			 "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?",
			 " ");
	};
      if (svlen > 0)
	crm_destructive_alter_nvariable (svrbl, svlen, 
					 stext, strlen (stext));
    };
  
  //
  //  Free the hashnames, to avoid a memory leak.
  //
  for (i = 0; i < maxhash; i++)
    free (hashname[i]);
  if (tprob <= 0.5000)
    {
      if (user_trace)
	fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n");
      //    and do what we do for a FAIL here
      csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
      csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
      return (0);
    };
  
  
  //    
  //   all done... if we got here, we should just continue execution
  if (user_trace)
    fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
 regcomp_failed:
  return (0);
};
// syntax highlighted by Code2HTML, v. 0.9.1