// crm_markovian.c - Controllable Regex Mutilator, version v1.0
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may
// obtain a copy of the GPL by visiting the Free Software Foundations
// web site at www.fsf.org, and a copy is included in this
// distribution.
//
// Other licenses may be negotiated; contact the
// author for details.
//
// include some standard files
#include "crm114_sysincludes.h"
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
////////////////////////////////////////////////////////////////////
//
// the hash coefficient table (hctable) should be full of relatively
// prime numbers, and preferably superincreasing, though both of those
// are not strict requirements.
//
static long hctable[] =
{ 1, 7,
3, 13,
5, 29,
11, 51,
23, 101,
47, 203,
97, 407,
197, 817,
397, 1637,
797, 3277 };
// How to learn Markovian style.
//
int crm_expr_markov_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
// learn the sparse spectrum of this input window as
// belonging to a particular type.
// learn <flags> (classname) /word/
//
long i, j, k;
long h; // h is our counter in the hashpipe;
char ptext[MAX_PATTERN]; // the regex pattern
long plen;
char ltext[MAX_PATTERN]; // the variable to learn
long llen;
char htext[MAX_PATTERN]; // the hash name
long hlen;
long cflags, eflags;
struct stat statbuf; // for statting the hash file
long hfsize; // size of the hash file
FEATUREBUCKET_TYPE *hashes; // the text of the hash file
unsigned long hashpipe[MARKOVIAN_WINDOW_LEN+1];
//
regex_t regcb;
regmatch_t match[5]; // we only care about the outermost match
long textoffset;
long textmaxoffset;
long sense;
long microgroom;
long max_feature_terms; // how many terms do we actually get per word?
long fev;
long learns_index;
long features_index;
// seen_features - for UNIQUEness tests
long unique_mode;
unsigned char *seen_features = NULL;
if (internal_trace)
fprintf (stderr, "executing a LEARN\n");
// Keep the gcc compiler from complaining about unused variables
i = hctable[0];
// extract the hash file name
crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
hlen = apb->p1len;
hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN);
//
// extract the variable name (if present)
crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len);
llen = apb->b1len;
llen = crm_nexpandvar (ltext, llen, MAX_PATTERN);
// get the "this is a word" regex
crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
plen = apb->s1len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
// set our cflags, if needed. The defaults are
// "case" and "affirm", (both zero valued).
// and "microgroom" disabled.
cflags = REG_EXTENDED;
eflags = 0;
sense = +1;
unique_mode = 0;
seen_features = NULL;
if (apb->sflags & CRM_NOCASE)
{
cflags = cflags | REG_ICASE;
eflags = 1;
if (user_trace)
fprintf (stderr, "turning oncase-insensitive match\n");
};
if (apb->sflags & CRM_REFUTE)
{
sense = -sense;
if (user_trace)
fprintf (stderr, " refuting learning\n");
};
if (apb->sflags & CRM_UNIQUE)
{
if (user_trace)
fprintf (stderr, " turning on UNIQUE features only.\n");
unique_mode = 1;
};
microgroom = 0;
if (apb->sflags & CRM_MICROGROOM)
{
microgroom = 1;
if (user_trace)
fprintf (stderr, " enabling microgrooming.\n");
};
// How many features in a Markovian? Here's the answer:
max_feature_terms = ( 1 << (MARKOVIAN_WINDOW_LEN -1 ));
if (apb->sflags & CRM_UNIGRAM)
{
max_feature_terms = 1;
if (user_trace)
fprintf (stderr, " enabling unigram-only (Bayesian) features.\n");
};
//
// grab the filename, and stat the file
// note that neither "stat", "fopen", nor "open" are
// fully 8-bit or wchar clean...
i = 0;
while (htext[i] < 0x021) i++;
j = i;
while (htext[j] >= 0x021) j++;
// filename starts at i, ends at j. null terminate it.
htext[j] = '\000';
// and stat it to get it's length
k = stat (&htext[i], &statbuf);
// quick check- does the file even exist?
if (k != 0)
{
// file didn't exist... create it
FILE *f;
if (user_trace)
fprintf (stderr, "\nHad to create new CSS file %s\n", &htext[i]);
f = fopen (&htext[i], "wb");
if (!f)
{
fprintf (stderr,
"\n Couldn't open your new CSS file %s for writing; errno=%d .\n",
&htext[i], errno);
if (engine_exit_base != 0)
{
exit (engine_exit_base + 19);
}
else
exit (EXIT_FAILURE);
};
// do we have a user-specified file size?
if (sparse_spectrum_file_length == 0 ) {
sparse_spectrum_file_length =
DEFAULT_MARKOVIAN_SPARSE_SPECTRUM_FILE_LENGTH ;
};
// put in sparse_spectrum_file_length entries of NULL
for (j = 0;
j < sparse_spectrum_file_length
* sizeof ( FEATUREBUCKET_TYPE);
j++)
fputc ('\000', f);
// fprintf (f,"%c", '\000');
fclose (f);
// and reset the statbuf to be correct
k = stat (&htext[i], &statbuf);
};
//
hfsize = statbuf.st_size;
if (user_trace)
fprintf (stderr, "Sparse spectra file %s has length %ld bins\n",
&htext[i], hfsize / sizeof (FEATUREBUCKET_TYPE));
//
// mmap the hash file into memory so we can bitwhack it
//
hashes = (FEATUREBUCKET_TYPE *) crm_mmap_file (&(htext[i]),
0, hfsize,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
if (hashes == MAP_FAILED)
{
fev = fatalerror ("Couldn't get access to the statistics file named: ",
&htext[i]);
return (fev);
};
//
// check the version of the file
//
// if (hashes[0].hash != 0 ||
// hashes[0].key != 0 ||
// hashes[0].value!= 0)
// {
// fev =fatalerror ("The .css file is the wrong version! Filename is: ",
// &htext[i]);
// return (fev);
// };
//
// now set the hfsize to the number of entries, not the number
// of bytes total
hfsize = hfsize / sizeof ( FEATUREBUCKET_TYPE );
#ifdef OSB_LEARNCOUNTS
// Reserve the OSB_LEARNCOUNT buckets, even though Markov doesn't
// use them.
//
// We use the reserved h2 == 0 setup for the learncount.
//
{
char* litf = "Learnings in this file";
char* fitf = "Features in this file";
unsigned long hcode, h1, h2;
//
hcode = strnhash (litf, strlen ( litf ));
h1 = hcode % hfsize;
h2 = 0;
if (hashes[h1].hash != hcode)
{
// initialize the file?
if (hashes[h1].hash == 0 && hashes[h1].key == 0)
{
hashes[h1].hash = hcode;
hashes[h1].key = 0;
hashes[h1].value = 1;
learns_index = h1;
}
else
{
//fatalerror (" This file should have learncounts, but doesn't!",
// " The slot is busy, too. It's hosed. Time to die.");
// goto regcomp_failed;
};
}
else
{
if (hashes[h1].key == 0)
// the learncount matched.
{
learns_index = h1;
if (sense > 0)
hashes[h1].value = hashes[h1].value + sense;
else
{
if (hashes[h1].value + sense > 0)
hashes[h1].value += sense;
else
hashes[h1].value = 0;
}
if (user_trace)
fprintf (stderr, "This file has had %ld documents learned!\n",
hashes[h1].value);
};
};
hcode = strnhash (fitf, strlen ( fitf ));
h1 = hcode % hfsize;
h2 = 0;
if (hashes[h1].hash != hcode)
{
// initialize the file?
if (hashes[h1].hash == 0 && hashes[h1].key == 0)
{
hashes[h1].hash = hcode;
hashes[h1].key = 0;
hashes[h1].value = 1;
features_index = h1;
}
else
{
//fatalerror (" This file should have learncounts, but doesn't!",
// " The slot is busy, too. It's hosed. Time to die.");
//goto regcomp_failed ;
};
}
else
{
if (hashes[h1].key == 0)
// the learncount matched.
{
features_index = h1;
if (user_trace)
fprintf (stderr, "This file has had %ld features learned!\n",
hashes[h1].value);
};
};
};
#endif
if (unique_mode)
{
seen_features = calloc (hfsize, 1);
if (seen_features == NULL)
untrappableerror (" Couldn't allocate enough memory to keep track",
"of nonunique features. This is deadly");
};
// compile the word regex
//
if ( internal_trace)
fprintf (stderr, "\nWordmatch pattern is %s", ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
// Start by priming the pipe... we will shift to the left next.
// sliding, hashing, xoring, moduloing, and incrmenting the
// hashes till there are no more.
k = 0;
j = 0;
i = 0;
#ifdef OLD_STUPID_VAR_RESTRICTION
if (llen > 0)
{
vhtindex = crm_vht_lookup (vht, ltext, llen);
}
else
{
vhtindex = crm_vht_lookup (vht, ":_dw:", 5);
};
if (vht[vhtindex] == NULL)
{
long q;
q = fatalerror (" Attempt to LEARN from a nonexistent variable ",
ltext);
return (q);
};
mdw = NULL;
if (tdw->filetext == vht[vhtindex]->valtxt)
mdw = tdw;
if (cdw->filetext == vht[vhtindex]->valtxt)
mdw = cdw;
if (mdw == NULL)
{
long q;
q = fatalerror (" Bogus text block containing variable ", ltext);
return (q);
}
textoffset = vht[vhtindex]->vstart;
textmaxoffset = textoffset + vht[vhtindex]->vlen;
#endif
// No need to do any parsing of a box restriciton, as those
// are passed in to us from the caller.
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
// init the hashpipe with 0xDEADBEEF
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
{
hashpipe[h] = 0xDEADBEEF;
};
// and the big loop...
i = 0;
while (k == 0 && textoffset <= textmaxoffset)
{
long wlen;
long slen;
// do the regex
// slen = endpoint (= start + len)
// - startpoint (= curr textoffset)
// slen = txtlen;
slen = textmaxoffset - textoffset;
// if pattern is empty, extract non graph delimited tokens
// directly ([[graph]]+) instead of calling regexec (8% faster!)
if (ptext[0] != '\0')
{
k = crm_regexec ( ®cb, &(txtptr[textoffset]),
slen,
5, match, 0, NULL);
}
else
{
k = 0;
// skip non-graphical characthers
match[0].rm_so = 0;
while (!isgraph (txtptr[textoffset + match[0].rm_so])
&& textoffset + match[0].rm_so < textmaxoffset)
match[0].rm_so ++;
match[0].rm_eo = match[0].rm_so;
while (isgraph (txtptr [textoffset + match[0].rm_eo])
&& textoffset + match[0].rm_eo < textmaxoffset)
match[0].rm_eo ++;
if ( match[0].rm_so == match[0].rm_eo)
k = 1;
}
if (k != 0 || textoffset > textmaxoffset)
goto learn_end_regex_loop;
{
wlen = match[0].rm_eo - match[0].rm_so;
memmove (tempbuf,
&(txtptr[textoffset + match[0].rm_so]),
wlen);
tempbuf[wlen] = '\000';
if (internal_trace)
{
fprintf (stderr,
" Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
i,
textoffset,
(long) match[0].rm_so,
(long) match[0].rm_eo,
wlen,
tempbuf);
};
if (match[0].rm_eo == 0)
{
nonfatalerror ( "The LEARN pattern matched zero length! ",
"\n Forcing an increment to avoid an infinite loop.");
match[0].rm_eo = 1;
};
// Shift the hash pipe down one
//
for (h = MARKOVIAN_WINDOW_LEN-1; h > 0; h--)
{
hashpipe [h] = hashpipe [h-1];
};
// and put new hash into pipeline
hashpipe[0] = strnhash (tempbuf, wlen);
if (internal_trace)
{
fprintf (stderr, " Hashpipe contents: ");
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
fprintf (stderr, " %ld", hashpipe[h]);
fprintf (stderr, "\n");
};
// and account for the text used up.
textoffset = textoffset + match[0].rm_eo;
i++;
// is the pipe full enough to do the hashing?
if (1) // we always run the hashpipe now, even if it's
// just full of 0xDEADBEEF. (was i >=5)
{
unsigned long hindex;
unsigned long h1, h2;
long th = 0; // a counter used for TSS tokenizing
unsigned long incrs;
long j;
//
// old Hash polynomial: h0 + 3h1 + 5h2 +11h3 +23h4
// (coefficients chosen by requiring superincreasing,
// as well as prime)
//
th = 0;
// for ( j = 0; j <= 15 ; j++)
for (j = 0; j < max_feature_terms; j++)
{
#ifdef TGB
//
// Token Grab Bag - ignore sequence, distance. alias
//
hindex = hashpipe [0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
hindex = hindex % hfsize;
if (hindex == 0) hindex = 1;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
h2 = hashpipe[0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TGB2
//
// Token Grab Bag - ignore sequence, distance.
//
hindex = hashpipe [0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
hindex = hindex % hfsize;
if (hindex == 0) hindex = 1;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
h2 = hashpipe[0] * hashpipe[0]
+ ( hashpipe[1] * hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * hashpipe[4] * ((j>>3) & 0x0001));
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TSS
//
// Token Grab Bag - ignore sequence, distance, prevent
// aliasing (quadratic H2)
//
hindex = hashpipe [0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
hindex = hindex % hfsize;
if (hindex == 0) hindex = 1;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
th = 2;
h2 = hashpipe[0];
if ((j>>0) & 0x0001)
{
h2 = h2 + hashpipe[1] * th;
th++;
};
if ((j>>1) & 0x0001)
{
h2 = h2 + hashpipe[2] * th;
th++;
};
if ((j>>2) & 0x0001)
{
h2 = h2 + hashpipe[3] * th;
th++;
};
if ((j>>3) & 0x0001)
{
h2 = h2 + hashpipe[4] * th;
th++;
};
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef SBPH
hindex = hashpipe [0]
+ ( 3 * hashpipe[1] * ((j>>0) & 0x0001))
+ ( 5 * hashpipe[2] * ((j>>1) & 0x0001))
+ ( 11 * hashpipe[3] * ((j>>2) & 0x0001))
+ ( 23 * hashpipe[4] * ((j>>3) & 0x0001));
h1 = hindex;
// and what's our primary hash index? Note that
// hindex = 0 is reserved for our version and
// usage flags, so we autobump those to hindex=1
hindex = hindex % hfsize;
if (hindex == 0) hindex = 1;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value. Note that it shares
// no common coefficients with the previous hash.
h2 = 7 * hashpipe [0]
+ ( 13 * hashpipe[1] * ((j>>0) & 0x0001))
+ ( 29 * hashpipe[2] * ((j>>1) & 0x0001))
+ ( 51 * hashpipe[3] * ((j>>2) & 0x0001))
+ ( 101 * hashpipe[4] * ((j>>3) & 0x0001));
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef ARBITRARY_WINDOW_LENGTH
//////////////////////////////////////////////////
//
// Generic N-length hashing.
//
// first term (0th) is always on
h1 = hashpipe[0] * hctable [0];
// 2nd and onward terms are variable.
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
{
h1 = h1 + hashpipe[h] * hctable[h*2] * ((j>>(h-1))&0x0001);
};
hindex = h1;
hindex = hindex % hfsize;
if (hindex == 0) hindex = 1;
// 0th term is always turned on.
h2 = hashpipe[0] * hctable[1];
// terms 1 through N are variable
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
{
h2 = h2 +hashpipe[h] * hctable[h*2+1]*((j>>(h-1))&0x0001);
};
if (h2 == 0) h2 = 0xDEADBEEF;
#endif
if (internal_trace)
fprintf (stderr, "Polynomial %ld has h1:%ld h2: %ld\n",
j, h1, h2);
//
// we now look at both the primary (h1) and
// crosscut (h2) indexes to see if we've got
// the right bucket or if we need to look further
//
incrs = 0;
while ( hashes[hindex].key != 0
&& ( hashes[hindex].hash != h1
|| hashes[hindex].key != h2 ))
{
//
incrs++;
//
// If microgrooming is enabled, and we've found a
// chain that's too long, we groom it down.
//
if (microgroom && (incrs > MICROGROOM_CHAIN_LENGTH))
{
// set the random number generator up...
// note that this is repeatable for a
// particular test set, yet dynamic. That
// way, we don't always autogroom away the
// same feature; we depend on the previous
// feature's key.
srand ( (unsigned int) h2);
//
// and do the groom.
crm_microgroom (hashes, seen_features, hfsize, hindex);
// since things may have moved after a
// microgroom, restart our search
hindex = h1 % hfsize;
if (hindex == 0) hindex = 1;
incrs = 0;
};
// check to see if we've incremented ourself all the
// way around the .css file. If so, we're full, and
// can hold no more features (this is unrecoverable)
if (incrs > hfsize - 3)
{
nonfatalerror ("Your program is stuffing too many "
"features into this size .css file. "
"Adding any more features is "
"impossible in this file.",
"You are advised to build a larger "
".css file and merge your data into "
"it.");
goto learn_end_regex_loop;
};
hindex++;
if (hindex >= hfsize) hindex = 1;
};
if (internal_trace)
{
if (hashes[hindex].value == 0)
{
fprintf (stderr,"New feature at %ld\n", hindex);
}
else
{
fprintf (stderr, "Old feature at %ld\n", hindex);
};
};
// always rewrite hash and key, as they may be incorrect
// (on a reused bucket) or zero (on a fresh one)
//
hashes[hindex].hash = h1;
hashes[hindex].key = h2;
// watch out - sense may be both + or -, so check before
// adding it...
//
if ( !seen_features || ( seen_features[hindex]== 0) )
{
if (seen_features)
seen_features[hindex]++;
if (sense > 0 &&
hashes[hindex].value + sense >=
FEATUREBUCKET_VALUE_MAX-1)
hashes[hindex].value = FEATUREBUCKET_VALUE_MAX - 1;
else if ( sense < 0 && hashes[hindex].value <= -sense )
hashes[hindex].value = 0;
else
hashes[hindex].value += sense;
}
};
};
};
}; // end the while k==0
learn_end_regex_loop:
// free up the pattern buffer:
if ( ptext[0] != '\0') crm_regfree (®cb);
regcomp_failed:
// and remember to let go of all the mmaps (full flush)
// crm_munmap_all ();
crm_force_munmap_addr (hashes);
// and let go of the seen_features array
if (seen_features) free (seen_features);
seen_features = NULL;
// Because mmap/munmap doesn't set atime, nor set the "modified"
// flag, some network filesystems will fail to mark the file as
// modified and so their cacheing will make a mistake.
//
// The fix is to do a trivial read/write on the .css ile, to force
// the filesystem to repropagate it's caches.
//
#ifdef POSIX
{
int hfd; // hashfile fd
FEATURE_HEADER_STRUCT foo;
hfd = open (&(htext[i]), O_RDWR);
read (hfd, &foo, sizeof(foo));
lseek (hfd, 0, SEEK_SET);
write (hfd, &foo, sizeof(foo));
close (hfd);
}
#endif
return (0);
}
// How to Markovian CLASSIFY some text.
//
int crm_expr_markov_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
// classify the sparse spectrum of this input window
// as belonging to a particular type.
//
// This code should look very familiar- it's cribbed from
// the code for LEARN
//
long i, j, k;
long h; // we use h for our hashpipe counter, as needed.
char ptext[MAX_PATTERN]; // the regex pattern
long plen;
// char ltext[MAX_PATTERN]; // the variable to classify
//long llen;
// the hash file names
char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN];
long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN;
long hlen;
// the match statistics variable
char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)];
long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100);
long slen;
char svrbl[MAX_PATTERN]; // the match statistics text buffer
long svlen;
long fnameoffset;
char fname[MAX_FILE_NAME_LEN];
long eflags;
long cflags;
long not_microgroom; // is microgrooming disabled (fast-quit
// optimization if 0th feature gone)
long max_feature_terms; // how many features do we get at each
// pipe position?
struct stat statbuf; // for statting the hash file
// longest association set in the hashing
unsigned long hashpipe[MARKOVIAN_WINDOW_LEN+1];
regex_t regcb;
regmatch_t match[5]; // we only care about the outermost match
unsigned long fcounts[MAX_CLASSIFIERS]; // total counts for feature normalize
unsigned long totalcount = 0;
double cpcorr[MAX_CLASSIFIERS]; // corpus correction factors
double hits[MAX_CLASSIFIERS]; // actual hits per feature per classifier
long totalhits[MAX_CLASSIFIERS]; // actual total hits per classifier
long totalfeatures; // total features
double htf; // hits this feature got.
double tprob; // total probability in the "success" domain.
// double textlen; // text length - rougly corresponds to
// information content of the text to classify
double ptc[MAX_CLASSIFIERS]; // current running probability of this class
double renorm = 0.0;
double pltc[MAX_CLASSIFIERS]; // current local probability of this class
double plltc[MAX_CLASSIFIERS]; // current local probability of this class
// int hfds[MAX_CLASSIFIERS];
FEATUREBUCKET_TYPE *hashes[MAX_CLASSIFIERS];
long hashlens[MAX_CLASSIFIERS];
char *hashname[MAX_CLASSIFIERS];
long succhash;
long vbar_seen; // did we see '|' in classify's args?
long maxhash;
long fnstart, fnlen;
long fn_start_here;
long textoffset;
long textmaxoffset;
long bestseen;
long thistotal;
long feature_weight = 1;
double top10scores[10];
long top10polys[10];
char top10texts[10][MAX_PATTERN];
// for UNIQUE runs
long unique_mode = 0;
unsigned char *seen_features[MAX_CLASSIFIERS];
if (internal_trace)
fprintf (stderr, "executing a CLASSIFY\n");
// We get the variable block, start, and len from caller
//
// crm_get_pgm_arg (ltext, MAX_PATTERN, apb->b1start, apb->b1len);
// llen = apb->b1len;
// llen = crm_nexpandvar (ltext, llen, MAX_PATTERN);
// extract the hash file names
crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len);
hlen = apb->p1len;
hlen = crm_nexpandvar (htext, hlen, htext_maxlen);
// extract the "this is a word" regex
//
crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
plen = apb->s1len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
// extract the optional "match statistics" variable
//
crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
svlen = apb->p2len;
svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN);
{
long vstart, vlen;
crm_nextword (svrbl, svlen, 0, &vstart, &vlen);
memmove (svrbl, &svrbl[vstart], vlen);
svlen = vlen;
svrbl[vlen] = '\000';
};
// status variable's text (used for output stats)
//
stext[0] = '\000';
slen = 0;
// set our flags, if needed. The defaults are
// "case"
cflags = REG_EXTENDED;
eflags = 0;
if (apb->sflags & CRM_NOCASE)
{
if (user_trace)
fprintf (stderr, " setting NOCASE for tokenization\n");
cflags += REG_ICASE;
eflags = 1;
};
not_microgroom = 1;
if (apb->sflags & CRM_MICROGROOM)
{
not_microgroom = 0;
if (user_trace)
fprintf (stderr, " disabling fast-skip optimization.\n");
};
max_feature_terms = ( 1 << (MARKOVIAN_WINDOW_LEN - 1 ));
if (apb->sflags & CRM_UNIGRAM)
{
max_feature_terms = 1;
if (user_trace)
fprintf (stderr, " enabling unigram-only (Bayesian) features.\n");
};
unique_mode = 0;
if (apb->sflags & CRM_UNIQUE)
{
if (user_trace)
fprintf (stderr, " setting UNIQUE feature filtering.\n");
unique_mode = 1;
};
// compile the word regex
if ( internal_trace)
fprintf (stderr, "\nWordmatch pattern is %s", ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
// Now, the loop to open the files.
bestseen = 0;
thistotal = 0;
// goodcount = evilcount = 1; // prevents a divide-by-zero error.
//cpgood = cpevil = 0.0;
//ghits = ehits = 0.0 ;
//psucc = 0.5;
//pfail = (1.0 - psucc);
//pic = 0.5;
//pnic = 0.5;
// initialize our arrays for N .css files
for (i = 0; i < MAX_CLASSIFIERS; i++)
{
fcounts[i] = 0; // check later to prevent a divide-by-zero
// error on empty .css file
cpcorr[i] = 0.0; // corpus correction factors
hits[i] = 0.0; // absolute hit counts
totalhits[i] = 0.0; // absolute hit counts
ptc[i] = 0.5; // priori probability
pltc[i] = 0.5; // local probability
};
for (i = 0; i < 10; i++)
{
top10scores[i] = 0;
top10polys[i] = 0;
strcpy (top10texts[i], "");
};
// -- probabilistic evaluator ---
// S = success; A = a testable attribute of success
// ns = not success, na = not attribute
// the chain rule we use is:
//
// P(A|S) P(S)
// P (S|A) = -------------------------
// P(A|S) P(S) + P(A|NS) P(NS)
//
// and we apply it repeatedly to evaluate the final prob. For
// the initial a-priori probability, we use 0.5. The output
// value (here, P(S|A) ) becomes the new a-priori for the next
// iteration.
//
// Extension - we generalize the above to I classes as and feature
// F as follows:
//
// P(F|Ci) P(Ci)
// P(Ci|F) = ----------------------------------------
// Sum over all classes Ci of P(F|Ci) P(Ci)
//
// We also correct for the unequal corpus sizes by multiplying
// the probabilities by a renormalization factor. if Tg is the
// total number of good features, and Te is the total number of
// evil features, and Rg and Re are the raw relative scores,
// then the corrected relative scores Cg aqnd Ce are
//
// Cg = (Rg / Tg)
// Ce = (Re / Te)
//
// or Ci = (Ri / Ti)
//
// Cg and Ce can now be used as "corrected" relative counts
// to calculate the naive Bayesian probabilities.
//
// Lastly, the issue of "over-certainty" rears it's ugly head.
// This is what happens when there's a zero raw count of either
// good or evil features at a particular place in the file; the
// strict but naive mathematical interpretation of this is that
// "feature A never/always occurs when in good/evil, hence this
// is conclusive evidence of good/evil and the probabilities go
// to 1.0 or 0.0, and are stuck there forevermore. We use the
// somewhat ad-hoc interpretation that it is unreasonable to
// assume that any finite number of samples can appropriately
// represent an infinite continuum of spewage, so we can bound
// the certainty of any meausre to be in the range:
//
// limit: [ 1/featurecount+2 , 1 - 1/featurecount+2].
//
// The prior bound is strictly made-up-on-the-spot and has NO
// strong theoretical basis. It does have the nice behavior
// that for feature counts of 0 the probability is clipped to
// [0.5, 0.5], for feature counts of 1 to [0.333, 0.666]
// for feature counts of 2 to [0.25, 0.75], for 3 to
// [0.2, 0.8], for 4 to [0.166, 0.833] and so on.
//
vbar_seen = 0;
maxhash = 0;
succhash = 0;
fnameoffset = 0;
// now, get the file names and mmap each file
// get the file name (grody and non-8-bit-safe, but doesn't matter
// because the result is used for open() and nothing else.
// GROT GROT GROT this isn't NULL-clean on filenames. But then
// again, stdio.h itself isn't NULL-clean on filenames.
if (user_trace)
fprintf (stderr, "Classify list: -%s- \n", htext);
fn_start_here = 0;
fnlen = 1;
while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1)))
{
crm_nextword (htext,
hlen, fn_start_here,
&fnstart, &fnlen);
if (fnlen > 0)
{
strncpy (fname, &htext[fnstart], fnlen);
fname[fnlen] = '\000';
// fprintf (stderr, "fname is '%s' len %ld\n", fname, fnlen);
fn_start_here = fnstart + fnlen + 1;
if (user_trace)
fprintf (stderr, "Classifying with file -%s- "\
"succhash=%ld, maxhash=%ld\n",
fname, succhash, maxhash);
if ( fname[0] == '|' && fname[1] == '\000')
{
if (vbar_seen)
{
nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n" ,
"We'll ignore it for now.");
}
else
{
succhash = maxhash;
};
vbar_seen ++;
}
else
{
// be sure the file exists
// stat the file to get it's length
k = stat (fname, &statbuf);
// quick check- does the file even exist?
if (k != 0)
{
nonfatalerror ("Nonexistent Classify table named: ",
fname);
}
else
{
// file exists - do the open/process/close
//
hashlens[maxhash] = statbuf.st_size;
// mmap the hash file into memory so we can bitwhack it
hashes[maxhash] = (FEATUREBUCKET_TYPE *) crm_mmap_file (
fname,
0, hashlens[maxhash],
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
if (hashes[maxhash] == MAP_FAILED )
{
nonfatalerror ("Couldn't get access to the "
"statistics file named: ",
fname);
}
else
{
// fprintf (stderr, "MMap got %lx\n", hashes);
//
// Check to see if this file is the right version
//
//long fev;
//if (hashes[maxhash][0].hash != 0 ||
// hashes[maxhash][0].key != 0 ||
// hashes[maxhash][0].value!= 0)
//{
// fev = fatalerror ("The .css file is the wrong version! Filename is: ",
// fname);
// return (fev);
//};
// set this hashlens to the length in
// features instead of the length in bytes.
hashlens[maxhash] = hashlens[maxhash] / sizeof (FEATUREBUCKET_TYPE);
hashname[maxhash] = (char *) malloc (fnlen+10);
if (!hashname[maxhash])
untrappableerror(
"Couldn't malloc hashname[maxhash]\n","We need that part later, so we're stuck. Sorry.");
strncpy(hashname[maxhash],fname,fnlen);
hashname[maxhash][fnlen]='\000';
maxhash++;
};
};
};
if (maxhash > MAX_CLASSIFIERS-1)
nonfatalerror ("Too many classifier files.",
"Some may have been disregarded");
};
};
//
// If there is no '|', then all files are "success" files.
if (succhash == 0)
succhash = maxhash;
// a CLASSIFY with no arguments is always a "success".
if (maxhash == 0)
return (0);
// now, set up the normalization factor fcount[]
if (user_trace)
fprintf (stderr, "Running with %ld files for success out of %ld files\n",
succhash, maxhash );
// sanity checks... Uncomment for super-strict CLASSIFY.
//
// do we have at least 1 valid .css files?
if (maxhash == 0)
{
fatalerror ("Couldn't open at least 2 .css files for classify().", "");
};
// do we have at least 1 valid .css file at both sides of '|'?
//if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2))
// {
// nonfatalerror (
// "Couldn't open at least 1 .css file per SUCC | FAIL classes "
// " for classify().\n","Hope you know what are you doing.");
// };
{
long ifile;
long k;
for (ifile = 0; ifile < maxhash; ifile++)
{
if (unique_mode != 0)
{
// Note that we _calloc_, not malloc, to zero the memory first.
seen_features[ifile] = calloc (hashlens[ifile]+1, 1);
if ( seen_features[ifile] == NULL )
untrappableerror (" Couldn't allocate enough memory to keep "
" track of nonuniuqe features. ",
"This is deadly. ");
}
else
seen_features[ifile] = NULL;
};
// count up the total first
for (ifile = 0; ifile < maxhash; ifile++)
{
fcounts[ifile] = 0;
//
// GROT GROT GROT
//
// Comment out the next two lines to double speed!!!
// This will break your "totalcounts" diag output, but
// that doesn't really get used.
//
for (k = 1; k < hashlens[ifile]; k++)
fcounts [ifile] = fcounts[ifile] + hashes[ifile][k].value;
// k=1; // keep compiler happy.
if (fcounts[ifile] == 0) fcounts[ifile] = 1;
totalcount = totalcount + fcounts[ifile];
};
//
// calculate cpcorr (count compensation correction)
//
for (ifile = 0; ifile < maxhash; ifile++)
{
// cpcorr [ifile] = ( totalcount / (fcounts[ifile] * (maxhash-1)));
//
// disable cpcorr for now... unclear that it's useful.
cpcorr[ifile] = 1.0;
};
}
//
// now all of the files are mmapped into memory,
// and we can do the polynomials and add up points.
i = 0;
j = 0;
k = 0;
thistotal = 0;
#ifdef OLD_STUPID_VAR_RESTRICTION
if (llen > 0)
{
vhtindex = crm_vht_lookup (vht, ltext, llen );
}
else
{
vhtindex = crm_vht_lookup (vht, ":_dw:", 5);
}
if (vht[vhtindex] == NULL)
{
return (fatalerror (" Attempt to CLASSIFY from a nonexistent variable ",
ltext));
};
mdw = NULL;
if (tdw->filetext == vht[vhtindex]->valtxt)
mdw = tdw;
if (cdw->filetext == vht[vhtindex]->valtxt)
mdw = cdw;
if (mdw == NULL)
return ( fatalerror (" Bogus text block containing variable ", ltext));
textoffset = vht[vhtindex]->vstart;
textmaxoffset = textoffset + vht[vhtindex]->vlen;
textlen = (vht[vhtindex]->vlen);
if (textlen < 1.0) textlen = 1.0;
#endif
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
// init the hashpipe with 0xDEADBEEF
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
{
hashpipe[h] = 0xDEADBEEF;
};
totalfeatures = 0;
// stop when we no longer get any regex matches
// possible edge effect here- last character must be matchable, yet
// it's also the "end of buffer".
while (k == 0 && textoffset <= textmaxoffset)
{
long wlen;
long slen;
//unsigned char *ptok = &(mdw->filetext[textoffset]);
//unsigned char *ptok_max = &(mdw->filetext[textmaxoffset]);
// do the regex
// slen = vht[vhtindex]->vstart + vht[vhtindex]->vlen - textoffset ;
//slen = txtlen;
slen = textmaxoffset - textoffset;
// if pattern is empty, extract non graph delimited tokens
// directly ([[graph]]+) instead of calling regexec (8% faster!)
if (ptext[0] != '\0')
{
k = crm_regexec (®cb, &(txtptr[textoffset]),
slen, 5, match, 0, NULL);
}
else
{
k = 0;
match[0].rm_so = 0;
while (!isgraph (txtptr[textoffset + match[0].rm_so])
&& textoffset + match[0].rm_so < textmaxoffset)
match[0].rm_so ++;
match[0].rm_eo = match[0].rm_so;
while (isgraph (txtptr [textoffset + match[0].rm_eo])
&& textoffset + match[0].rm_eo < textmaxoffset)
match[0].rm_eo ++;
if ( match[0].rm_so == match[0].rm_eo)
k = 1;
}
if (k != 0 || textoffset > textmaxoffset)
goto classify_end_regex_loop;
wlen = match[0].rm_eo - match[0].rm_so;
memmove (tempbuf,
&(txtptr[textoffset + match[0].rm_so]),
wlen);
tempbuf[wlen] = '\000';
if (match[0].rm_eo == 0)
{
nonfatalerror("The CLASSIFY pattern matched zero length! ",
"\n Forcing an increment to avoid an infinite loop.");
match[0].rm_eo = 1;
};
if (internal_trace)
{
fprintf (stderr,
" Classify #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
i,
textoffset,
(long) match[0].rm_so,
(long) match[0].rm_eo,
wlen,
tempbuf);
};
// slide previous hashes up 1
for (h = MARKOVIAN_WINDOW_LEN-1; h >= 1; h--)
{
hashpipe [h] = hashpipe [h-1];
};
// and put new hash into pipeline
hashpipe[0] = strnhash ( tempbuf, wlen);
if (0)
{
fprintf (stderr, " Hashpipe contents: ");
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
fprintf (stderr, " %ld", hashpipe[h]);
fprintf (stderr, "\n");
};
// account for the text we used up...
textoffset = textoffset + match[0].rm_eo;
i++;
// is the pipe full enough to do the hashing?
if (1) // we init with 0xDEADBEEF, so the pipe is always full (i >=5)
{
int j, k, l;
unsigned th=0; // a counter used only in TSS hashing
unsigned long hindex;
unsigned long h1, h2;
int skip_this_feature = 0;
//unsigned long good, evil;
//
// Hash polynomial: h0 + 3h1 + 5h2 +9h3 +17h4
// (coefficients chosen by formula of 1 + 2^n)
//
// GROT GROT GROT make the order of the SBPH
// a compile-time parameter
th = 0;
// for ( j = 0; j < 16 ; j++)
for (j = 0;
j < max_feature_terms;
j++)
{
#ifdef FOO
//
// First Order Only - only use 1 token
//
hindex = hashpipe [0] ;
if (hindex == 0) hindex = 1;
h1 = hindex;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
h2 = hashpipe[0] ;
if (h2 == 0) h2 = 0xdeadbeef;
j = 99999;
#endif
#ifdef TGB
//
// Token Grab Bag - ignore sequence, distance, allow
// aliasing ( note that H2 is linear ).
//
hindex = hashpipe [0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
h2 = hashpipe[0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TGB2
//
// Token Grab Bag - ignore sequence, distance, prevent
// aliasing (note that H2 is quadratic ).
//
hindex = hashpipe [0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
h2 = hashpipe[0] * hashpipe[0]
+ ( hashpipe[1] * hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * hashpipe[4] * ((j>>3) & 0x0001));
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef TSS
//
// Token Seqence Sensitive - ignore distance, prevent
// aliasing (quadratic H2)
//
hindex = hashpipe [0]
+ ( hashpipe[1] * ((j>>0) & 0x0001))
+ ( hashpipe[2] * ((j>>1) & 0x0001))
+ ( hashpipe[3] * ((j>>2) & 0x0001))
+ ( hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
th = 2;
h2 = hashpipe[0];
if ((j>>0) & 0x0001)
{
h2 = h2 + hashpipe[1] * th;
th++;
};
if ((j>>1) & 0x0001)
{
h2 = h2 + hashpipe[2] * th;
th++;
};
if ((j>>2) & 0x0001)
{
h2 = h2 + hashpipe[3] * th;
th++;
};
if ((j>>3) & 0x0001)
{
h2 = h2 + hashpipe[4] * th;
th++;
};
#endif
#ifdef SBPH
hindex = hashpipe [0]
+ ( 3 * hashpipe[1] * ((j>>0) & 0x0001))
+ ( 5 * hashpipe[2] * ((j>>1) & 0x0001))
+ ( 11 * hashpipe[3] * ((j>>2) & 0x0001))
+ ( 23 * hashpipe[4] * ((j>>3) & 0x0001));
if (hindex == 0) hindex = 1;
h1 = hindex;
// this is the secondary (crosscut) hash, used for
// confirmation of the key value.
h2 = 7 * hashpipe [0]
+ ( 13 * hashpipe[1] * ((j>>0) & 0x0001))
+ ( 29 * hashpipe[2] * ((j>>1) & 0x0001))
+ ( 51 * hashpipe[3] * ((j>>2) & 0x0001))
+ ( 101 * hashpipe[4] * ((j>>3) & 0x0001));
if (h2 == 0) h2 = 0xdeadbeef;
#endif
#ifdef ARBITRARY_WINDOW_LENGTH
//////////////////////////////////////////////////
//
// Generic N-length hashing.
//
// first term (0th) is always on
h1 = hashpipe[0] * hctable [0];
// 2nd and onward terms are variable.
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
{
h1 = h1 + hashpipe[h] * hctable[h*2] * ((j>>(h-1))&0x0001);
};
hindex = h1;
if (hindex == 0) hindex = 1;
// 0th term is always turned on.
h2 = hashpipe[0] * hctable[1];
// terms 1 through N are variable
for (h = 0; h < MARKOVIAN_WINDOW_LEN; h++)
{
h2 = h2 +hashpipe[h] * hctable[h*2+1]*((j>>(h-1))&0x0001);
};
if (h2 == 0) h2 = 0xDEADBEEF;
#endif
//
// Note - a strict interpretation of Bayesian
// chain probabilities should use 0 as the initial
// state. However, because we rapidly run out of
// significant digits, we use a much less strong
// initial state. Note also that any nonzero
// positive value prevents divide-by-zero
//
#ifdef ENTROPIC_WEIGHTS
//
// Calculate entropic weight of this feature.
// (because these are correllated features, this is
// linear, not logarithmic)
//
// These weights correspond to the number of elements
// in the hashpipe that are used for this particular
// calculation, == 1 + bitcount(Jval)
{
long ew[16] = // Jval
{ 1, // 0
2, // 1
2, // 2
3, // 3
2, // 4
3, // 5
3, // 6
4, // 7
2, // 8
3, // 9
3, // 10
4, // 11
3, // 12
4, // 13
4, // 14
5 }; // 15
feature_weight = ew[j];
};
#endif
#ifdef MARKOV_WEIGHTS
//
// Calculate entropic weight of this feature.
// However, this is based on a Hidden Markov Model
// calculation. Maybe it's right, maybe not. It does
// seem to work better than constant or entropic...
{
long ew[16] = // Jval
{ 1, // 0
2, // 1
2, // 2
4, // 3
2, // 4
4, // 5
4, // 6
8, // 7
2, // 8
4, // 9
4, // 10
8, // 11
4, // 12
8, // 13
8, // 14
16 }; // 15
feature_weight = ew[j];
};
#endif
#ifdef SUPER_MARKOV_WEIGHTS
//
// Calculate entropic weight of this feature.
// However, this is based on a more agressive Hidden
// Markov Model calculation. Maybe it's right, maybe
// not. However, testing shows that Super-Markov is
// more accurate than constant, entropic, or straight Markov,
// with errcounts of 69, 69, 63, and 56 per 5k,respectively.
//
// hibits are 0, 1, 2. 3, 4, 5
// weights are 1, 4, 16, 64, 256, 1024
{
long ew[32] = // Jval
{ 1, // 0
4, // 1
4, // 2
16, // 3
4, // 4
16, // 5
16, // 6
64, // 7
4, // 8
16, // 9
16, // 10 - A
64, // 11 - B
16, // 12 - C
64, // 13 - D
64, // 14 - E
256, // 15 - F
4, // 16 - 10
16, // 17 - 11
16, // 18 - 12
64, // 19 - 13
16, // 20 - 14
64, // 21 - 15
64, // 22 - 16
256, // 23 - 17
16, // 24 - 18
64, // 25 - 19
64, // 26 - 1A
256, // 27 - 1B
64, // 28 - 1C
256, // 29 - 1D
256, // 30 - 1E
1024 // 31 - 1F
};
feature_weight = ew[j];
};
#endif
#ifdef BREYER_CHHABRA_SIEFKES_WEIGHTS
//
// This uses the Breyer-Chhabra-Siefkes model that
// uses coefficients of 1, 3, 13,, 75, and 541, which
// assures complete override for any shorter string in
// a single occurrence.
//
{
long ew[16] = // Jval
{ 1, // 0
3, // 1
3, // 2
13, // 3
3, // 4
13, // 5
13, // 6
75, // 7
3, // 8
13, // 9
13, // 10
75, // 11
13, // 12
75, // 13
75, // 14
541 }; // 15
feature_weight = ew[j];
};
#endif
#ifdef BCS_MWS_WEIGHTS
//
// This uses the Breyer-Chhabra-Siefkes model that
// uses coefficients of 1, 3, 13,, 75, and 541, which
// assures complete override for any shorter string in
// a single occurrence.
//
{
long ew[] = // Jval
{ 1, // 0
3, // 1
3, // 2
13, // 3
3, // 4
13, // 5
13, // 6
75, // 7
3, // 8
13, // 9
13, // 10 - A
75, // 11 - B
13, // 12 - C
75, // 13 - D
75, // 14 - E
541, // 15 - F
3, // 16 - 10
13, // 17 - 11
13, // 18 - 12
75, // 19 - 13
13, // 20 - 14
75, // 21 - 15
75, // 22 - 13
541, // 23 - 17
13, // 24 - 18
75, // 25 - 19
75, // 26 - 1A
541, // 27 - 1B
75, // 28 - 1C
541, // 29 - 1D
541, // 30 - 1E
4683 // 31 - 1F
};
feature_weight = ew[j];
};
#endif
#ifdef BCS_EXP_WEIGHTS
//
// This uses the Breyer-Chhabra-Siefkes model that
// uses coefficients of 1, 3, 13,, 75, and 541, which
// assures complete override for any shorter string in
// a single occurrence.
//
{
long ew[] = // Jval
{ 1, // 0
8, // 1
8, // 2
64, // 3
8, // 4
64, // 5
64, // 6
512, // 7
8, // 8
64, // 9
64, // 10 - A
512, // 11 - B
64, // 12 - C
512, // 13 - D
512, // 14 - E
4096, // 15 - F
8, // 16 - 10
64, // 17 - 11
64, // 18 - 12
512, // 19 - 13
64, // 20 - 14
512, // 21 - 15
512, // 22 - 13
4096, // 23 - 17
64, // 24 - 18
512, // 25 - 19
512, // 26 - 1A
4096, // 27 - 1B
512, // 28 - 1C
4096, // 29 - 1D
4096, // 30 - 1E
32768 // 31 - 1F
};
feature_weight = ew[j];
};
#endif
#ifdef BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS
//
// This uses the Breyer-Chhabra-Siefkes base 7 model that
// uses coefficients of 1, 7, 49, 343, 2401 which
// assures complete override for any shorter string in
// a single occurrence.
//
{
long ew[16] = // Jval
{ 1, // 0
7, // 1
7, // 2
49, // 3
7, // 4
49, // 5
49, // 6
343, // 7
7, // 8
49, // 9
343, // 10
343, // 11
49, // 12
343, // 13
343, // 14
2401 }; // 15
feature_weight = ew[j];
};
#endif
// Zero out "Hits This Feature"
htf = 0;
totalfeatures++;
//
// calculate the precursors to the local probabilities;
// these are the hits[k] array, and the htf total.
//
skip_this_feature = 0;
for (k = 0; k < maxhash; k++)
{
long lh, lh0;
lh = hindex % (hashlens[k]);
if (lh==0) lh=1;
lh0 = lh;
hits[k] = 0;
while ( hashes[k][lh].key != 0
&& ( hashes[k][lh].hash != h1
|| hashes[k][lh].key != h2 ))
{
lh++;
if (lh >= hashlens[k]) lh = 1;
if (lh == lh0) break; // wraparound
};
if (hashes[k][lh].hash == h1 && hashes[k][lh].key == h2)
{
//
l = hashes[k][lh].value * feature_weight;
totalhits[k] = totalhits[k] + l; // remember totalhits
hits[k] = l * cpcorr [k]; // remember corr. hits
htf = htf + hits[k]; // and hits-this-feature
if (unique_mode)
{
if (seen_features[k][lh] > 0)
skip_this_feature = 1;
if (seen_features [k][lh] < 250)
seen_features[k][lh]++;
};
};
};
// now update the probabilities.
//
// NOTA BENE: there are a bunch of different ways to
// calculate local probabilities. The text below
// refers to an experiment that may or may not make it
// as the "best way".
//
// The hard part is this - what is the local in-class
// versus local out-of-class probability given the finding
// of a particular feature?
//
// I'm guessing this- the validity is the differntial
// seen on the feature (that is, fgood - fevil )
// times the entropy of that feature as seen in the
// corpus (that is,
//
// Pfeature*log2(Pfeature)
//
// =
// totalcount_this_feature
// --------------- * log2 (totalcount_this_feature)
// totalcount_all_features
//
// (note, yes, the last term seems like it should be
// relative to totalcount_all_features, but a bit of algebra
// will show that if you view fgood and fevil as two different
// signals, then you end up with + and - totalcount inside
// the logarithm parenthesis, and they cancel out.
// (the 0.30102 converts "log10" to "log2" - it's not
// a magic number, it's just that log2 isn't in glibc)
//
// HACK ALERT- this code here is still under development
// and should be viewed with the greatest possible
// suspicion. :=)
if (! skip_this_feature )
{
#ifdef STATIC_LOCAL_PROBABILITIES
//fgood = good;
//fevil = evil;
//pmag = (fgood - fevil) / ( 2 * (fgood + fevil + 1 ) );
//plnic = 0.5 - pmag;
//plic = 1.0 - plnic;
for (k = 0; k < maxhash; k++)
{
pltc[k] = 0.5 +
(( hits[k] - (htf - hits[k]))
/ (LOCAL_PROB_DENOM * (htf + 1.0)));
};
#endif
#ifdef LENGTHBASED_LOCAL_PROBABILIIES
// fgood = good;
//fevil = evil;
//pmag = (fgood - fevil) / ( (fgood + fevil + 1) * textlen);
//plnic = 0.5 - pmag;
//plic = 1.0 - plnic;
for (k = 0; k < maxhash; k++)
{
pltc[k] = 0.5 +
(( hits[k] - (htf - hits[k])) /
((htf + 1)*textlen));
};
#endif
// Now, some funky magic. Our formula above is
// mathematically correct (if features were
// independent- something we conveniently ignore.),
// but because of the limited word length in a real
// computer, we can quickly run out of dynamic range
// early in a CLASSIFY (P(S) indistinguishable from
// 1.00) and then there is no way to recover. To
// remedy this, we use two alternate forms of the
// formula (in Psucc and Pfail) and use whichever
// form that yields the smaller probability to
// recalculate the value of the larger.
//
// The net result of this subterfuge is a nonuniform
// representation of the probability, with a huge dynamic
// range in two places - near 0.0, and near 1.0 , which
// is where we actually care.
//
// update the global probabilities
// ptemp = ( pic*plic) / ((pic * plic) + (pnic * plnic));
// pnic = (pnic*plnic) / ((pic * plic) + (pnic * plnic));
// pic = ptemp;
// calculate renormalizer (the Bayesian formula's denomenator)
renorm = 0.0;
// now calculate the per-ptc numerators
plltc[0] = 0; // keep gcc from complaining about unused vars.
#ifdef USE_PEAK
for (k = 0; k < maxhash; k++)
renorm = renorm + (ptc[k]*plltc[k]);
if ( j == 0)
for (k = 0; k < maxhash; k++)
plltc[k] = pltc[k];
if ( j < 15)
for (k = 0; k < maxhash; k++)
if (pltc[k] > plltc[k])
plltc[k] = pltc[k];
if ( j == 15)
for (k = 0; k < maxhash; k++)
ptc[k] = (ptc[k] * plltc[k]) / renorm;
#else
for (k = 0; k < maxhash; k++)
renorm = renorm + (ptc[k]*pltc[k]);
for (k = 0; k < maxhash; k++)
ptc[k] = (ptc[k] * pltc[k]) / renorm;
#endif
//
// Arne's Optimization: If the low-order feature
// (that is, hashpipe[0]) isn't there, then there's no
// possibility of other features being there either.
// So, we can abort the rest of this classification
// for this particular file at least.
//
// Of course this is only legal to do if the file has
// not been microgroomed- otherwise the low-order
// (single-word token) feature might already
// have been deleted or microgroomed away, causing
// the higher-order feature to be missed.
#define ARNE_OPTIMIZATION
#ifdef ARNE_OPTIMIZATION
if (not_microgroom && htf == 0 && j == 0)
j = 999999;
#endif
// if we have underflow (any probability == 0.0 ) then
// bump the probability back up to 10^-308, or
// whatever a small multiple of the minimum double
// precision value is on the current platform.
//
for (k = 0; k < maxhash; k++)
if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN;
//
// part 2) renormalize to sum probabilities to 1.0
//
renorm = 0.0;
for (k = 0; k < maxhash; k++)
renorm = renorm + ptc[k];
for (k = 0; k < maxhash; k++)
ptc[k] = ptc[k] / renorm;
for (k = 0; k < maxhash; k++)
if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN;
//if (pnic < pic)
// { pic = 1.0 - pnic;} else { pnic = 1.0 - pic; };
if (internal_trace)
{
for (k = 0; k < maxhash; k++)
{
// fprintf (stderr, "ZZZ\n");
fprintf (stderr,
" poly: %d filenum: %d, HTF: %7.0f, hits: %7.0f, Pl: %6.4e, Pc: %6.4e\n",
j, k, htf, hits[k], pltc[k], ptc[k]);
};
};
//
// avoid the fencepost error for window=1
if ( MARKOVIAN_WINDOW_LEN == 1)
{
j = 99999;
};
};
};
};
}; // end of repeat-the-regex loop
classify_end_regex_loop:
// cleanup time!
// remember to let go of the fd's and mmaps
for (k = 0; k < maxhash; k++)
{
// close (hfds [k]);
crm_munmap_file ( (void *) hashes[k]);
// and let go of the seen_features array
if (seen_features [k]) free (seen_features[k]);
seen_features[k] = NULL;
};
// and one last chance to force probabilities into the non-stuck zone
//
// if (pic == 0.0 ) pic = DBL_MIN;
//if (pnic == 0.0 ) pnic = DBL_MIN;
for (k = 0; k < maxhash; k++)
if (ptc[k] < 10*DBL_MIN) ptc[k] = 10 * DBL_MIN;
if (user_trace)
{
for (k = 0; k < maxhash; k++)
fprintf (stderr, "Probability of match for file %ld: %f\n", k, ptc[k]);
};
//
tprob = 0.0;
for (k = 0; k < succhash; k++)
tprob = tprob + ptc[k];
if (svlen > 0)
{
char buf[1024];
double accumulator;
double remainder;
double overall_pR;
long m;
buf [0] = '\000';
accumulator = 10 * DBL_MIN;
for (m = 0; m < succhash; m++)
{
accumulator = accumulator + ptc[m];
};
remainder = 10 * DBL_MIN;
for (m = succhash; m < maxhash; m++)
{
remainder = remainder + ptc[m];
};
overall_pR = log10 (accumulator) - log10 (remainder);
// note also that strcat _accumulates_ in stext.
// There would be a possible buffer overflow except that _we_ control
// what gets written here. So it's no biggie.
if (tprob > 0.5000)
{
sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR );
}
else
{
sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR );
};
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
bestseen = 0;
for (k = 0; k < maxhash; k++)
if (ptc[k] > ptc[bestseen] ) bestseen = k;
remainder = 10 * DBL_MIN;
for (m = 0; m < maxhash; m++)
if (bestseen != m)
{
remainder = remainder + ptc[m];
};
sprintf (buf, "Best match to file #%ld (%s) "\
"prob: %6.4f pR: %6.4f \n",
bestseen,
hashname[bestseen],
ptc[bestseen],
(log10(ptc[bestseen]) - log10(remainder)));
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
sprintf (buf, "Total features in input file: %ld\n", totalfeatures);
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
for (k = 0; k < maxhash; k++)
{
long m;
remainder = 10 * DBL_MIN;
for (m = 0; m < maxhash; m++)
if (k != m)
{
remainder = remainder + ptc[m];
};
sprintf (buf,
"#%ld (%s):"\
" features: %ld, hits: %ld, prob: %3.2e, pR: %6.2f \n",
k,
hashname[k],
fcounts[k],
totalhits[k],
ptc[k],
(log10 (ptc[k]) - log10 (remainder) ) );
// strcat (stext, buf);
if (strlen(stext)+strlen(buf) <= stext_maxlen)
strcat (stext, buf);
};
// check here if we got enough room in stext to stuff everything
// perhaps we'd better rise a nonfatalerror, instead of just
// whining on stderr
if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0)
{
nonfatalerror( "WARNING: not enough room in the buffer to create "
"the statistics text. Perhaps you could try bigger "
"values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?",
" ");
};
crm_destructive_alter_nvariable (svrbl, svlen,
stext, strlen (stext));
};
//
// Free the hashnames, to avoid a memory leak.
//
for (i = 0; i < maxhash; i++)
free (hashname[i]);
if (tprob <= 0.5000)
{
if (user_trace)
fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n");
// and do what we do for a FAIL here
csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
return (0);
};
//
// all done... if we got here, we should just continue execution
if (user_trace)
fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
regcomp_failed:
return (0);
};
syntax highlighted by Code2HTML, v. 0.9.1