// crm_osb_hyperspace.c - Controllable Regex Mutilator, version v1.0
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may obtain a copy of the
// GPL by visiting the Free Software Foundations web site at
// www.fsf.org, and a copy is included in this distribution.
//
// Other licenses may be negotiated; contact the
// author for details.
//
// include some standard files
#include "crm114_sysincludes.h"
#include <math.h>
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
// The following sqrtf mumbojumbo because ppc_osx doesn't define sqrtf
// like it should.
#ifndef sqrtf
#define sqrtf(x) sqrt((x))
#endif
//////////////////////////////////////////////////////////////////
//
// Hyperspatial Classifiers
//
// The current statistical classifiers like Markovian and OSB form
// a single large cloud of points in hyperspace representing each
// feature ever found in a sample document. The known examples
// are averaged together and so each class is a single large
// cloud. This is equivalent to having each class (possibly
// containing hundreds of documents) averaging to a single point
// in feature hyperspace, and the classifier code then picks which
// of these averages is "closer" to the document.
//
// A hyperspatial classifier is diffierent - it keeps the document
// features separately rather than summing them all together.
// Each known example document retains it's identity and so each
// document acts independently as an example.
//
// Evaluation:
//
// 1) Closeness: the single known example vector closest
// to the unknown's vector is the winning class. Closeness is
// determined by Nth-root-of-sum-of-powers method
//
// 1A) Length Normalization: I.R. techniques suggest that
// normalizing the known document vector lengths out to a unit
// sphere of some relatively low dimensionality has a significant
// accuracy advantage, especially for "closeness". This is simply
// the exponent used in the Pythagorean distance theorem.
//
// 2) Dominance: Look for how documents either dominate or are
// dominated by (in the game-theory sense of a dominating
// strategy) the unknown text. Clearly, if a known document
// dominates an unknown text, that text is of the document's
// class. If the unknown dominates a known text, then there is a
// weaker implication of class membership.
//
// 2A) Dominance can be modulated by length normalization as well,
// so a very long document that contains lots of features will not
// dominate a short document very much at all.
//
// 3) Radiance: use a non-linear match such as radiance to determine
// distance to a class of N members. Calculate by putting unit candles
// on each known document vector endpoint; measure the power incident
// at the unknown document's vector endpoint. (that is, sum 1/R^2 of
// document endpoints). [[ this is different than closeness or dominance,
// because all members of a class help a little, no matter how far
// they are away.
//
// 3A) Raidiance with length normalization
//
//
////////////////////////////////////////////////////////////////////
//
// Data storage format for hyperspace classifiers.
//
// We sort the incoming features, so we can make linear rather than
// random probes into the database, and can store individual
// feature vectors sequentially (and save memory on vector IDs)
//
// Because we don't merge the entire learning base together, we
// drop down to 32-bit hashes, as the risk of a hash collision
// within the much smaller single-document files is much lower
// than it is in the big "all-together" hash files.
//
//
// Option A1 - inline storage - store the (32-bit? 64-bit?) hash
// codes as a sorted series. Use hash code of 0x0 as a separator.
// This doesn't allow merging of feature vectors.
//
// Option A2 - inline valued - Store the 32-bit sorted hashes as
// { hashcode, float_Weight } structs. This allows merging of
// multiple close features into a single feature vector when the
// file gets too long. (foreach feature vec pair, measure dominant
// overlaps or distances, and merge the two with either the closest
// match or the smallest dominant overlap)
//
// Option A3 - inline value with header count - Store the 32-bit
// sorted hashes as { hashcode, float_Weight), and add total count
// of merged vectors as 2nd value of header's 0x0 sentinel. Merge
// as before.
//
// Option B1 - Keep the current "global" file, add a fourth slot
// per bucket as the which-vector tag. Does not support easy
// merging.
//
// Option B2 - Keep the current "global" file, add a fourth bitmapped
// slot with 1 bit per vector (64? 128 bits?) Merge vectors when you
// run out of slots in the vector table. (advantage- can have multi
// classes in one table; put bitmap into the class. ) Advantage
// of hashing- you only touch what you need. Downside: Wastes most of he
// storage in the bitmap. Upside: can use bitmap to have multiple
// text classes in the same file.
//
// Option B3 - Use shared bit allocations, so classes use shared
// bit patterns. The bad news is that this generates phantom classes
//
// Option C1 - use MySQL to do the storage. Easy to implennt
//
// *****************************************************************
//
// For now, we will KISS, till we see how well this actually
// performs.
//
// Option K1: 64-bit hashes, sorted, 0x0/0x0 sentinels between
// class instances. No class merging. No weighting- if something
// occurs twice, put in two entries.
//
typedef struct mythical_hyperspace_cell {
unsigned long hash;
// unsigned long key;
} HYPERSPACE_FEATUREBUCKET_STRUCT;
////////////////////////////////////////////////////////////////////
//
// the hash coefficient table (hctable) should be full of relatively
// prime numbers, and preferably superincreasing, though both of those
// are not strict requirements.
//
static long hctable[] =
{ 1, 7,
3, 13,
5, 29,
11, 51,
23, 101,
47, 203,
97, 407,
197, 817,
397, 1637,
797, 3277 };
int hash_compare (void const *a, void const *b)
{
HYPERSPACE_FEATUREBUCKET_STRUCT *pa, *pb;
pa = (HYPERSPACE_FEATUREBUCKET_STRUCT *) a;
pb = (HYPERSPACE_FEATUREBUCKET_STRUCT *) b;
if (pa->hash < pb->hash)
return (-1);
if (pa->hash > pb->hash)
return (1);
// if (pa->key < pb->key)
// return (-1);
//if (pa->key > pb->key)
// return (1);
return (0);
}
//
// How to learn Osb_Hyperspacestyle - in this case, we'll include
// the single word terms that may not strictly be necessary.
//
int crm_expr_osb_hyperspace_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
// learn the osb_hyperspace transform of this input window as
// belonging to a particular type.
// learn <flags> (classname) /word/
//
long i, j, k;
long h; // h is our counter in the hashpipe;
char ptext[MAX_PATTERN]; // the regex pattern
long plen;
char htext[MAX_PATTERN]; // the hash name
char hashfilename [MAX_PATTERN]; // the hashfile name
FILE *hashf; // stream of the hashfile
long hlen;
long cflags, eflags;
struct stat statbuf; // for statting the hash file
HYPERSPACE_FEATUREBUCKET_STRUCT *hashes; // the hashes we'll sort
long hashcounts;
unsigned long hashpipe[OSB_BAYES_WINDOW_LEN+1];
//
regex_t regcb;
regmatch_t match[5]; // we only care about the outermost match
long textoffset;
long textmaxoffset;
long sense;
long microgroom;
long unique;
long use_unigram_features;
long fev;
// long made_new_file;
//
// unsigned long learns_index = 0;
// unsigned long features_index = 0;
statbuf.st_size = 0;
fev = 0;
if (internal_trace)
fprintf (stderr, "executing a Hyperspace LEARN\n");
// Keep the gcc compiler from complaining about unused variables
// i = hctable[0];
// extract the hash file name
crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
hlen = apb->p1len;
hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN);
// get the "this is a word" regex
crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
plen = apb->s1len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
// set our cflags, if needed. The defaults are
// "case" and "affirm", (both zero valued).
// and "microgroom" disabled.
cflags = REG_EXTENDED;
eflags = 0;
sense = +1;
if (apb->sflags & CRM_NOCASE)
{
cflags = cflags | REG_ICASE;
eflags = 1;
if (user_trace)
fprintf (stderr, "turning oncase-insensitive match\n");
};
if (apb->sflags & CRM_REFUTE)
{
sense = -sense;
/////////////////////////////////////
// Take this out when we finally support refutation
////////////////////////////////////
// fprintf (stderr, "Hyperspace Refute is NOT SUPPORTED YET\n");
//return (0);
if (user_trace)
fprintf (stderr, " refuting learning\n");
};
microgroom = 0;
if (apb->sflags & CRM_MICROGROOM)
{
microgroom = 1;
if (user_trace)
fprintf (stderr, " enabling microgrooming.\n");
};
unique = 0;
if (apb->sflags & CRM_UNIQUE)
{
unique = 1;
if (user_trace)
fprintf (stderr, " enabling uniqueifying features.\n");
};
use_unigram_features = 0;
if (apb->sflags & CRM_UNIGRAM)
{
use_unigram_features = 1;
if (user_trace)
fprintf (stderr, " using only unigram features.\n");
};
//
// grab the filename, and stat the file
// note that neither "stat", "fopen", nor "open" are
// fully 8-bit or wchar clean...
i = 0;
while (htext[i] < 0x021) i++;
j = i;
while (htext[j] >= 0x021) j++;
// filename starts at i, ends at j. null terminate it.
htext[j] = '\000';
strcpy (hashfilename, &htext[i]);
// Note that during a LEARN in hyperspace, we do NOT use the mmap of
// pre-existing memory. We just write to the end of the file instead.
// malloc up the unsorted hashbucket space
hashes = calloc (HYPERSPACE_MAX_FEATURE_COUNT,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT));
hashcounts = 0;
// put in a zero as the start marker.
hashes[hashcounts].hash = 0;
// hashes[hashcounts].key = 0;
hashcounts++;
// compile the word regex
//
if ( internal_trace)
fprintf (stderr, "\nWordmatch pattern is %s", ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
// Start by priming the pipe... we will shift to the left next.
// sliding, hashing, xoring, moduloing, and incrmenting the
// hashes till there are no more.
k = 0;
j = 0;
i = 0;
// No need to do any parsing of a box restriction.
// We got txtptr, txtstart, and txtlen from the caller.
//
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
// init the hashpipe with 0xDEADBEEF
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
{
hashpipe[h] = 0xDEADBEEF;
};
// and the big feature-generator loop... go through all of the text.
i = 0;
while (k == 0 && textoffset <= textmaxoffset
&& hashcounts < HYPERSPACE_MAX_FEATURE_COUNT )
{
long wlen;
long slen;
// do the regex
// slen = endpoint (= start + len)
// - startpoint (= curr textoffset)
// slen = txtlen;
slen = textmaxoffset - textoffset;
// if pattern is empty, extract non graph delimited tokens
// directly ([[graph]]+) instead of calling regexec (8% faster)
if (ptext[0] != '\0')
{
k = crm_regexec (®cb, &(txtptr[textoffset]),
slen, 5, match, 0, NULL);
}
else
{
k = 0;
// skip non-graphical characthers
match[0].rm_so = 0;
while (!isgraph (txtptr[textoffset + match[0].rm_so])
&& textoffset + match[0].rm_so < textmaxoffset)
match[0].rm_so ++;
match[0].rm_eo = match[0].rm_so;
while (isgraph (txtptr [textoffset + match[0].rm_eo])
&& textoffset + match[0].rm_eo < textmaxoffset)
match[0].rm_eo ++;
if ( match[0].rm_so == match[0].rm_eo)
k = 1;
};
if (k != 0 || textoffset > textmaxoffset)
goto learn_end_regex_loop;
{
wlen = match[0].rm_eo - match[0].rm_so;
memmove (tempbuf,
&(txtptr[textoffset + match[0].rm_so]),
wlen);
tempbuf[wlen] = '\000';
if (internal_trace)
{
fprintf (stderr,
" Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
i,
textoffset,
(long) match[0].rm_so,
(long) match[0].rm_eo,
wlen,
tempbuf);
};
if (match[0].rm_eo == 0)
{
nonfatalerror ( "The LEARN pattern matched zero length! ",
"\n Forcing an increment to avoid an infinite loop.");
match[0].rm_eo = 1;
};
// Shift the hash pipe down one
//
for (h = OSB_BAYES_WINDOW_LEN-1; h > 0; h--)
{
hashpipe [h] = hashpipe [h-1];
};
// and put new hash into pipeline
hashpipe[0] = strnhash (tempbuf, wlen);
if (internal_trace)
{
fprintf (stderr, " Hashpipe contents: ");
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
fprintf (stderr, " %ld", hashpipe[h]);
fprintf (stderr, "\n");
};
// and account for the text used up.
textoffset = textoffset + match[0].rm_eo;
i++;
// is the pipe full enough to do the hashing?
if (1) // we always run the hashpipe now, even if it's
// just full of 0xDEADBEEF. (was i >=5)
{
unsigned long h1;
unsigned long h2;
long th = 0; // a counter used for TSS tokenizing
long j;
//
// old Hash polynomial: h0 + 3h1 + 5h2 +11h3 +23h4
// (coefficients chosen by requiring superincreasing,
// as well as prime)
//
th = 0;
//
if (use_unigram_features == 1)
{
h1 = hashpipe[0];
if (h1 == 0) h1 = 0xdeadbeef;
h2 = 0xdeadbeef;
if (internal_trace)
fprintf (stderr, "Singleton feature : %ld\n", h1);
hashes[hashcounts].hash = h1;
hashcounts++;
}
else
{
for (j = 1;
j < OSB_BAYES_WINDOW_LEN;
j++)
{
h1 = hashpipe[0]*hctable[0] + hashpipe[j] * hctable[j<<1];
if (h1 ==0 ) h1 = 0xdeadbeef;
// h2 = hashpipe[0]*hctable[1] + hashpipe[j] * hctable[(j<<1)-1];
//if (h2 == 0) h2 = 0xdeadbeef;
h2 = 0xdeadbeef;
if (internal_trace)
fprintf (stderr, "Polynomial %ld has h1:%ld h2: %ld\n",
j, h1, h2);
hashes[hashcounts].hash = h1;
// hashes[hashcounts].key = h2;
hashcounts++;
};
};
};
};
}; // end the while k==0
learn_end_regex_loop:
if (ptext[0] != '\0') crm_regfree (®cb);
regcomp_failed:
// Now sort the hashes array.
//
qsort (hashes, hashcounts,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT),
&hash_compare );
hashcounts--;
if (user_trace)
fprintf (stderr, "Total hashes generated: %ld\n", hashcounts);
// And uniqueify the hashes array
//
i = 1;
j = 1;
if (unique)
{
while ( i <= hashcounts )
{
if (hashes[i].hash != hashes[i+1].hash
// || hashes[i].key != hashes[i+1].key )
)
{
hashes[j]= hashes[i];
j++;
};
i++;
};
hashcounts = j;
};
if (user_trace)
fprintf (stderr, "Unique hashes generated: %ld\n", hashcounts);
// store hash count of this document in the first bucket's .key slot
// hashes[hashcounts].key = hashcounts;
if (sense > 0)
{
/////////////////
// THIS PATH TO LEARN A TEXT - just append the hashes.
// and open the output file
/////////////////
// Now a nasty bit. Because there are probably retained hashes of the
// file, we need to force an unmap-by-name which will allow a remap
// with the new file length later on.
crm_force_munmap_filename (hashfilename);
if (user_trace)
fprintf (stderr, "Opening hyperspace file %s for append.\n",
hashfilename);
hashf = fopen ( hashfilename , "ab+");
if (user_trace)
fprintf (stderr, "Writing to hash file %s\n", hashfilename);
// and write the sorted hashes out.
fwrite (hashes, 1,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT) * hashcounts,
hashf);
fclose (hashf);
// let go of the hashes.
free (hashes);
}
else
{
/////////////////
// THIS IS THE UNLEARN PATH. VERY, VERY MESSY
// What we have to do here is find the set of hashes that matches
// the input most closely - and then remove it.
//
// For this, we want the single closest set of hashes. That
// implies highest radiance, so we use the same bit of code
// we use down in classification. We also keep start and
// end of the "best match" segment.
/////////////////
long beststart, bestend;
long thisstart, thislen, thisend;
double bestrad;
long wrapup;
double kandu, unotk, knotu, dist, radiance;
long k, u;
long file_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file_hashes;
// Get the file mmapped so we can find the closest match
//
{
struct stat statbuf; // for statting the hash file
// stat the file to get it's length
k = stat (hashfilename, &statbuf);
// does the file really exist?
if (k != 0)
{
nonfatalerror ("Refuting from nonexistent data cannot be done!"
" More specifically, this data file doesn't exist: ",
hashfilename);
return (0);
}
else
{
file_hashlens = statbuf.st_size;
file_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (hashfilename,
0, file_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file_hashlens = file_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
};
};
wrapup = 0;
k = u = 0;
beststart = bestend = 0;
bestrad = 0.0;
while (k < file_hashlens)
{
long cmp;
// Except on the first iteration, we're looking one cell
// past the 0x0 start marker.
kandu = 0;
knotu = unotk = 10 ;
u = 0;
thisstart = k;
if (internal_trace)
fprintf (stderr,
"At featstart, looking at %ld (next bucket value is %ld)\n",
file_hashes[thisstart].hash,
file_hashes[thisstart+1].hash);
while (wrapup == 0)
{
// it's an in-class feature.
cmp = hash_compare (&hashes[u], &file_hashes[k]);
if (cmp < 0)
{ // unknown less, step u forward
// increment on u, because maybe k will match next time
unotk++;
u++;
}
if (cmp == 0) // features matched.
// These aren't the features you're looking for.
// Move along, move along....
{
u++;
k++;
kandu++;
};
if (cmp > 0) // unknown is greater, step k forward
{
// increment on k, because maybe u will match next time.
knotu++;
k++;
};
// End of the U's? If so, skip k to the end marker
// and finish.
if ( u >= hashcounts - 1 )
{
while ( k < file_hashlens
&& file_hashes[k].hash != 0)
{
k++;
knotu++;
};
};
// End of the K's? If so, skip U to the end marker
if ( k >= file_hashlens - 1
|| file_hashes[k].hash == 0 ) // end of doc features
{
unotk += hashcounts - u;
};
// end of the U's or end of the K's? If so, end document.
if (u >= hashcounts - 1
|| k >= file_hashlens - 1
|| file_hashes[k].hash == 0) // this sets end-of-document
{
wrapup = 1;
k++;
};
};
// Now the per-document wrapup...
wrapup = 0; // reset wrapup for next file
// drop our markers for this particular document. We are now
// looking at the next 0 (or end of file).
thisend = k - 2;
thislen = thisend - thisstart;
if (internal_trace)
fprintf (stderr,
"At featend, looking at %ld (next bucket value is %ld)\n",
file_hashes[thisend].hash,
file_hashes[thisend+1].hash);
// end of a document- process accumulations
// Proper pythagorean (Euclidean) distance - best in
// SpamConf 2006 paper
dist = sqrtf (unotk + knotu) ;
// PREV RELEASE VER --> radiance = 1.0 / ((dist * dist )+ 1.0);
//
// This formula was the best found in the MIT `SC 2006 paper.
radiance = 1.0 / (( dist * dist) + .000001);
radiance = radiance * kandu;
radiance = radiance * kandu;
//radiance = radiance * kandu;
if (user_trace)
fprintf (stderr, "Feature Radiance %f at %ld to %ld\n",
radiance, thisstart, thisend);
if (radiance >= bestrad)
{
beststart = thisstart;
bestend = thisend;
bestrad = radiance;
}
};
// end of the per-document stuff - now chop out the part of the
// file between beststart and bestend.
// if (user_trace)
fprintf (stderr,
"Deleting feature from %ld to %ld (rad %f) of file %s\n",
beststart, bestend, bestrad, hashfilename);
// Deletion time - move the remaining stuff in the file
// up to fill the hole, then msync the file, munmap it, and
// then truncate it to the new, correct length.
{
long newhashlen, newhashlenbytes;
newhashlen = file_hashlens - (bestend + 1 - beststart);
newhashlenbytes=newhashlen * sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT);
memmove (&file_hashes[beststart],
&file_hashes[bestend+1],
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT)
* (file_hashlens - bestend) );
memset (&file_hashes[file_hashlens - (bestend - beststart)],
0,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT));
crm_force_munmap_filename (hashfilename);
if (internal_trace)
fprintf (stderr, "Truncating file to %ld cells ( %ld bytes)\n",
newhashlen,
newhashlenbytes);
k = truncate (hashfilename,
newhashlenbytes);
// fprintf (stderr, "Return from truncate is %ld\n", k);
}
};
// end of deletion path.
return (0);
}
// How to do a Osb_Bayes CLASSIFY some text.
//
int crm_expr_osb_hyperspace_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
// classify the sparse spectrum of this input window
// as belonging to a particular type.
//
// This code should look very familiar- it's cribbed from
// the code for LEARN
//
long i, j, k;
long h; // we use h for our hashpipe counter, as needed.
char ptext[MAX_PATTERN]; // the regex pattern
long plen;
// char ltext[MAX_PATTERN]; // the variable to classify
//long llen;
// the hash file names
char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN];
long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN;
long hlen;
// the match statistics variable
char stext [MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)];
long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100);
long slen;
char svrbl[MAX_PATTERN]; // the match statistics text buffer
long svlen;
long fnameoffset;
char fname[MAX_FILE_NAME_LEN];
long eflags;
long cflags;
long use_unique;
long not_microgroom = 1;
long use_unigram_features;
// The hashes we'll generate from the unknown text - where and how many.
HYPERSPACE_FEATUREBUCKET_STRUCT *unk_hashes;
long unk_hashcount;
struct stat statbuf; // for statting the hash file
unsigned long hashpipe[OSB_BAYES_WINDOW_LEN+1];
regex_t regcb;
regmatch_t match[5]; // we only care about the outermost match
long totalhits[MAX_CLASSIFIERS]; // actual total hits per classifier
long totalfeatures; // total features
double tprob; // total probability in the "success" domain.
double ptc[MAX_CLASSIFIERS]; // current running probability of this class
HYPERSPACE_FEATUREBUCKET_STRUCT *hashes[MAX_CLASSIFIERS];
long hashlens[MAX_CLASSIFIERS];
char *hashname[MAX_CLASSIFIERS];
long succhash;
long vbar_seen; // did we see '|' in classify's args?
long maxhash;
long fnstart, fnlen;
long fn_start_here;
long textoffset;
long textmaxoffset;
long bestseen;
long thistotal;
long cls;
long nfeats; // total features
long ufeats; // features in this unknown
long kfeats; // features in the known
// Basic match parameters
// These are computed intra-document, other stuff is only done
// at the end of the document.
float knotu; // features in known doc, not in unknown
float unotk; // features in unknown doc, not in known
float kandu; // feature in both known and unknown
// Distance is the pythagorean distance (sqrt) between the
// unknown and a known-class text; we choose closest. (this
// is (for each U and K feature, SQRT of count of U ~K + K ~ U)
float dist;
float closest_dist [MAX_CLASSIFIERS];
float closest_normalized [MAX_CLASSIFIERS];
//#define KNN_ON
#ifdef KNN_ON
#define KNN_NEIGHBORHOOD_SIZE 21
double top_n_val [KNN_NEIGHBORHOOD_SIZE];
long top_n_class [KNN_NEIGHBORHOOD_SIZE];
#endif
// The collapse vector is a low-dimensioned hyperspace
// that uses the low-order N bits of the hash to
// collapse the 2^32 dimensions into a reasonable space..
//
long collapse_vec_same[256];
long collapse_vec_diff[256];
// Dominance and Submission are related to Distance:
// - Dominance is per-known - how many of the features of the
// unknown also exist in the known (for each U, count of K)
// - Submission is how many of the features of the unknown do NOT
// exist in the known. (for each U, count of ~K)
// -- Dominance minus Submission is a figure of merit of match.
float max_dominance [MAX_CLASSIFIERS];
float dominance_normalized [MAX_CLASSIFIERS];
float max_submission [MAX_CLASSIFIERS];
float submission_normalized [MAX_CLASSIFIERS];
float max_equivalence [MAX_CLASSIFIERS];
float equivalence_normalized [MAX_CLASSIFIERS];
float max_des [MAX_CLASSIFIERS];
float des_normalized [MAX_CLASSIFIERS];
// Radiance - sum of the 1/r^2 radiances of each known text
// onto the unknown. Unlike Distance and Dominance, Radiance
// is a function of an entire class, not of a single example
// in the class. More radiance is a closer match.
// Flux is like Radiance, but the standard unit candle at each text
// is replaced by a flux source of intensity proportional to the
// number of features in the known text.
float radiance;
float class_radiance [MAX_CLASSIFIERS];
float class_radiance_normalized [MAX_CLASSIFIERS];
float class_flux [MAX_CLASSIFIERS];
float class_flux_normalized [MAX_CLASSIFIERS];
// try using just the top n matches
// for thk=0.1
// N=1 --> 4/500, N=2 --> 8/500, N=3--> 8 (same exact!), N=4-->8 (same)
// N=8--> 8 (same), N=32--> 8 (same) N=128 -->8 (same) N=1024-->8(same)
// for thk=0.5
//#define TOP_N 4
//float topn[MAX_CLASSIFIERS][TOP_N];
if (internal_trace)
fprintf (stderr, "executing a CLASSIFY\n");
// make the space for the unknown text's hashes
unk_hashes = calloc (HYPERSPACE_MAX_FEATURE_COUNT,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT));
unk_hashcount = 0;
unk_hashcount++;
// extract the variable name (if present)
// (we now get those fromt he caller)
// extract the hash file names
crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len);
hlen = apb->p1len;
hlen = crm_nexpandvar (htext, hlen, htext_maxlen);
// extract the "this is a word" regex
//
crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
plen = apb->s1len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
// extract the optional "match statistics" variable
//
crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
svlen = apb->p2len;
svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN);
{
long vstart, vlen;
crm_nextword (svrbl, svlen, 0, &vstart, &vlen);
memmove (svrbl, &svrbl[vstart], vlen);
svlen = vlen;
svrbl[vlen] = '\000';
};
if (user_trace)
fprintf (stderr, "Status out var %s (len %ld)\n",
svrbl, svlen);
// status variable's text (used for output stats)
//
stext[0] = '\000';
slen = 0;
// set our flags, if needed. The defaults are
// "case"
cflags = REG_EXTENDED;
eflags = 0;
if (apb->sflags & CRM_NOCASE)
{
cflags = REG_ICASE | cflags;
eflags = 1;
};
not_microgroom = 1;
if (apb->sflags & CRM_MICROGROOM)
{
not_microgroom = 0;
if (user_trace)
fprintf (stderr, " disabling fast-skip optimization.\n");
};
use_unique = 0;
if (apb->sflags & CRM_UNIQUE)
{
use_unique = 1;
if (user_trace)
fprintf (stderr, " unique engaged - repeated features are ignored \n");
};
use_unigram_features = 0;
if (apb->sflags & CRM_UNIGRAM)
{
use_unigram_features = 1;
if (user_trace)
fprintf (stderr, " using only unigram features. \n");
};
// compile the word regex
if ( internal_trace)
fprintf (stderr, "\nWordmatch pattern is %s", ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
// Now, the loop to open the files.
bestseen = 0;
thistotal = 0;
vbar_seen = 0;
maxhash = 0;
succhash = 0;
fnameoffset = 0;
// now, get the file names and mmap each file
// get the file name (grody and non-8-bit-safe, but doesn't matter
// because the result is used for open() and nothing else.
// GROT GROT GROT this isn't NULL-clean on filenames. But then
// again, stdio.h itself isn't NULL-clean on filenames.
if (user_trace)
fprintf (stderr, "Classify list: -%s- \n", htext);
fn_start_here = 0;
fnlen = 1;
while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1)))
{
crm_nextword (htext,
hlen, fn_start_here,
&fnstart, &fnlen);
if (fnlen > 0)
{
strncpy (fname, &htext[fnstart], fnlen);
fn_start_here = fnstart + fnlen + 1;
fname[fnlen] = '\000';
if (user_trace)
fprintf (stderr, "Classifying with file -%s- "\
"succhash=%ld, maxhash=%ld\n",
fname, succhash, maxhash);
if ( fname[0] == '|' && fname[1] == '\000')
{
if (vbar_seen)
{
nonfatalerror ("Only one ' | ' allowed in a CLASSIFY. \n" ,
"We'll ignore it for now.");
}
else
{
succhash = maxhash;
};
vbar_seen ++;
}
else
{
// be sure the file exists
// stat the file to get it's length
k = stat (fname, &statbuf);
// quick check- does the file even exist?
if (k != 0)
{
nonfatalerror ("Nonexistent Classify table named: ",
fname);
}
else
{
// file exists - do the open/process/close
//
hashlens[maxhash] = statbuf.st_size;
// mmap the hash file into memory so we can bitwhack it
hashes[maxhash] = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (fname,
0, hashlens[maxhash],
PROT_READ,
MAP_SHARED,
NULL);
if (hashes[maxhash] == MAP_FAILED )
{
nonfatalerror ("Couldn't memory-map the table file :",
fname);
}
else
{
//
// Check to see if this file is the right version
//
// long fev;
// if (hashes[maxhash][0].hash != 0 ||
// hashes[maxhash][0].key != 0)
// {
// fev =fatalerror ("The .css file is the wrong version! Filename is: ",
// fname);
// return (fev);
// };
// read it in (this does not work either);
if (0)
{
FILE *hashf;
if (user_trace)
fprintf (stderr, "Read-opening file %s\n", fname);
hashf= fopen (fname, "rb");
fread (hashes[maxhash], 1, hashlens[maxhash], hashf);
fclose (hashf);
};
// set this hashlens to the length in features instead
// of the length in bytes.
hashlens[maxhash] = hashlens[maxhash]
/ sizeof ( HYPERSPACE_FEATUREBUCKET_STRUCT );
hashname[maxhash] = (char *) malloc (fnlen+10);
if (!hashname[maxhash])
untrappableerror(
"Couldn't malloc hashname[maxhash]\n","We need that part later, so we're stuck. Sorry.");
strncpy(hashname[maxhash],fname,fnlen);
hashname[maxhash][fnlen]='\000';
maxhash++;
};
};
};
if (maxhash > MAX_CLASSIFIERS-1)
nonfatalerror ("Too many classifier files.",
"Some may have been disregarded");
};
};
//
// If there is no '|', then all files are "success" files.
if (succhash == 0)
succhash = maxhash;
if (user_trace)
fprintf (stderr, "Running with %ld files for success out of %ld files\n",
succhash, maxhash );
// sanity checks... Uncomment for super-strict CLASSIFY.
//
// do we have at least 1 valid .css files?
if (maxhash == 0)
{
nonfatalerror ("Couldn't open at least 1 .css files for classify().", "");
};
// do we have at least 1 valid .css file at both sides of '|'?
// if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2))
// {
// nonfatalerror (
// "Couldn't open at least 1 .css file per SUCC | FAIL category "
// " for classify().\n","Hope you know what are you doing.");
// };
//
// now all of the files are mmapped into memory,
// and we can do the polynomials and add up points.
i = 0;
j = 0;
k = 0;
thistotal = 0;
// we get txtstart and txtlen from the caller.
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
// init the hashpipe with 0xDEADBEEF
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
{
hashpipe[h] = 0xDEADBEEF;
};
totalfeatures = 0;
// stop when we no longer get any regex matches
// possible edge effect here- last character must be matchable, yet
// it's also the "end of buffer".
while (k == 0 && textoffset <= textmaxoffset
&& unk_hashcount < HYPERSPACE_MAX_FEATURE_COUNT )
{
long wlen;
long slen;
// do the regex
//
slen = textmaxoffset - textoffset;
// if pattern is empty, extract non graph delimited tokens
// directly ([[graph]]+) instead of calling regexec (8% faster)
if (ptext[0] != '\0')
{
k = crm_regexec (®cb, &(txtptr[textoffset]),
slen, 5, match, 0, NULL);
}
else
{
k = 0;
// skip non-graphical characthers
match[0].rm_so = 0;
while (!isgraph (txtptr[textoffset + match[0].rm_so])
&& textoffset + match[0].rm_so < textmaxoffset)
match[0].rm_so ++;
match[0].rm_eo = match[0].rm_so;
while (isgraph (txtptr [textoffset + match[0].rm_eo])
&& textoffset + match[0].rm_eo < textmaxoffset)
match[0].rm_eo ++;
if ( match[0].rm_so == match[0].rm_eo)
k = 1;
}
if (k != 0 || textoffset > textmaxoffset)
goto classify_end_regex_loop;
wlen = match[0].rm_eo - match[0].rm_so;
memmove (tempbuf,
&(txtptr[textoffset + match[0].rm_so]),
wlen);
tempbuf[wlen] = '\000';
if (internal_trace)
{
fprintf (stderr,
" Classify #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
i,
textoffset,
(long) match[0].rm_so,
(long) match[0].rm_eo,
wlen,
tempbuf);
};
if (match[0].rm_eo == 0)
{
nonfatalerror ( "The CLASSIFY pattern matched zero length! ",
"\n Forcing an increment to avoid an infinite loop.");
match[0].rm_eo = 1;
};
// slide previous hashes up 1
for (h = OSB_BAYES_WINDOW_LEN-1; h > 0; h--)
{
hashpipe [h] = hashpipe [h-1];
};
// and put new hash into pipeline
hashpipe[0] = strnhash ( tempbuf, wlen);
if (0)
{
fprintf (stderr, " Hashpipe contents: ");
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
fprintf (stderr, " %ld", hashpipe[h]);
fprintf (stderr, "\n");
};
// account for the text we used up...
textoffset = textoffset + match[0].rm_eo;
i++;
// is the pipe full enough to do the hashing?
if (1) // we init with 0xDEADBEEF, so the pipe is always full (i >=5)
{
int j;
unsigned th=0; // a counter used only in TSS hashing
unsigned long hindex;
unsigned long h1; //, h2;
//
th = 0;
//
if (use_unigram_features == 1)
{
h1 = hashpipe[0];
if (h1 == 0) h1 = 0xdeadbeef;
if (internal_trace)
fprintf (stderr, "Singleton feature : %ld\n", h1);
unk_hashes[unk_hashcount].hash = h1;
unk_hashcount++;
}
else
{
for (j = 1;
j < OSB_BAYES_WINDOW_LEN;
j++)
{
h1 = hashpipe[0]*hctable[0] + hashpipe[j] * hctable[j<<1];
if (h1 == 0) h1 = 0xdeadbeef;
// h2 = hashpipe[0]*hctable[1] + hashpipe[j] * hctable[(j<<1)-1];
//if (h2 == 0) h2 = 0xdeadbeef;
hindex = h1;
if (internal_trace)
fprintf (stderr, "Polynomial %d has h1:%ld \n",
j, h1);
unk_hashes[unk_hashcount].hash = h1;
// unk_hashes[unk_hashcount].key = h2;
unk_hashcount++;
};
};
};
}; // end of repeat-the-regex loop
classify_end_regex_loop:
////////////////////////////////////////////////////////////
//
// We now have the features. sort the unknown feature array so
// we can do fast comparisons against each document's hashes in
// the hyperspace vector files.
qsort (unk_hashes, unk_hashcount, sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT),
&hash_compare);
unk_hashcount--;
if (user_trace)
fprintf (stderr, "Total hashes in the unknown text: %ld\n", unk_hashcount);
// uniqueify the hashes array.
i = 1;
j = 1;
if (use_unique)
{
while (i <= unk_hashcount)
{
if (unk_hashes[i].hash != unk_hashes[i+1].hash
// || unk_hashes[i].key != unk_hashes[i+1].key)
)
{
unk_hashes[j] = unk_hashes[i];
j++;
};
i++;
};
j--;
unk_hashcount = j;
}
if (user_trace)
fprintf (stderr, "unique hashes generated: %ld\n", unk_hashcount);
totalfeatures = unk_hashcount;
// Now we have the uniqueified feature hashes of the unknown text
// ready for matching. For now, we will match with simple closeness,
// dominance, and radiosity but eventually we'll figure out what works
// best.
//
{
// Initialize this mess.
for (i = 0; i < MAX_CLASSIFIERS; i++)
{
closest_dist[i] = 1000000000.0;
closest_normalized[i] = 1000000000.0;
max_dominance[i] = 0.0;
dominance_normalized[i] = 0.0;
max_submission[i] = 0.0;
submission_normalized[i] = 0.0;
max_equivalence[i] = 0.0;
equivalence_normalized[i] = 0.0;
max_des[i] = 0.0;
des_normalized[i] = 0.0;
class_radiance[i] = 0.0;
class_radiance_normalized[i] = 0.0;
class_flux[i] = 0.0;
class_flux_normalized[i] = 0.0;
totalhits[i] = 0;
// for (j = 0; j < TOP_N; j++)
// topn[i][j] = 0.0;
};
#ifdef KNN_ON
// Initialize the KNN neighborhood.
for (i = 0; i < KNN_NEIGHBORHOOD_SIZE; i++);
{
top_n_val[i] = 0.0;
top_n_class[i] = -1;
}
#endif
if (internal_trace)
fprintf (stderr, "About to run classify loop with %ld files\n",
maxhash);
// Now run through each of the classifier maps
for (cls = 0; cls < maxhash; cls++)
{
unsigned long u, k, wrapup;
int cmp;
// Prepare for a new file
k = 0;
u = 0;
wrapup = 0;
// fprintf (stderr, "Header: %ld %lx %lx %lx %lx %lx %lx\n",
// cls,
// hashes[cls][0].hash,
// hashes[cls][0].key,
// hashes[cls][1].hash,
// hashes[cls][1].key,
// hashes[cls][2].hash,
// hashes[cls][2].key);
if (user_trace)
{
fprintf (stderr, "now processing file %ld\n", cls);
fprintf (stderr, "Hashlens = %ld\n", hashlens[cls]);
};
while (k < hashlens[cls] && hashes[cls][k].hash == 0)
k++;
while (k < hashlens[cls])
{ // This is the per-document level of the loop.
u = 0;
nfeats = 0;
ufeats = 0;
kfeats = 0;
unotk = 0.0;
knotu = 0.0;
kandu = 0.0;
wrapup = 0;
{
long j;
for ( j = 0; j < 256; j++)
{
collapse_vec_same [j] = 0;
collapse_vec_diff [j] = 0;
}
}
while (!wrapup)
{
nfeats++;
// it's an in-class feature.
cmp = hash_compare (&unk_hashes[u], &hashes[cls][k]);
if (cmp < 0)
{ // unknown less, step u forward
// increment on u, because maybe k will match next time
unotk++;
u++;
ufeats++;
collapse_vec_diff[(unk_hashes[u].hash & 0xFF)]++;
}
if (cmp == 0) // features matched.
// These aren't the features you're looking for.
// Move along, move along....
{
u++;
k++;
kandu++;
ufeats++;
kfeats++;
collapse_vec_same[(unk_hashes[u].hash & 0xFF)]++;
};
if (cmp > 0) // unknown is greater, step k forward
{
// increment on k, because maybe u will match next time.
knotu++;
k++;
kfeats++;
collapse_vec_diff[(hashes[cls][k].hash & 0xFF)]++;
};
// End of the U's? If so, skip k to the end marker
// and finish.
if ( u >= unk_hashcount - 1 )
{
while ( k < hashlens[cls]
&& hashes[cls][k].hash != 0)
{
k++;
kfeats++;
knotu++;
};
};
// End of the K's? If so, skip U to the end marker
if ( k >= hashlens[cls] - 1
|| hashes[cls][k].hash == 0 ) // end of doc features
{
unotk += unk_hashcount - u;
ufeats += unk_hashcount - u;
};
// end of the U's or end of the K's? If so, end document.
if (u >= unk_hashcount - 1
|| k >= hashlens[cls] - 1
|| hashes[cls][k].hash == 0) // this sets end-of-document
{
wrapup = 1;
k++;
};
};
// Now the wrapup...
wrapup = 0;
if (nfeats > 10)
{
// end of a document- process accumulations
// distance first;
// Pythagorean distance with unit dimensions
// DO NOT USE. Works like crap.
// dist = sqrt (knotu + unotk);
// The following distance function is "weakly founded"
// (it's the matrix determinant) but it seems to work
// MUCH better than Pythagorean distance for text
// classification.
// It can probably be extended to a larger matrix
// for more dimensions
// This formula (and then using radiance = 1/dist
// because this formula is already really distance^2;
// look at the denomenator if you don't believe me)
// trained with a thickness of 1, yelds 27 errors on
// the 10x SA corpus. Not bad. :)
// (5 / 500 on pass 1, unique, 0 thk)
// dist = (unotk * knotu + 1.0) / ( kandu * kandu + 1.0);
// actual (pythag) distance,-- note the sqrt
// PREVIOUS RELEASE VER -->>>
// dist = sqrtf ((unotk * knotu) / ( kandu * kandu + 1.0));
// Proper pythagorean (Euclidean) distance - best in
// SpamConf 2006 paper
dist = sqrtf (unotk + knotu) ;
// treat kandu better... count matches like mismatches
// 5/500 pass 1 (0 thk)
// dist = (unotk * knotu + 1.0) / ( 4 * (kandu * kandu) + 1.0);
// Something really simple - similarity. -Totally hopeless.
// dist = 1/(kandu + 1);
// unotk? 10 in last 500 of pass 1.(unigram)
// dist = unotk;
// knotu? Also 10 in last 500 of pass 1 (unigram) (0 thk)
// And also 10 in 500 @ pass1 (OSB) (0 thk)
//dist = knotu;
// knotu + unotk? 8 /500 @ pass1 (unigram, 0 thk)
// 6 / 600 @ pass1 (unique, 0 thk)
// dist = knotu + unotk;
// knotu * unotk 20 / 500 unique. 20/500 unigram 0 thk)
//dist = knotu * unotk;
// How important is the kandu term?
// result - 29 errors in first pass alone! Sucks!
//dist = (unotk * knotu + 1.0) / ( (kandu * kandu * kandu) + 1.0)
// Not as good as the above
// dist = (unotk + knotu) / (kandu + 1);
// This one's awful....
// dist = sqrt (unotk + knotu);
// this one is not much better.... slow to learn
// dist=(unotk * knotu + 1.0) / (kandu * kandu * kandu + 1.0);
// Actual determinant-based distance. works like crap
// dist = fmax ( 0.00000000001,
// (unotk * knotu) - (kandu * kandu));
// Collapse-vector-based distance:
//{
// long i;
// float num, denom;
// num = 0;
// denom = 0;
// dist = 0.0 ;
// for (i = 0; i < 255; i++)
// {
// num = (collapse_vec_diff[i]*collapse_vec_diff[i]);
// denom = (collapse_vec_same[i]*collapse_vec_same[i]);
// dist += num / ( denom + 1.0);
// };
// // dist = (sqrtf (num) / sqrtf(denom));
//}
// BOGUS ERR
//if ( dist < closest_dist [cls])
// closest_dist[cls] = dist;
// normalized distance - make each dimension of
// distance of a text be sqrt(1/doc_feat_count), so the
// text is somewhere on a unit hypersphere. Then
// calculate the distance between the texts based on
// that unit length in hyperspace. The problem is
// that this puts far too much emphasis on the text
// lengths, so we bastardize it.
//dist_normalized = dist / nfeats;
//if ( dist_normalized < closest_normalized[cls])
// closest_normalized[cls] = dist_normalized;
// dominance and submission
//dominance = knotu;
//equivalence = kandu;
//submission = unotk;
//if (dominance > max_dominance[cls])
// max_dominance[cls] = dominance;
//if (equivalence > max_equivalence[cls])
// max_equivalence[cls] = equivalence;
//if (submission > max_submission[cls])
// max_submission[cls] = submission;
//if (dominance/nfeats > dominance_normalized[cls])
// dominance_normalized[cls] = dominance/ nfeats;
//if (equivalence/nfeats > equivalence_normalized[cls])
// equivalence_normalized [cls] = equivalence/nfeats;
//if (submission/nfeats > submission_normalized[cls])
// submission_normalized[cls] = submission/sqrt(nfeats);
//des = equivalence * equivalence / (dominance * submission);
//if (des > max_des[cls])
// max_des[cls] = des;
//des_nrm = des / nfeats;
//if (des_nrm > des_normalized[cls])
// des_normalized[cls] = des_nrm;
// radiance and flux; - note that these are
// _cumulative_ over a class, not for te best member,
// so it's a +=, not if-closer-then-update.
// Radiance = inverse (distance) works pretty well...)
// radiance = 1.0 / (dist * dist + 1);
// radiance = des;
// radiance = 1.0 / (sqrt (dist) + .0000000000000000000001);
// this gives 34 errors and about 1 Mbyte
// radiance = 1.0 / (dist + .000000000000000000000001);
// this gives 27 errors and 1.3 mbytes
// radiance = 1.0 / (dist + 0.01);
// version for use with square-rooted distance
// PREV RELEASE VER --> radiance = 1.0 / ((dist * dist )+ 1.0);
//
// This formula was the best found in the MIT `SC 2006 paper.
radiance = 1.0 / (( dist * dist) + .000001);
radiance = radiance * kandu;
radiance = radiance * kandu;
//radiance = radiance * kandu;
// bad radiance design - based on similarity only.
// radiance = kandu;
// HACK HACK HACK - this is based on the empirical
// ratio of an average 4.69:1 between features in
// the correct class vs. features in the incorrect class.
//radiance = ( ( kandu ) - (knotu + unotk));
//if (radiance < 0.0) radiance = 0;
// this gives 25 errors in 1st 3 passes... skipping
// radiance = 1.0 / ( dist + 10.0);
// fprintf (stderr, "%1ld %10ld %10ld %10ld ",
// cls, kandu, unotk, knotu);
//fprintf (stderr, "%15.5f %15.5f\n", dist, radiance);
class_radiance[cls] += radiance;
class_radiance_normalized[cls] += radiance / nfeats;
// flux is a normalized radiance.
//flux = 1 / (dist + 1) ;
//flux = 100000.0 / ( sqrt (dist) + .000000000000001) ;
//class_flux[cls] += flux;
//class_flux_normalized[cls] += flux / nfeats;
// And for fun, we also keep totalhits
//BOGUS FAULT
totalhits[cls] += kandu;
#ifdef KNN_ON
// Do the TopN updates; this is a swap-sort.
//
{
float local_val;
float local_class;
float temp_val;
float temp_class;
long i;
local_val = 1 / (dist + 0.000000001) ;
local_class = cls;
for (i = 0; i < KNN_NEIGHBORHOOD_SIZE -1; i++)
{
if (local_val > top_n_val[i])
{
temp_val = top_n_val[i];
temp_class = top_n_class[i];
top_n_val[i] = local_val;
top_n_class[i] = local_class;
local_val = temp_val;
local_class = temp_class;
};
};
};
#endif
};
}; // end per-document stuff
// fprintf (stderr, "exit K = %ld\n", k);
};
// TURN THIS ON IF YOU WANT TO SEE ALL OF THE HUMILIATING DEAD
// ENDS OF EVALUATIONS THAT DIDN'T WORK OUT WELL....
if (internal_trace)
//if (1)
for (i = 0; i < maxhash; i++)
fprintf (stderr,
"f: %ld dist %f %f \n"
"dom: %f %f equ: %f %f sub: %f %f\n"
"DES: %f %f \nrad: %f %f flux: %f %f\n\n",
i,
closest_dist[i], closest_normalized[i],
max_dominance[i], dominance_normalized[i],
max_equivalence[i], equivalence_normalized[i],
max_submission[i], submission_normalized[i],
max_des[i], des_normalized[i],
class_radiance[i], class_radiance_normalized[i],
class_flux[i], class_flux_normalized[i]);
};
//////////////////////////////////////////////
//
// Class radiance via top-N documents?
//
#ifdef KNN_ON
{
long i;
long j;
for (i = 0; i < maxhash; i++)
{
class_radiance[i] = 0.0;
}
for (j = 0; j < KNN_NEIGHBORHOOD_SIZE; j++)
{
if (top_n_class[j] >= 0)
{
// class_radiance[top_n_class[j]] ++;
// class_radiance[top_n_class[j]] += KNN_NEIGHBORHOOD_SIZE - j;
class_radiance[top_n_class[j]] += top_n_val[j];
};
}
}
#endif
///////////////////////////////////////////////////////
//
// Now we have the relative match values in closest_dist,
// class_radiance, and class_radiance_normalized. We
// must choose one. For now, use class_radiance.
//
// To translate from radiance to probability, we just renormalize...
{
tprob = 0.0;
for (i = 0; i < maxhash; i++)
{
// the following works OK. But we gotta try something else.
ptc[i] = class_radiance [i] ;
// ptc[i] = 1/closest_dist[i];
// ptc[i] = max_des[i];
// ptc[i] = class_flux[i];
if (ptc[i] < 0.000000000000000000000001)
ptc[i] = 0.000000000000000000000001;
tprob = tprob + ptc[i];
};
for (i = 0; i < maxhash; i++)
ptc[i] = ptc[i] / tprob;
if (user_trace)
{
for (k = 0; k < maxhash; k++)
fprintf (stderr, "Match for file %ld: radiance: %f prob: %f\n",
k, class_radiance[k], ptc[k]);
};
}
//
tprob = 0.0;
for (k = 0; k < succhash; k++)
tprob = tprob + ptc[k];
if (svlen > 0)
{
char buf[1024];
double accumulator;
double remainder;
double overall_pR;
long m;
buf [0] = '\000';
accumulator = 1000 * DBL_MIN;
for (m = 0; m < succhash; m++)
{
accumulator = accumulator + ptc[m];
};
remainder = 1000 * DBL_MIN;
for (m = succhash; m < maxhash; m++)
if (bestseen != m)
{
remainder = remainder + ptc[m];
};
// overall_pR = 10 * (log10 (accumulator) - log10 (remainder));
//overall_pR = 10 * (accumulator - remainder);
overall_pR = 10 * (log10 (accumulator) - log10(remainder));
// Rescaled for +/-10 pR units of optimal thick training threshold.
//overall_pR = 250 * (log10 (accumulator) - log10 (remainder));
// going to 1500+ as 250x will do. A little much, so we will
// rescale yet again, for +/- 1.0 units.
//overall_pR = 25 * (log10 (accumulator) - log10(remainder));
// note also that strcat _accumulates_ in stext.
// There would be a possible buffer overflow except that _we_ control
// what gets written here. So it's no biggie.
if (tprob > 0.5000)
{
sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR );
}
else
{
sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR );
};
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
bestseen = 0;
for (k = 0; k < maxhash; k++)
if (ptc[k] > ptc[bestseen] ) bestseen = k;
remainder = 1000 * DBL_MIN;
for (m = 0; m < maxhash; m++)
if (bestseen != m)
{
remainder = remainder + ptc[m];
};
sprintf (buf, "Best match to file #%ld (%s) "\
"prob: %6.4f pR: %6.4f \n",
bestseen,
hashname[bestseen],
ptc[bestseen],
10 * (log10(ptc[bestseen]) - log10(remainder))
// Rescaled for +/- 10.0 thick training threshold optimal
//250 * (log10(ptc[bestseen]) - log10(remainder))
// 10 * (ptc[bestseen] - remainder)
// rescaled yet again for pR from 1500 to 150
// 25 * (log10(ptc[bestseen]) - log10(remainder))
);
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
sprintf (buf, "Total features in input file: %ld\n", totalfeatures);
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
for (k = 0; k < maxhash; k++)
{
long m;
remainder = 1000 * DBL_MIN;
for (m = 0; m < maxhash; m++)
if (k != m)
{
remainder = remainder + ptc[m];
};
sprintf (buf,
"#%ld (%s):"\
" features: %ld, hits: %ld, radiance: %3.2e, prob: %3.2e, pR: %6.2f \n",
k,
hashname[k],
hashlens[k],
totalhits[k],
class_radiance[k],
ptc[k],
10 * (log10 (ptc[k]) - log10 (remainder) ) );
// Rescaled for +/- 10 pR units optimal thick threshold
//250 * (log10 (ptc[k]) - log10 (remainder) ) );
// rescaled yet again for pR from 1500 to 150
//25 * (log10 (ptc[k]) - log10 (remainder) ) );
// strcat (stext, buf);
if (strlen(stext)+strlen(buf) <= stext_maxlen)
strcat (stext, buf);
};
// check here if we got enough room in stext to stuff everything
// perhaps we'd better rise a nonfatalerror, instead of just
// whining on stderr
if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0)
{
nonfatalerror( "WARNING: not enough room in the buffer to create "
"the statistics text. Perhaps you could try bigger "
"values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?",
" ");
};
crm_destructive_alter_nvariable (svrbl, svlen,
stext, strlen (stext));
};
// cleanup time!
// remember to let go of the fd's and mmaps
for (k = 0; k < maxhash; k++)
{
// close (hfds [k]);
crm_munmap_file ((void *) hashes[k]);
};
// and let go of the regex buffery
if (ptext[0] != '\0') crm_regfree (®cb);
// and drop the list of unknown hashes
free (unk_hashes);
//
// Free the hashnames, to avoid a memory leak.
//
for (i = 0; i < maxhash; i++) {
///////////////////////////////////////
// ! XXX SPAMNIX HACK!
//! -- by Barry Jaspan
//
//! Without the statement "k = i" (which should have no effect),
//! the for statement crashes on MacOS X when compiled with gcc
//! -O3. I've examined the pointers being freed, and they appear
//! valid. I've run this under Purify on Windows, valgrind on
//! Linux, and efence on MacOS X; none report a problem here
//! (though valgrind reports umrs in the VHT code; see my post to
//! crm114-developers). I've also examined the assembler produced
//! with various changes here and, though I don't speak PPC, w/o
//! the k = i it is qualitatively different.
//!
//! For now, I'm concluding it is an optimizer bug, and fixing it
//! with the "k = i" statement. This occurs on MacOS X 10.2 with
//! Apple Computer, Inc. GCC version 1175, based on gcc version
//! 3.1 20020420 (prerelease).
//
k = i;
free (hashname[i]);
}
if (tprob > 0.5000)
{
// all done... if we got here, we should just continue execution
if (user_trace)
fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
}
else
{
if (user_trace)
fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n");
// and do what we do for a FAIL here
csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
return (0);
};
//
regcomp_failed:
return (0);
};
syntax highlighted by Code2HTML, v. 0.9.1