//crm_fscm.c //sequence correlation monster // by Joe Langeway derived from crm_bit_entropy.c and produced for the crm114 so: // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact Bill for details. // ///////////////////////////////////////////////////////////////////// // // crm_fscm.c - //fast substring compression matcher // // Original spec by Bill Yerazunis, original code by Joe Langeway, // recode for CRM114 use by Bill Yerazunis. // // This code section (crm_scm and subsidiary routines) is // dual-licensed to both William S. Yerazunis and Joe Langeway, // including the right to reuse this code in any way desired, // including the right to relicense it under any other terms as // desired. // ////////////////////////////////////////////////////////////////////// /* This file is part of on going research and should not be considered a finished product, a reliable tool, an example of good software engineering, or a reflection of any quality of Joe's besides his tendancy towards long hours. Here's what's going on: We learn documents by copying them into a text space of stored documents and caching every contiguous three characters with there offset so that we can find common substrings relatively fast. The text space was set to 1MB for TREC. Associated with every character position in the text space is an index (s->indeces) to the correspond prefix node. Prefix nodes point to the postion in the stored text of particular three digit prefixes. Every prefix node of the same three charecter prefix belongs to a chain pointed to by a hash node. The first prefix node of a chain points to the hash node's index with it's prev field with a (-i - 1) to mark it as the first node. Hashnodes are chained, the first in a chain is pointed to by the hash root (s->hash_root) so that we can mod a key by n_bytes and look up the first node in the chain. The first hash node in a chain points back to it's position in the hash root with a -i - 1. The average length of chains should be close to one and never more than two. When we finally classify we score documents as belonging to a class by awarding pionts for every nonoverlapping substring match we can make, longest first. Currently it is (length - 2)^1.5 points per match. That seems optimal. */ // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are si zed to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; #define NULL_INDEX 2147483647 typedef struct mythical_scm_header { long n_bytes; //this is the length of rememebred text, the number //of hashbuckets, and the size of the hash root long n_trains; //how many times have we had to train this guy long n_features; //number of bytes we've eaten up to n_bytes long free_hash_nodes; //index of first in free chain long free_prefix_nodes; //index of first in free chain long hash_root_offset; long hash_offset; long prefix_offset; long text_offset; long text_pos; //we wrap around when we fill the buffer long indeces_offset; } SCM_HEADER_STRUCT; //nodes for our hash table of three character prefixes typedef struct mythical_hash { char prefix_text[4]; //we make it 4 bytes so thing align nicely unsigned long key; //hash key of the three charactor prefix long prev, next; //in hash chain long first; //first prefix node } HASH_STRUCT; //one node for every contiguous three characters in stored text typedef struct mythical_prefix { long offset; long prev, next; } PREFIX_STRUCT; //pointers to runtime structures to pass around so that we don't have a // bizillion globals typedef struct mythical_scm_state { SCM_HEADER_STRUCT *header; //we dup some stuff from the header to shorten things up long *text_pos, n_bytes, *free_hash_nodes, *free_prefix_nodes; //s->hash_root[key % n_bytes] is the first hash_node in the chain that key //would be in long *hash_root; //s->hashee[i] is the ith hash node HASH_STRUCT *hashee; //and so forth... PREFIX_STRUCT *prefix; char *text; char *learnfilename; //the classifier file we're working on long *indeces; } SCM_STATE_STRUCT; //this is the one global that can be passed in from crm. //it determines the amount of previous text we remember //1 megabyte gives good accuracy and enough speed for TREC //512K is twice as fast but ever so slightly less accurate long n_bytes = 1048576; // this is also switched on by internal_trace at both entry points so it is // safe to keep int do_struct_audit = 0; //fill in a fresh classifier state assuming that the space is already allocated // and mapped static void make_scm_state(SCM_STATE_STRUCT *s, void *space) { SCM_HEADER_STRUCT *h = space; char *o = space; long i; h->n_bytes = n_bytes; h->n_trains = 0; h->n_features = 0; h->free_prefix_nodes = 0; h->free_hash_nodes = 0; h->hash_root_offset = sizeof(SCM_HEADER_STRUCT); h->hash_offset = sizeof(SCM_HEADER_STRUCT) + n_bytes * sizeof(long); h->prefix_offset = sizeof(SCM_HEADER_STRUCT) + n_bytes * (sizeof(long) + sizeof(HASH_STRUCT)); h->text_offset = sizeof(SCM_HEADER_STRUCT) + n_bytes * (sizeof(long) + sizeof(HASH_STRUCT) + sizeof(PREFIX_STRUCT)); h->text_pos = 0; h->indeces_offset = sizeof(SCM_HEADER_STRUCT) + n_bytes * (sizeof(long) + sizeof(HASH_STRUCT) + sizeof(PREFIX_STRUCT) + sizeof(char)); s->header = h; s->text_pos = &h->text_pos; s->n_bytes = h->n_bytes; s->free_hash_nodes = &h->free_hash_nodes; s->free_prefix_nodes = &h->free_prefix_nodes; s->hash_root = (long *) &o[h->hash_root_offset]; s->hashee = (HASH_STRUCT *) &o[h->hash_offset]; s->prefix = (PREFIX_STRUCT *) &o[h->prefix_offset]; s->text = (char *) &o[h->text_offset]; s->indeces = (long *) &o[h->indeces_offset]; for(i = 0; i < n_bytes; i++) { s->hash_root[i] = NULL_INDEX; s->text[i] = '\0'; s->hashee[i].key = 0; s->hashee[i].next = i + 1; s->hashee[i].first = NULL_INDEX; s->prefix[i].offset = NULL_INDEX; s->prefix[i].next = i + 1; s->indeces[i] = NULL_INDEX; } s->hashee[n_bytes - 1].next = NULL_INDEX; s->prefix[n_bytes - 1].next = NULL_INDEX; } //map a classifier state, make a new one if need be static void map_file(SCM_STATE_STRUCT *s, char *filename) { struct stat statbuf; if(stat (filename, &statbuf)) { long i, filesize; FILE *f; void *space; filesize = sizeof(SCM_HEADER_STRUCT) + n_bytes * ( sizeof(long) + sizeof(HASH_STRUCT) + sizeof(PREFIX_STRUCT) + sizeof(char) + sizeof(long) ); f = fopen(filename, "wb"); i = filesize + 1024; while(i--) fputc('\0', f); fclose(f); space = crm_mmap_file (filename, 0, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if(!space) { nonfatalerror("failed to do mmap of freshly created file", filename); s->header = NULL; return; } make_scm_state(s, space); } else { char *o; SCM_HEADER_STRUCT *h; s->header = crm_mmap_file (filename, 0, statbuf.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if(!s->header) { nonfatalerror("failed to do mmap of existing file", filename); return; } o = (char*)s->header; h = s->header; s->text_pos = &h->text_pos; s->n_bytes = h->n_bytes; s->free_hash_nodes = &h->free_hash_nodes; s->free_prefix_nodes = &h->free_prefix_nodes; s->hash_root = (long *) &o[h->hash_root_offset]; s->hashee = (HASH_STRUCT *) &o[h->hash_offset]; s->prefix = (PREFIX_STRUCT *) &o[h->prefix_offset]; s->text = (char *) &o[h->text_offset]; s->indeces = (long *) &o[h->indeces_offset]; } s->learnfilename = filename; } static void unmap_file(SCM_STATE_STRUCT *s) { crm_munmap_file ((void *) s->header); #ifdef POSIX // Because mmap/munmap doesn't set atime, nor set the "modified" // flag, some network filesystems will fail to mark the file as // modified and so their cacheing will make a mistake. // // The fix is to do a trivial read/write on the .css ile, to force // the filesystem to repropagate it's caches. // { int hfd; // hashfile fd FEATURE_HEADER_STRUCT foo; hfd = open (s->learnfilename, O_RDWR); read (hfd, &foo, sizeof(foo)); lseek (hfd, 0, SEEK_SET); write (hfd, &foo, sizeof(foo)); close (hfd); } #endif } //are the three charecters at b the same as the three characters in the stored // text? wrapping around the stored text buffer if need be static int match_prefix(SCM_STATE_STRUCT *s, long a, char *b) { if(s->text[a++] != b[0]) return 0; if(a == s->n_bytes) a = 0; if(s->text[a++] != b[1]) return 0; if(a == s->n_bytes) a = 0; if(s->text[a] != b[2]) return 0; return 1; } //whats the hashcode of the three characters at spot a in the stored text static unsigned long get_text_hash(SCM_STATE_STRUCT *s, long a) { char b[3]; b[0] = s->text[a++]; if(a == s->n_bytes) a = 0; b[1] = s->text[a++]; if(a == s->n_bytes) a = 0; b[2] = s->text[a]; return strnhash(b, 3); } //get the three characters from the stored text static void copy_prefix(SCM_STATE_STRUCT *s, long a, char *b) { b[0] = s->text[a++]; if(a == s->n_bytes) a = 0; b[1] = s->text[a++]; if(a == s->n_bytes) a = 0; b[2] = s->text[a]; } //check every global structure for consistency, if there's a bug, this will // find it! We just go into a loop on error here because the only apropriate // action to be taken is to attach the debugger static int audit_structs(SCM_STATE_STRUCT *s) { long i, j, k; long n_p = 0, n_h = 0; for(i = 0; i < s->n_bytes; i++) if(s->hash_root[i] != NULL_INDEX) { if(s->hashee[ s->hash_root[i] ].prev != -i - 1) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } for(j = s->hash_root[i]; j != NULL_INDEX; j = s->hashee[j].next) { n_h++; if(s->hashee[j].next != NULL_INDEX && s->hashee[ s->hashee[j].next ].prev != j) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } if(s->prefix[ s->hashee[j].first ].prev != -j - 1) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } for(k = s->hashee[j].first; k != NULL_INDEX; k = s->prefix[k].next) { n_p++; if(s->prefix[k].next != NULL_INDEX && s->prefix[ s->prefix[k].next].prev != k) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } if(!match_prefix(s, s->prefix[k].offset, s->hashee[j].prefix_text)) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } } } }; for( i = *s->free_hash_nodes; i != NULL_INDEX && n_h < s->n_bytes + 1; i = s->hashee[i].next ) n_h++; if(n_h != s->n_bytes) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } for( i = *s->free_prefix_nodes; i != NULL_INDEX && n_p < s->n_bytes + 1; i = s->prefix[i].next ) n_p++; if(n_p != s->n_bytes) { fatalerror("FSCM: INCONSISTENT INTERNAL STATE!", "Please submit bug" " report"); return -1; } return 0; } //insert the three character prefix starting at postion t into the tables for // fast lookup in the future. This happens to every contiguous three characters // in a document during learning, right after concatenating it into the stored // text // static long add_prefix(SCM_STATE_STRUCT *s, long t) { unsigned long key = get_text_hash(s, t); long i = s->hash_root[key % s->n_bytes], j; //find the proper hashnode or set i to NULL_INDEX while(!(i == NULL_INDEX || ( s->hashee[i].key == key && match_prefix(s, t, s->hashee[i].prefix_text) ) ) ) i = s->hashee[i].next; if(i == NULL_INDEX) { //we need to insert a new hashnode //grab a fesh hash struct i = *s->free_hash_nodes; *s->free_hash_nodes = s->hashee[i].next; //insert it at front of chain j = key % s->n_bytes; s->hashee[i].prev = -j - 1; s->hashee[i].next = s->hash_root[j]; if( s->hash_root[j] != NULL_INDEX ) s->hashee[ s->hash_root[j] ].prev = i; s->hash_root[j] = i; //fill in key and prefix s->hashee[i].key = key; copy_prefix(s, t, s->hashee[i].prefix_text); //grab fresh prefix node and make it start of chain from this hash j = s->hashee[i].first = *s->free_prefix_nodes; *s->free_prefix_nodes = s->prefix[j].next; s->prefix[j].prev = -i - 1; s->prefix[j].next = NULL_INDEX; s->prefix[j].offset = t; //increment feature count for this classifier s->header->n_features++; } else { //i is the proper hashnode to stick a new prefix to so grab a fresh prefix j = *s->free_prefix_nodes; *s->free_prefix_nodes = s->prefix[j].next; //insert it at beginning of chain s->prefix[j].next = s->hashee[i].first; s->prefix[j].prev = -i - 1; if( s->prefix[j].next != NULL_INDEX ) s->prefix[ s->prefix[j].next ].prev = j; s->hashee[i].first = j; s->prefix[j].offset = t; //increment feature count for this classifier s->header->n_features++; } return j; } //we're writing over the position in the text corresponding to this prefix, so // remove it from tables or we'll have big trouble! static void delete_prefix(SCM_STATE_STRUCT *s, long p) { long i; if(s->prefix[p].prev < 0) { //this prefix is the start of a chain, we need to update a hashnode i = -(s->prefix[p].prev) - 1; if(s->prefix[p].next == NULL_INDEX) { //this prefix was the only one attached to the hash so delete the hash too if(s->hashee[i].prev < 0) //was this hash the first in its chain? s->hash_root[-(s->hashee[i].prev) - 1] = s->hashee[i].next; else s->hashee[ s->hashee[i].prev ].next = s->hashee[i].next; if(s->hashee[i].next != NULL_INDEX) s->hashee[ s->hashee[i].next ].prev = s->hashee[i].prev; //free hash node s->hashee[i].next = *s->free_hash_nodes; *s->free_hash_nodes = i; //free prefix node s->prefix[p].offset = NULL_INDEX; s->prefix[p].prev = NULL_INDEX; s->prefix[p].next = *s->free_prefix_nodes; *s->free_prefix_nodes = p; //decrement feature count for this classifier s->header->n_features--; return; } else { //this hash node has other prefixes so just update hash node s->hashee[i].first = s->prefix[p].next; //and free prefix node s->prefix[s->prefix[p].next].prev = s->prefix[p].prev; s->prefix[p].offset = NULL_INDEX; s->prefix[p].prev = NULL_INDEX; s->prefix[p].next = *s->free_prefix_nodes; *s->free_prefix_nodes = p; //decrement feature count s->header->n_features--; return; } } else //this prefix was not the start of a chain s->prefix[ s->prefix[p].prev ].next = s->prefix[p].next; if(s->prefix[p].next != NULL_INDEX) s->prefix[ s->prefix[p].next ].prev = s->prefix[p].prev; s->prefix[p].offset = NULL_INDEX; s->prefix[p].prev = NULL_INDEX; s->prefix[p].next = *s->free_prefix_nodes; *s->free_prefix_nodes = p; s->header->n_features--; } //find the longest match of contiguous characters to *text in stored text static void find_longest_match ( SCM_STATE_STRUCT *s, char *text, long max_len, long *prefix, long *len ) { unsigned long key; long i, j, k; if(max_len < 3) { //then we don't have enough text to lookup a three character prefix *prefix = NULL_INDEX; *len = 0; return; } //get a hashcode and find the hashchain for the three characters that start // the string we're matching key = strnhash(text, 3); i = s->hash_root[key % s->n_bytes]; //find the right hashnode or set i to NULL_INDEX while(! ( i == NULL_INDEX || ( s->hashee[i].key == key && text[0] == s->hashee[i].prefix_text[0] && text[1] == s->hashee[i].prefix_text[1] && text[2] == s->hashee[i].prefix_text[2] ) ) ) i = s->hashee[i].next; if(i == NULL_INDEX) {//then there was no hashnode for these three characters *prefix = NULL_INDEX; *len = 0; return; } //else go through the prefix chain and find the longest match *len = 0; *prefix = NULL_INDEX; //loop over all the spots in the stored text where the first three characters // in *text happened contiguously for(i = s->hashee[i].first; i != NULL_INDEX; i = s->prefix[i].next) { k = (s->prefix[i].offset + 3) % s->n_bytes; for(j = 3; j < max_len && s->text[k] == text[j]; j++) if(++k == s->n_bytes) k = 0; if(j > *len) { *prefix = i; *len = j; } } } //find all substring matches of three characters or more between *t and the // stored text which do not overlap, giving preference to longest first, put // the corresponding matches indeces from the stored text in *starts, from *t // into *locals and the lengths of those matches into *lens, return the number // of matches // static int deflate(SCM_STATE_STRUCT *s, char *t, long len, long *starts, long *locals, long *lens, long max_n) { //at each place in *t remember the best match found so far, bmi[...] is a // prefix node index, bml[...] is a length, open[..] is whether or not we can // still make a match at this spot long *bmi = (long *)inbuf; long *bml = (long *)outbuf; int *open = (int *) tempbuf; long i, j, k, n; //fill arrays for(i = 0; i < len; i++) { find_longest_match(s, t + i, len - i, &bmi[i], &bml[i]); open[i] = 1; } //and make matches until we reach the maximum number or can not make anymore // nonoverlapping matches n = 0; for(;;) { //find longest potential match not yet made j = NULL_INDEX; k = 0; for(i = 0; i < len; i++) if(open[i] && bml[i] > k) { j = i; k = bml[j]; } //if we couldn't find one then we're done if(j == NULL_INDEX) break; //push this match to return it *starts++ = s->prefix[bmi[j]].offset; *locals++ = j; *lens++ = k; //increment count and bail if we're full if(++n >= max_n) break; //close all positions we just matched over k += j; for(i = j; i < k; i++) open[i] = 0; //readjust potential matches so not to overlap with the one we just made for(i = 0; i < j; i++) if(open[i] && i + bml[i] >= j) find_longest_match(s, t + i, j - i, &bmi[i], &bml[i]); } return n; } //we need to raise substring match lengths to a power quickly static int pow_table2_init = 1; static double pow_table2[256]; static double power2(long i) { double pow2 = 1.5; //who would have thought that this would give best results? //it was thought that 2.0 would be most justifiable. //1.0 would be almost identical to seeing how much we could compress a text //with LZ77, lacking only huffman coding. if(i >= 256) return pow((double)i, pow2); if(pow_table2_init) { long j; pow_table2_init = 0; for(j = 0; j < 256; j++) pow_table2[j] = pow((double)j, pow2); }; return pow_table2[i]; } //this is the number of substring matches we can make, // > one third of max length of input is useless since each match must be of at // least three characters, less just means less accurate #define MAX_N 1065 //deflate document and give points for each substring match static double score_document(SCM_STATE_STRUCT *s, char *doc, long len) { double score = 0.0; long i, n; long starts[MAX_N], locals[MAX_N], lens[MAX_N]; n = deflate(s, doc, len, starts, locals, lens, MAX_N); for(i = 0; i < n; i++) score += power2(lens[i] - 2); return score; } static void refute_document(SCM_STATE_STRUCT *s, char *doc, long len) { long starts[MAX_N], locals[MAX_N], lens[MAX_N], i, j, k, n; n = deflate(s, doc, len, starts, locals, lens, MAX_N); for(i = 0; i < n; i++) for(j = 0, k = starts[i]; j < lens[i]; j++, k++) { if(k == s->n_bytes) k = 0; if(s->indeces[k] != NULL_INDEX) { delete_prefix(s, s->indeces[k]); s->indeces[k] = NULL_INDEX; } } } //entry point for learning int crm_expr_fscm_learn ( CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen ) { char filename[MAX_PATTERN]; char htext[MAX_PATTERN]; long htext_len; SCM_STATE_STRUCT S, *s = &S; long i, j; long doc_start; if(internal_trace) do_struct_audit = 1; if(internal_trace) fprintf(stderr, "entered crm_expr_fscm_learn (learn)\n"); //parse out .fscm file name crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len); htext_len = apb->p1len; htext_len = crm_nexpandvar (htext, htext_len, MAX_PATTERN); i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; htext[j] = '\000'; strcpy (filename, &htext[i]); //map it map_file(s, filename); if(!s->header) //then we couldn't map the file and already wined about it return 0; if(do_struct_audit && audit_structs(s)) return 0; txtptr += txtstart; doc_start = *s->text_pos; if(apb->sflags & CRM_REFUTE) { refute_document(s, txtptr, txtlen); s->header->n_trains--; } else { //remember hor many documents we've eaten s->header->n_trains++; //cat it to our other text for(i = 0; i < txtlen; i++) { //delete previous prefixes here so that we get all of them if(s->indeces[*s->text_pos] != NULL_INDEX) delete_prefix(s, s->indeces[*s->text_pos]); s->indeces[*s->text_pos] = NULL_INDEX; s->text[(*s->text_pos)++] = txtptr[i]; if(*s->text_pos >= s->n_bytes) *s->text_pos -= s->n_bytes; } if(do_struct_audit && audit_structs(s)) return 0; //cache all the three character prefixes for(i = doc_start, j = txtlen; j > 2; i++, j--) { if(i >= s->n_bytes) i = 0; s->indeces[i] = add_prefix(s, i); } } if(internal_trace) fprintf(stderr, "leaving crm_expr_fscm_learn (learn)\n"); if(do_struct_audit && audit_structs(s)) return 0; unmap_file(s); crm_force_munmap_filename (s->learnfilename); return 0; } //pR is hard to figure for this guy but this one seems to do ok static double calc_pR(double p) { double m = 10.0 * fabs(p - 0.5); m = pow(m, 3.32); return p < 0.5 ? -m : m; } //entry point for classifying int crm_expr_fscm_classify ( CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen ) { SCM_STATE_STRUCT S, *s = &S; char filenames_field[MAX_PATTERN]; long filenames_field_len; char filenames[MAX_CLASSIFIERS][MAX_FILE_NAME_LEN]; char out_var[MAX_PATTERN]; long out_var_len; char params[MAX_PATTERN]; long params_len; regex_t regee; //for extracting params regmatch_t pp[2]; long i, j, k, n_classifiers; long fail_on = MAX_CLASSIFIERS; //depending on where the vbar is double scores[MAX_CLASSIFIERS], probs[MAX_CLASSIFIERS], norms[MAX_CLASSIFIERS], bn, pR[MAX_CLASSIFIERS]; long n_features[MAX_CLASSIFIERS]; long out_pos; double tot_score = 0.0, suc_prob = 0.0, suc_pR; long max_scorer, min_scorer; if(internal_trace) do_struct_audit = 1; //grab filenames field crm_get_pgm_arg (filenames_field, MAX_PATTERN, apb->p1start, apb->p1len); filenames_field_len = apb->p1len; filenames_field_len = crm_nexpandvar(filenames_field, filenames_field_len, MAX_PATTERN); //grab output variable name crm_get_pgm_arg (out_var, MAX_PATTERN, apb->p2start, apb->p2len); out_var_len = apb->p2len; out_var_len = crm_nexpandvar (out_var, out_var_len, MAX_PATTERN); //check second slashed group for parameters crm_get_pgm_arg (params, MAX_PATTERN, apb->s2start, apb->s2len); params_len = apb->s2len; params_len = crm_nexpandvar (params, params_len, MAX_PATTERN); params[params_len] = '\0'; if(crm_regcomp(®ee, "n_bytes[[:space:]]*=[[:space:]]*([0-9]+)", 40, REG_EXTENDED) ) //This should never ever happen fprintf(stderr, "regex compilation problem! I'm about to segfault!\n"); else if(!crm_regexec( ®ee, params, params_len, 2, pp, 0, NULL)) { params[ pp[1].rm_eo ] = '\0'; n_bytes = atol(params + pp[1].rm_so); }; //a tiny automata for your troubles to grab the names of our classifier files // and figure out what side of the "|" they're on for(i = 0, j = 0, k = 0; i < filenames_field_len && j < MAX_CLASSIFIERS; i++) if(filenames_field[i] == '\\') //allow escaped in case filename is wierd filenames[j][k++] = filenames_field[++i]; else if(isspace(filenames_field[i]) && k > 0) {//white space terminates filenames filenames[j][k] = '\0'; k = 0; j++; } else if(filenames_field[i] == '|') { //found the bar, terminate filename if we're in one if(k > 0) { k = 0; j++; } fail_on = j; } else if(isgraph(filenames_field[i])) //just copy char otherwise filenames[j][k++] = filenames_field[i]; if(j < MAX_CLASSIFIERS) filenames[j][k] = '\0'; if(k > 0) n_classifiers = j + 1; else n_classifiers = j; if(internal_trace) { fprintf(stderr, "fail_on = %ld\n", fail_on); for(i = 0; i < n_classifiers; i++) fprintf(stderr, "filenames[%ld] = %s\n", i, filenames[i]); }; //loop over classifiers and calc scores for(i = 0; i < n_classifiers; i++) { map_file(s, filenames[i]); if(!s->header) { //then we couldn't map this guy for some reason and already wined to // strderr n_features[i] = 0; scores[i] = 0; continue; }; n_features[i] = s->header->n_features; norms[i] = (double)s->header->n_trains; scores[i] = score_document(s, txtptr + txtstart, txtlen); if(do_struct_audit && audit_structs(s)) return 0; unmap_file(s); } if(internal_trace) for(i = 0; i < n_classifiers; i++) fprintf(stderr, "scores[%ld] = %f\n", i, scores[i]); max_scorer = 0; for(j = 1; j < n_classifiers; j++) if(scores[j] > scores[max_scorer]) max_scorer = j; min_scorer = 0; for(j = 1; j < n_classifiers; j++) if(scores[j] < scores[min_scorer]) min_scorer = j; //subtract 80% of lowest score from everybody to remove features having to //do with medium bn = scores[min_scorer] * 0.8; out_pos = 0; tot_score = 0.0; for(j = 0; j < n_classifiers; j++) probs[j] = scores[j] - bn; for(j = 0; j < n_classifiers; j++) tot_score += probs[j]; if(tot_score > 0.0) for(j = 0; j < n_classifiers; j++) probs[j] /= tot_score; else for(j = 0; j < n_classifiers; j++) probs[j] = 1.0 / n_classifiers; for(j = 0; j < fail_on; j++) suc_prob += probs[j]; suc_pR = calc_pR(suc_prob); for(j = 0; j < n_classifiers; j++) pR[j] = calc_pR(probs[j]); if(internal_trace) { fprintf(stderr, "suc_prob = %f\n", suc_prob); fprintf(stderr, "tot_score = %f\n", tot_score); for(i = 0; i < n_classifiers; i++) fprintf(stderr, "scores[%ld] = %f\n", i, scores[i]); } if(suc_prob > 0.5 ) //test for nan as well out_pos += sprintf ( outbuf + out_pos, "CLASSIFY succeeds; success probability: %f pR: %6.4f\n", suc_prob, suc_pR); else out_pos += sprintf ( outbuf + out_pos, "CLASSIFY fails; success probability: %f pR: %6.4f\n", suc_prob, suc_pR); out_pos += sprintf ( outbuf + out_pos, "Best match to file #%ld (%s) prob: %6.4f pR: %6.4f \n", max_scorer, filenames[max_scorer], probs[max_scorer], pR[max_scorer]); out_pos += sprintf ( outbuf + out_pos, "Total features in input file: %ld\n", txtlen ); for(i = 0; i < n_classifiers; i++) out_pos += sprintf ( outbuf + out_pos, "#%ld (%s): features: %ld, score: %3.2e, prob: %3.2e, " "pR: %6.2f\n", i, filenames[i], n_features[i], scores[i], probs[i], pR[i] ); if(out_var_len) crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_pos); if (suc_prob <= 0.5) { csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; } return 0; }