// crm_bit_entropy.c - Controllable Regex Mutilator, version v1.0
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may obtain a copy of the
// GPL by visiting the Free Software Foundations web site at
// www.fsf.org, and a copy is included in this distribution.
//
// Other licenses may be negotiated; contact the
// author for details.
//
// include some standard files
#include "crm114_sysincludes.h"
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
// The following mumbo-jumbo needed for BSD to compile cleanly, because
// BSD's logl function is not defined in all builds! What a crock!
#ifdef logl
#define crm_logl(x) logl((x))
#else
#define crm_logl(x) log((x))
#endif
#ifndef sqrtf
#define sqrtf(x) sqrt((x))
#endif
// End BSD crapola.
//////////////////////////////////////////////////////////////////
//
// Entropic Classification Basics
//
// This is an implementation of an entropic bitwise lattice Markov
// classifier. This is very roughly based on some ideas in the DMC
// (Dynamic Markov Compressor) data compression technique of Gordon
// Cormack et al and inspired by Andrej Bratko's use of it in spam
// filtering ( Gordon originally proposed DMC for compression and
// then Andrej used it for spam filtering. Matthew Young-Lai
// then proposed a technique for merging nodes which is limited
// to re-merging the "cloned" nodes in Gordon's work. Our merging
// method here has no such limitation.)
//
// This particular implementation is entirely separate, similar in
// tht it uses a Markov chain, but does not use cloning. Rather it
// uses cross- linking, in a minimum-entropy format, and supports
// "open-ended" Markov chains, rather than the toroidal lattices
// required in Cormack's DMC.
//
// For those of you into references as "closest prior", see:
// 1) Cormack, Gordon V. et al, Data Compression using Dynamic Markov
// Modeling (Computer Journal #6, 1987
// 2) Bratko, Andrej, Markov Modeling for Spam Filtering (?) TREC
// 2005
// 3) Matthew Young-Lai, Adding state merging to the DMC data
// compression algorithm, Information Processing Letters 70
// (1999) 223–228.
//
//
// The basic concept is to construct a Markov chain that describes
// the sequential bit pattern of a message, and then use that chain
// to compress the incoming unknown text; the smaller the entropy
// the better the quality of match. Yes, this sounds like
// a horrible idea, but Andrej Bratko at TREC 2005 has shown that
// it does work. One advantage is that it can work on files that
// have no obvious tokenizations, like JPGs.
//
// Like every other Markov chain, there's a start node, and then
// for each observed 1 or 0 there's an observed freqency of 1 or 0,
// and the associated next node for a 1 or 0. The quality of match
// is just the sums of the entropy (*) of each path taken.
//
// More correctly, entropy is the weighted sum for an ensemble of
// signals, and is -Psig * log2 (Psig), and the sum over an
// ensemble of possible texts yields the entropy which is the
// average bit rate. Whichever Markov chain shows less entropy
// is the one that matches best. However, in this case we're
// only given one example, which is the text in consideration,
// and so the probability of each transition is 1.0000 exactly
// (as we have only a single sample of the unknown). Thus, we
// actually sum the bits required; that is, log2 (Psig).
//
// Nota Bene 1: in order to make this code a little bit more
// general, we have each node accept an alphabet of N different
// symbols, where N defaults to 2 (that is, bitwise) but we'll
// write the code to be more general than that; it's a compile-time
// change rather than a hack-the-code change. States *are* always
// numbered starting at zero, though, so consider the alphabet to
// always be an ENUM.
//
// Nota Bene 2: If you happen to be familiar with the other CRM114
// classifiers, note that this classifier does NOT use the tokenizer
// regex system, nor the SBPH or the OSB feature set. Instead,
// all that state information is captured by where you are in the
// Markov chain, and the concept of "token" is replaced by the
// concept of "alphabet". By default, right now the "alphabet"
// is single bits.
//
// Nota Bene 3: As currently designed, the learning algorithm
// defaults to initialize with a toroidal perfect shuffle- thus the
// default model always takes constant memory and you can't run off
// the "edge" of the model. We still need to fix it so that
// "crosslink" will allow a jump to another more appropriate (lower
// entropy) part of the braid; that's not there yet.
//
// Nota Bene 4: As the toroidal shuffle didn't give extremely good
// results, there's also a "unique state" classifier, set by the
// flag < unique >. This generates a *loopless* chain where each
// text learned produces a single linear chain; this means we
// follow whatever model has already had the same bit pattern, but
// whenever we diverge, we start allocating new bit nodes
// immediately. This works decently but uses up a lot of nodespace
// quickly. It also means that you can run off the "edge" of the
// model because it's not closed like the toroidal shuffle. By
// default, running off the edge causes a restart at the START node
// (node 1, as it turns out; node 0 is a
// bookkeeping/anchor/sentinel node). This works, but doesn't give
// very high accuracy as the reSTARTs are not time-aligned with the
// model; they might even occur in the middle of a byte.
//
// Nota Bene 5: In the quest for better accuracy, instead of doing
// a reSTART when we're classifying and fall off the edge of a
// 'unique' model, we branch to a node that has a very similar
// prior history. How this is done is seen below but it boils down
// to finding a node that already exists that has had a very
// similar prior history bit pattern. This can actually be done in
// near-constant (and pretty small!) time; the magic to do this is
// all described below.
//
// Nota Bene 6: In the quest for smaller (and faster!) models and
// the ability to not run out of nodes, we have the <crosslink>
// flag, which allows the LEARNing process to do the same branching
// that is done when we run off the end of the model chain during
// classification. More specifically, before allocating a new
// node, we look for the closest prior history in already-allocated
// nodes, and if it's "close enough" (as set by a threshold), we
// make that the next node in our chain. This produces Markov
// models with both acyclic joins and repeating loops, and
// effectively prevents the model from growing without bound, as
// the model reuses nodes quite effectively (sometimes more than a
// 300:1 reuse!). It also produces a model that is quite accurate
// when trained with SSTTT. Although we can argue that ROCAC isn't
// the best possible figure of merit, SSTTT can yield very good
// ROCAC values (0.03 per cent); the terminal error rate on the
// TREC06 public corpus can be as low as 3 errors per 10,000 (that
// is, three errors in the final ten thousand messages, of the
// 92Kmsgs corpus.)
//
///////////////////////////////////////////////////////////////////
//
// Algorithm for Classification:
//
// Follow the Markov chain from START with total entropy of 0 to
// begin, then walk the Markov chain taking each branch as
// instructed by the successive bits of incoming text. Along the
// path, accumulate the compressed size as defined by - log2 (Pt)
// for the current transition. Here, we define probability for a
// transition Pt = Nt + c / (Nt + N~t + 2c). Note that for c = 0
// this can yield probabilities == 0.00 exactly, which will yield
// an infinite entropy (log of zero is... not a good thing).
// Gordon Cormack's paper indicates a "small" value of c is good
// for shorter texts; unfortunately there's no mention of what
// "small" means.
//
// If you run off the end of the chain (some learning algorithms
// generate terminating chains, others always generate loops) then
// do one of : (1) stop accumulating entropy; you can't go any
// further; (2) go back to the START node and keep accumulating, or
// (3) jump to the node with the closest prior history and keep
// accumulating.
//
// Do this on each bitwise markov file; the file which accumulates
// the lowest compressed size is the best matching file.
//
///////////////////////////////////////////////////////////////////
//
// Algorithm for Learning - Overview
//
// Before you learn in the first text, initialize the data
// structure with a START node with NULL exit nodes for each of the
// possible exits. If you want, you can put something bigger or
// more complex in, but it's not necessary. If you're running with
// constant storage, allocate the rest of the storage, preformat
// them as a linked list of free nodes, and off you go.
//
// Follow the chain, incrementing "seen" counts as you go. If
// you're learning a file, and you end up at a node exit that's
// a NULL (meaning no node is yet linked in for the current bit
// pattern) then grab a node off the free list (or malloc one).
// See below under "advanced methods" for how to get more
// nodes by merging very similar nodes together.
//
// Whenever a node is seen with both a sufficiently large number of
// seens, and a sufficiently balanced number of differing symbols,
// "clone" the node so that the Markov chain branches on the
// previous node. This is to split the chain where the current
// state is insufficient to model the underlying process. This
// only can work if the Markov model somehow contains a merge;
// otherwise each node can only be reached from a single other node
// (and so on, all the way back to the START node).
//
// In Cormack's version, when you run out of free nodes, you must
// reinitialize the storage and rebuild the model (using only the
// last part of the corpus data), but there are ways to merge
// similar nodes (see "advanced methods" below) to get more free
// nodes without throwing away data.
//
// Note - there is a "classical" method for building a hidden
// Markov model, but it's O(n^2) in the number of nodes in the
// output model! It basically uses hillclimbing techniques to form
// the convex hull and then climbing to the local top of the hull
// repeatedly. That's fine for small HMMs but here, when we may
// have a million nodes, that's way too slow.
//
// Method 1 - for both initial-loop or terminated Markov chains
//
// Follow the Markov chain from START, incrementing the seen-0
// and seen-1 bits of the Markov chain. When you hit a 0 or 1
// choice pointing to NULL, grab an unused node off of the free
// list. If the free list is empty, use an advanced method to
// get more.
//
////////////////////////////////////////////////////////////////////
//
// Advanced Method - Cloning a Node
//
// If a node has more than one infeeder, there is always the
// possibility that the node really should be two nodes. This
// could happen either because of a prior merge, a prior clone, or
// because the initial node setup had a loop in it (which it may or
// may not; it can work both ways.)
//
// A node should be cloned when there is a sufficient number of
// incoming events to be statistically significant, and the
// outgoing events are sufficiently balanced (i.e. the current node
// is of sufficiently low predictive value - that is, it's high
// entropy no matter what you do ) to lend credence to the idea
// that this one node really represents two states and splitting
// the node is a good idea.
//
// Of course, there's no reason (or way!) to clone a node that has
// only one infeeder even if the node state itself has low
// predictive power; in this case what's really happening is that
// the real corpus' hidden Markov model is diverging at a prior
// node. There simply isn't any predictive state available.
//
// Topic for further work - given only a single infeeder, maybe we
// should back propagate the "split" up the chain of single priors
// until a state is found with at least two infeeders, and clone
// *that* node and all successor nodes down to the current node.
// Note that if we hit the START node we don't do this as there's
// no way to clone START.
//
// The simple and obvious way to clone a node of low predictive
// power is to keep track of the previous node while doing a
// training traverse, and only clone off the path from the previous
// node. This means creating a new node with the same successor
// nodes as the current node (or not; it depends on whether the
// current node has a successor or is "end of the line"), and has
// only a single infeeder, which we know is at best a single line
// of prior states of arbitrary length, and at worst a series of
// nodes that all looked similar at some time in the past.
//
//////////////////////////////////////////////////////////////////
//
// Advanced methods - getting free nodes by merging nodes
//
// NOTA BENE: the code to implement merging is not debugged because the
// open-graph training with crosslinking works so well. Consider
// this stuff more as "design notes for the future".
//
//
// merging nodes - in order to represent loops in the markov model,
// it's sometimes necessary to loop back into prior Markov states.
// Finding the appropriate prior state can be a hassle. Cormack and
// Lai both postulate a "way" to find the appropriate prior state,
// but don't describe an efficient way to do so.
//
// Here's a shortcut, using arithmetic to get an infinite impulse
// response out of a floating point number; the already established
// node with the closest floating point number to the current
// generated floating point number is the "closest node". This is
// a different version of "arithmetical coding", but it can be
// lossy (in a controlled way). See the Guazzo coding method in
// Cormack's paper - note that we do NOT use Guazzo coding for
// generating output but rather just for quickly finding the node
// whth the closest prior state for us to jump to when we run
// off the edge of a model.
//
// Clearly, one could weight the prior bit by a factor of 1/2
// (really 1/(2^1)), and weight the prior float by 1 - (1/2) in
// which case the prior float's mantissa is just the last N bits,
// in reverse order. Unfortunately, this also gaurantees that
// prior history utterly disappears as soon as it rolls out of the
// number of bits in the matissa of the local floating point
// operation.
//
// More interesting is larger values of the exponent; for example a
// value of 2 yields a half-bit fractional representation, so an
// error of 1 bit seen N steps in the past is overrideable by two
// correct bits at N+1 and N+2 bits in the past. For example, the
// with an exponent of 2, the most recent bit has significance
// 1/4 = 0.250, while the second-most-recent is 3/4 * 1/4 = 3/16 =
// .1875, and the bit before that is 3/4 * 3/16 = 9/64 = .04687,
// and so on.
//
// Higher exponents just stretch out this series even further.
// There is no need to assume only an integer exponent. Indeed,
// it's only because of the finite precision of the computer that
// this a finite impulse response (FIR) filter; infinite precision
// would give infinite impulse response and "total recall" of prior
// states in the Markov process - but also the ability for some
// long-ago series of perfectly correct steps to blow away the more
// recent entropy. For this reason, we usually use an exponent of
// 1, which yields a prior bit weight of 1/2 (whch is just the
// floating-point mantissa as described above). (NB: as of 20070101,
// the "floating point" has been replaced by a "long long" with 64 bits
// of precision. This uses less space)
//
// Values of the exponent less than one yield a situation where the
// range of possible outcome FIR numbers becomes discontinuous;
// that is, significant chunks of the range become unreachable by
// any possible prior bit series. Consider the FIR value 0.5000
// with exponent of 0.5. The most recent bit then has weighting of
// 0.70711, with only .29289 of weight available for all following
// bits. The maximum value achieveable for a pattern 0111111.. is
// then .29289, while the minimum value achieveable for pattern
// 100000... is 0.70711, so the range of possible FIR numbers does
// not contain any value between .29289 and .70711. Recursively,
// similar subranges cease to be accessible as well.
//
// The exponent for "exact ambivalence" between an error in state N
// versus two errors in state N-1 and N-2 is around 1.385; the
// exponent for ambivalence between an error in state N versus
// errors in states N-1, N-2, and N-3 is about 1.13 . Note that both of
// these numbers yield rather short "memories" in the FIR filter
// before the Nth prior bit underflows and is lost completely.
//
// By using the FIR coding of prior state, we need only keep a
// table of the FIR path floating point number generated on the
// path that first generated this node (which can be sorted,
// yielding a _very_ fast node merging capability; just make one
// pass through the sorted table looking for the two closest
// floating point numbers.) Those close pairings of nodes are good
// merge candidates.
//
// It's probably worth checking these merge candidates to see if
// they have similar local probabilities - that is, if they have similar
// 0-counts and 1-counts. If one node goes 0 mostly and the other
// goes 1 mostly, then there's a good chance that this isn't a good
// merge. How to weight FIR prior history match versus the local
// outbound probability is a good question.
//
// Another merge candidate is to look at nodes which have identical
// 0 or 1 destinations. However, finding this requires either an
// extensible list of back pointers or brute-force search (albeit
// over a very small subset of the entire Marov Chain).
//
// If we're running in <unique> mode (that is, nodes are allocated
// on each unique path, and no node has ancestors with any
// possibility of significantly deviating FIR values, We can also
// back-calculate the approximate "feeder" node FIR values that
// could lead to a particular node, given the decreasing exponent.
// For example, if the current node's FIR is .3700000, then it can
// only have been reached by a 0-branch from a node whose prior was
// (2 * .3700000 = .7400000 ) plus or minus any crosslink-error
// threshold (which is typically under 1 in a million or less).
// So, we can just use our FIR lookaside table to find all possible
// ancestors to this node.
//
// When merging two nodes, we (speaking in the 0/1 tense; larger
// alphabets are an obvious extension) we perform the following steps:
//
// * update the to-be-kept node with the sums of the outbound 0-seen
// and 1-seen counts. Without any loss, we can choose the
// to-be-kept node is the node with the lower index number.
// * we keep whichever output chain had the highest counts of each
// of the 0-seen and 1-seen categories
// * we set the FIR prior history to the weighted average of the
// two prior weighted averages
// * for the two output nodes which were _not_ kept (a 0-chain and
// a 1-chain) mark these chains for eligibility for garbage
// if we choose active collection;
// * put the merged-away node on the free list.
// * make an update pass thorugh the nodeset; any node outlet pointing to
// the merged-away node is reset to point to the to-be-kept node.
//
// We may choose to merge more than one node when we encounter
// an empty free list. Each merge gives us one node for sure, plus
// the possibility of many others.
//
// At some point, a garbage collection is necessary to reclaim nodes
// that no longer have a prior node. We can do this either by:
// * reference counts - each node can keep an accurate reference count
// of nodes that can transfer to this node. Since there are only
// three events that can cause the reference count to change (those
// being "allocate from free list == refcount = 1", "clone node
// refcount = split incomings, and "merge with another node
// == refcount1 + refcount2" then the refcounts can be maintained at
// relatively low cost.
// * we can run a "classic" mark and sweep. However, there's a
// better way...
// * we could add back pointers in each node. Unfortunately, that
// means the size of a node could grow rather large. There is
// still a better way....
// * We could add chaining pointers on the _output_ of each seen
// slot. The "to" node gets a single slot added, to point to
// the first node that can transfer in; each such node points
// to the next node that can also transfer to the target node.
// Preferably, the insertions are done in order so the list
// remains sorted.
//
// When learning a new text, we also compute the running FIR prior
// floating point value, and on each node we encounter in our path,
// we update the local prior state FIR value according to the
// weighted average of the new text (with weight 1) and the prior
// texts (with weight = total times this node has been visited in
// the past = sum of the 0-seen and 1-seen output values ). For
// this reason, node FIR prior encodings may change so it's necessary
// for the sorted FIR vector to contain not just the FIR values, but
// also backpointers to the FIR nodes that held them.
//
// Improvement: rather than a separate sorted list of FIR values,
// embed the FIR value in the cell, along with the "next less"
// and "next greater" indices. Thus, each cell contains within it
// pointers to it's most likely merge candidates.
//
// Secondary improvement - when allocating a new node from the
// free list, how does one efficiently link the new node into the
// next-less and next-greater lists? Sequential search clearly
// works, but is inefficient. Clearly a binary (or near-binary)
// tree would assist this but it's a programming hassle to
// maintain the two data structures in synchrony.
//
// Alternatively, a fixed-size table of nodes at or close to
// various "thresholds" can be maintained; when a node changes
// it's FIR value, it only takes constant time to round the FIR to
// a table entry value and check to see if the new value is closer
// to the "perfect" value of FIR for that table entry versus the
// node that is currently in that slot. If it's better (not just
// as good, "better", to minimize thrashing) then the new node
// index is written into the table; otherwise the old node stays.
//
// Note that nothing prevents a node from being in the table more
// than once, nor is there any correctness "need" for the table to
// be exactly right. Instead, the lookaside table is just a
// speedup that helps a lot most of the time. When a node changes
// value, it will move (some) so incorrectness is part of the
// game. If you really want, you can move the node to a new
// lookaside table slot, and then look at the previous next-less
// and next-greater nodes, and choose one of them to go into
// the table, replacing the (now moved) node. (NB: in implementation,
// it turns out that the FIR lookaside table is *critical* for
// reasonable performance; any bug in that code that causes
// weak inital guesses will slow the classifier down by a factor
// of 10 or more.
//
////////////////////////////////////////////////////////////////////////
//
// Futher Hacks and Improvements: resuming a CLASSIFY past NULL.
//
// If the classifier runs off the end of the known Markov chains,
// what do we do? Just drop the following text? Possibly - but
// there is an alternative.
//
// We can use the FIR arithmetical coding to resume the Markov
// chain at a near-optimal position. The difference between where
// we are and where we resume should be as small as possible;
// fortunately the FIR-next-least and FIR-next-greater indices in the
// current node give the two nearest side-steps at the current Markov
// node; we can then explore up and down the FIR chain to find a
// preexisting node that actually has a further non-NULL chain.
// The entropy increase due to this side-step outside the Markov
// chain is not entirely computable from the FIR values, because
// the prior information that's already rolled off the least significant
// bit of the FIR calculation.
//
// (note: this is currently the default action)
////////////////////////////////////////////////////////////////////
//
// IMPLEMENTATION AND STORAGE DETAILS
//
// Data storage format for bitwise Markov classifiers. Note that
// we always use indices rather than pointers as we want the data
// structure to be stored and reloaded from disk via mmap.
//
// We have several "global static" values, which are stored in
// the meta-start node which is always node 0:
//
// * the index of the START node (should always be 1) - stored
// in alphabet-zero slot.
//
// * the index of the node with the lowest FIR value - node 0 is
// * always "lower than the lowest"
//
// * the index of the node with the highest FIR value - node 0 is
// * also always the node "higher than the highest".
//
// If we have a FIR lookaside speedup table, that also needs to
// have space allocated.
//
// Each normal node contains a header containing:
//
// * the floating-point FIR value that codes the prior states
//
// * the index to the node with the next larger FIR value
//
// * the index to the node with the next smallest FIR value
//
// These FIR prior values are updated whenever a node is
// visited during learning. This may mean minor motion up and
// down the FIR chain.
//
// Each node also contains BITMARKOV_ALPHABET_SIZE slots; each slot
// contains the following (so there's one such slot for each
// member of the alphabet A). There are two kinds of data stored in
// each slot; they are related but not interdependent. The forward
// markov slot data is used during classification; the backward slots
// are used only during maintenance.
//
// Forward Markov slots:
//
// * how many times this alphabet member was seen as the input at
// this node.
//
// * when it's seen, what the index of the next node to transition to
// is (index, not pointer, for position-independent code)
//
// Backward slots for maintenance (clone and merge)
//
// * index of the first incoming node that can transfer to this
// node when that prior node saw this member of the alphabet A
// (note that this is NOT the alphabet member that the current
// node is seeing. This just gives an array of
// backlinks for each member of the alphabet). NULL if there's
// no known encounters of an incoming with the most recent prior
// for this alphabet member.
//
// * index iof the _next_ incoming node that can transfer to the
// same outgoing node as this node. In combination with the
// prior index above, this forms a linked list of all of the
// nodes that can reach a particular node. As we don't need
// to traverse this list in any particular order, it's strictly
// a "stack" (newest added at the front).
//
//
// Using 64-bit storage for the floating-point FIR values and
// for the visit counts, and 32-bit storage for all indices, then
// assuming we're using a binary alphabet, we have:
//
// Per-cell
// FIR prior float = 8 bytes
// FIR up and down links - 2 * 4 bytes = 8 bytes
// Slot within cell
// forward count - 8 bytes * 2 = 16 bytes
// forward index - 4 bytes * 2 = 8 bytes
// backward first - 4 bytes * 2 = 8 bytes
// backward next - 4 bytes * 2 = 8 bytes
//
// Total - 56 bytes storage per nodem, 18K nodes per megabyte.
// Note that the absolute minimum size is 24 bytes/node with no
// advanced features, so we're about a factor of two worse in
// space, and for that we get near-linear in size for both learning
// and classification.
//
// If we add in a 0.1% overhead size for the FIR prior lookaside
// accelleration table (containing 1 index per 4-byte word) then
// for every 4 bytes (1 index) of FIR prior lookaside there are
// 4Kbytes (about 73) FIR value cells, so with the FIR prior
// lookaside and an assumption of roughly uniform FIR prior values,
// we need to traverse an average of 1/4 of 73 or about 18 markov
// chain cells. With a 1.0% overhead (1 byte per 100 bytes, or 1
// index slot per 400 bytes) then there is 1 index slot per 7
// markov chain cells, and (again assuming a uniform distribution
// of FIR priors) an average of just 1.8 markov chain cells visited
// per FIR lookup.
//
// If the distribution of FIR values is too nonuniform, an adaptive
// grid method may be used to smooth it out. But we'll climb that
// mountain when we come to it.
//
// Note to self: don't use the mathlib LOG function!
// It's far too slow. We'll custom craft our own that uses
// linear interpolation over a lookup table.
//
///////////////////////////////////////////////////////////////////
//
// An interesting information-theoretic philosophical point - if
// we only have a million nodes in the database, then at most we
// encode 20 bits worth of state. A 500-megabyte file like Andrej
// uses would have at most 62 million nodes (12 bytes/node not
// with short counts), or 41 million nodes = 27 bits to define
// state. Since the FIR-prior value encodes at least 64 bits, can
// go to 96 bits (GCC 4.0 "long double") and possibly 128 (if your
// compiler supports "-mlong-double-128") shouldn't we be sticking
// with the FIR-prior and simply *ignoring* the actual links, as
// they (and the state nodes) encode far less information?
//
// Submitted as an "ultimate" pare-down - the memory contains
// ONLY the times-previously-seen information. It's a big array,
// indexed by the most significant N bits of the prior, with the
// high-order bit being the transition now being executed. All
// that's there in the array is the number of times this transition
// has been executed in LEARN mode; everything else is embedded
// in the state (which _is_ the index).
//
// That optimization would concieveably allow one to write an
// embedded form of bit-entropic classification in as little as
// two to four megabytes of space per class; in short, something
// that would fit on an embedded system.
//
//////////////////////////////////////////////////////////////////
//
// File Structure and Layout for Bit Entropy
//
// The file starts with a header block, of size 4 kybtes; the first
// four longwords give the indexes (in longwords, for 32-bit alignment)
// of the start and length of the FIR lookaside table and the node list.
//
/////////////////////////////////////////////////////////////////
//
// **** NOTE - THIS IS ALL UNNECESSARY AND DEPRECATED IN THE CURRENT CODE!
//
// Per-node memory layout
//
// We have three pointers in the node slot; each of these is replicated
// for each member of the alphabet in the Markov graph.
// * The .nextcell says what the next cell is when we see this alph
// * The .backfirst cell says what the first cell is that could
// get us here when it saw an alph
// * The .samenext cell says what the next cell that also gets you
// you to the same nextcell when you see the same alph.
//
// Yes, this is slightly redundant, but it's very fast. (but note the
// approximation above that would save us all this memory!)
//
// A diagram here helps to understand the slotting structure
//
// node
// .nextcell >------------------->--> node
// .backfirst <---\ />------->/ .nextcell >---------->
// .samenext -| \---------------------< .backfirst
// | /
// node <-| /
// .nextcell >---/
//
//
// Note- "backfirst" and "samenext" are redundant given the FIRlat
// system; we can always find referrers to the current node by
// looking at the possible origins - a node of FIR value 0.260 can
// only come from the alpha link of a node of FIR value 0.520 ; a
// node of value 0.779 can only have come from the 1-link of a FIR
// of .558 so we can just search the FIR around there.
//
//
//
// Helper function to calculate entropy from a probability
//
// We speed this up with a lookup table for relatively small numbers
#define ENT_CACHE_MAXCOUNT 256
#define ENT_CACHE_MAXTOTAL 256
static double stats_2_entropy (long count, long total)
{
static long init = 0;
static double loglookups[ENT_CACHE_MAXCOUNT][ENT_CACHE_MAXTOTAL];
double value;
// if no prior information, this is 1 bit exactly of data.
if (total < 0.5)
return (1.00);
if (count > total)
return (0.0);
// value = ( - (crm_logl (
// (count + BIT_ENTROPIC_PROBABILITY_NERF)
// / (total + BIT_ENTROPIC_PROBABILITY_NERF)))
// / 0.69314718 );
// return (value);
// Do we need to initialize the cache to "no data", which we
// code for as a -1 as the entropy.
if (init == 0)
{
long i, j;
init = 1;
for (i = 0; i < ENT_CACHE_MAXCOUNT; i++)
for (j = 0; j < ENT_CACHE_MAXTOTAL; j++)
loglookups[i][j] = -1;
}
// See if we've already cached this log probability value?
if ( count < ENT_CACHE_MAXCOUNT && total < ENT_CACHE_MAXTOTAL)
{
value = loglookups[count][total];
if (value > 0.0)
return value;
}
// Nope. Nothing in the cache. Calculate it and cache it.
//
// "correct" entropy is factored by the unknown prior:
// value = ( - ( ( count + BIT_ENTROPIC_PROBABILITY_NERF)
// / ( total + BIT_ENTROPIC_PROBABILITY_NERF))
// * (crm_logl (
// (count + BIT_ENTROPIC_PROBABILITY_NERF)
// / (total + BIT_ENTROPIC_PROBABILITY_NERF)))
// / 0.69314718 );
// But here, we know the event has come to pass and so the
// prior is 1.0 (the event itself is a certainty at this point;
// we are now just counting bits needed to encode it ! ).
value = ( - crm_logl (
(count + BIT_ENTROPIC_PROBABILITY_NERF)
/ (total + BIT_ENTROPIC_PROBABILITY_NERF)))
/ 0.69314718 ;
// if it fits, put it in the cache
if ( count < ENT_CACHE_MAXCOUNT && total < ENT_CACHE_MAXTOTAL)
loglookups[count][total] = value;
// and we're done.
return (value);
}
// Helper functions for dealing with the FIRlat
// Given a FIR value, what slot would this reside in?
static long fir_2_slot (double fir, long firlatlen)
{
long ifir;
ifir = (fir * (firlatlen - 1.00));
if (ifir < 0) ifir = 0;
if (ifir >= firlatlen) ifir = firlatlen - 1;
return (ifir);
}
static double slot_2_fir (long slot, long firlatlen)
{
double outval;
outval = (slot + 0.5) / (firlatlen - 1);
if (outval > 1.00) outval = 1.00;
if (outval < 0.0) outval = 0.0;
return (outval);
}
// Dump just the significant entries in the FIRlat
static void firlat_significant (ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long *firlat, long firlatlen)
{
long i;
fprintf (stderr, "**** FIRLAT significant scan *****\n");
fprintf (stderr, "root node low %ld high %ld\n",
nodes[0].fir_smaller, nodes[0].fir_larger);
for (i = 0; i < firlatlen; i++)
{
if (firlat[i] > 0)
{
fprintf (stderr,
"FIRLAT slot %ld pval %f node %ld fv %f down %ld up %ld\n",
i, slot_2_fir(i, firlatlen),
firlat[i], nodes[firlat[i]].fir_prior,
nodes[firlat[i]].fir_smaller,
nodes[firlat[i]].fir_larger);
};
};
}
static void firlat_sanity_scan (long *firlat, long firlatlen,
ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen)
{
long i;
long stepcounter;
// Scan the FIRLat for errors (optional)
//
for (i = 0; i < firlatlen; i++)
{
if (firlat[i] > nodeslen || firlat[i] < 0 )
fprintf (stderr,
"Internal FIRLAT error: slot %ld claims OOB node %ld\n",
i, firlat[i]);
};
for (i = 0; i < nodeslen; i++)
{
if ( nodes[i].fir_smaller < -1
|| nodes[i].fir_larger < -1
|| nodes[i].fir_smaller > nodeslen
|| nodes[i].fir_larger > nodeslen )
fprintf (stderr,
"Internal FIRchain error at node %ld (%f) smaller: %ld larger %ld\n",
i, nodes[i].fir_prior,
nodes[i].fir_smaller, nodes[i].fir_larger);
};
//
// Now verify the FIRLAT chain integrity upward
//
stepcounter = 0;
i = nodes[0].fir_larger;
while (stepcounter < nodeslen + 1
&& nodes[i].fir_larger > 0
&& nodes[i].fir_larger < nodeslen)
{
stepcounter++;
i = nodes[i].fir_larger;
};
if (stepcounter > nodeslen+1)
fprintf (stderr, "ERROR: the FIR chain is figure-6ed upward\n");
if (nodes[i].fir_larger != 0)
fprintf (stderr, "ERROR: the FIR chain goes off to node %ld\n",
nodes[i].fir_larger);
// and again, check chain integrity downward
stepcounter = 0;
i = nodes[0].fir_smaller;
while (stepcounter < nodeslen + 1
&& nodes[i].fir_smaller > 0
&& nodes[i].fir_smaller < nodeslen)
{
stepcounter++;
i = nodes[i].fir_smaller;
};
if (stepcounter > nodeslen+1)
fprintf (stderr, "ERROR: the FIR chain is figure-6ed downward\n");
if (nodes[i].fir_smaller != 0)
fprintf (stderr, "ERROR: the FIR chain goes off to node %ld\n",
nodes[i].fir_smaller);
}
//
// Helper function to find the smallest node that's larger than
// the called FIR.
//
// Originally both this routine and find_closest returned exact
// answers- but a bug caused the hash-table to find exact entry
// points to overload buckets representing fairly common similar
// states, and so we'd end up with exponentially increasing runtime
// during the sequential search part of the run (measured at over
// 2000 steps in sequential search average, when the average number
// of steps should have been 5 (five).
//
// The intermediate fix is to only call this (exact) function when
// you need to move a node - that is, during training. Otherwise,
// "close enough is close enough", and use the "close" call down
// below. But the _real_ bug has now been fixed, and the "close"
// routine is now just an efficiency firewall. It never should
// have been; but hey... it's here.
//
static long firlat_find_smallest_larger
( ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen,
long *firlat,
long firlatlen,
double my_fir)
{
long hit_node;
long firlat_entry;
long step_count;
// Start out with FIRlat option.
if (my_fir < 0.00 || my_fir >= 1.00 || firlatlen < 10 || firlatlen > 200000)
fprintf (stderr, "My FIR is outrageous (value: %e, firlatlen %ld)\n",
my_fir,
firlatlen);
firlat_entry = fir_2_slot ( my_fir, firlatlen);
if (internal_trace)
fprintf (stderr, "FFSL: Searching for FIR: %f slot %ld \n",
my_fir, firlat_entry);
if ((firlat_entry < 0) || (firlat_entry >= firlatlen))
fprintf (stderr,
"FIRLAT is very hosed (myfir: %e, entry number %ld)!\n",
my_fir,
firlat_entry);
if (internal_trace)
fprintf
(stderr,
"FFSL search: FIR: %f, slot: %ld node: %ld, smaller: %ld, larger:%ld\n",
(float) my_fir, firlat_entry, firlat[firlat_entry],
nodes[firlat[firlat_entry]].fir_smaller,
nodes[firlat[firlat_entry]].fir_larger );
if (firlat[firlat_entry] > nodeslen
|| nodes[firlat[firlat_entry]].fir_smaller < 0
|| nodes[firlat[firlat_entry]].fir_larger < 0 )
fprintf (stderr, "Internal FIR chain error at slot %ld node %ld (%f)\n",
firlat_entry, firlat[firlat_entry], my_fir);
// Move down in the firlat till we find a live node.
while ( firlat_entry > 0 && firlat[firlat_entry] <= 0)
{
// fprintf (stderr,
// "firlat_entry: %ld points to node %ld so stepdown \n",
// firlat_entry, firlat[firlat_entry]);
firlat_entry--;
if (internal_trace)
fprintf (stderr, "_");
}
// Do a little metering to see if we're getting firlat entries?
//if (firlat_entry == initial_firlat_entry)
// {
// fprintf (stderr, "`");
// }
//else
// {
// fprintf (stderr, " %ld", firlat_entry - initial_firlat_entry);
// };
// Stupidity check - are we running down off the end of the FIRLAT?
if (firlat_entry == 0)
{
// Yes. Resort to sequential search
hit_node = nodes[0].fir_larger;
if (internal_trace)
fprintf (stderr,
"FFSL: Need to start from rootptr- next node is %ld ( %f) for fir %f \n",
hit_node, nodes[hit_node].fir_prior, my_fir);
}
else
{
// This non-null entry is where we start to look.
hit_node = firlat[firlat_entry];
if (internal_trace)
fprintf (stderr,
"FFSL: non-null ENTRY at slot %ld node: %ld prior: %f down %ld up %ld\n",
firlat_entry,
hit_node, nodes[hit_node].fir_prior,
nodes[hit_node].fir_smaller, nodes[hit_node].fir_larger);
};
// Now we're on the closest "less than" firlat entry (which, if we
// had a firlat hit, may still be above our desired place.
// Part 1 - find a place where we are clearly smaller than
// the node above us.
step_count = 0;
while ( hit_node > 0 && my_fir > nodes[hit_node].fir_prior )
{
hit_node = nodes[hit_node].fir_larger;
step_count++;
if (internal_trace)
fprintf (stderr, "+");
};
// Part 2 - assure that (because of FIRlat issues) we are not "above"
// yet other nodes that we are smaller than.
while ( hit_node > 0 && my_fir < nodes[hit_node].fir_prior )
{
hit_node = nodes[hit_node].fir_smaller;
step_count++;
if (internal_trace)
fprintf (stderr, "-");
};
// and again, try going up one last time.
while ( hit_node > 0 && my_fir > nodes[hit_node].fir_prior )
{
hit_node = nodes[hit_node].fir_larger;
step_count++;
if (internal_trace)
fprintf (stderr, "*");
};
// hit_node is now the smallest node that's larger than us
if (internal_trace)
fprintf (stderr, "FFSL result node: %ld with FIR %f \n",
hit_node,
nodes[hit_node].fir_prior);
if (internal_trace)
fprintf (stderr, "Steps used: %ld\n", step_count);
return (hit_node);
}
static long firlat_find_closest_node
( ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen,
long *firlat,
long firlatlen,
double currentfir)
{
long larger;
long smaller;
long closest;
double error_larger, error_smaller;
#define COMPLEX_FIND_CLOSEST
#ifdef COMPLEX_FIND_CLOSEST
// Complex find closest code. The idea here is to do
// increasing radius searches to find the closest nearby
// strings in Hamming distance. To generate this, we first
// search at +/- one LSB in the floating point mantissa, and
// then successively double it, which sweeps through the
// single-bit errors. We can then repeat for double-bit
// errors, triple-bit errors, and so on... which leads to a
// huge number of possible next states. For IEEE floating
// point, the limiting case is an underflowing double-bit
// error representation. Of course, the value of this varies
// with the local FIR value, but for a FIR value of 1.0 the
// double that touches only the lowest-order mantissa of a
// double float is hex 0x3bf0000000000000 (approximately
// 5.421011E-20). For a FIR of 0.50000, the value is 1/2
// that, and so on. However, this is really not
// computationally tractable right now.
#endif
// return the index of the node that's closest in
// FIRLAT value to our current node.
// assume larger is closer;
larger = firlat_find_smallest_larger
(nodes, nodeslen, firlat, firlatlen, currentfir);
error_larger = nodes[larger].fir_prior - currentfir;
closest = larger;
// now make sure that smaller isn't better.
smaller = nodes[larger].fir_smaller;
error_smaller = currentfir - nodes[smaller].fir_prior;
if (error_smaller < error_larger)
closest = smaller;
return (closest);
}
// How we remove a node from the FIR chain and firlat
static void firlat_remove_node ( ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen,
long *firlat,
long firlatlen,
long curnode)
{
// cut it out of the chain first
long smaller_node, larger_node;
long firlat_entry;
double perfect_fir;
// set up larger and smaller nodes
smaller_node = nodes[curnode].fir_smaller;
larger_node = nodes[curnode].fir_larger;
{
if (internal_trace)
{
fprintf (stderr, "About to FIR chop node %ld (%ld %ld)\n",
curnode, smaller_node, larger_node);
}
if (smaller_node < 0 && larger_node < 0)
{
// what do we do if the node ain't in the FIR chain - nuthin!
// we might be initiated on a shuffle or somesuch.
// fprintf (stderr, " BLARG!!!! removing a non-FIRchained node!\n");
}
else
{
// cut the node out of the FIR chain.
nodes[smaller_node].fir_larger = larger_node;
nodes[larger_node].fir_smaller = smaller_node;
}
}
// Now we're definitely out of the chain. Mark us so....
nodes[curnode].fir_smaller = -1;
nodes[curnode].fir_larger = -1;
if (internal_trace)
fprintf
( stderr,
"FIRchain delete %ld (%lf) between %ld (%lf) and %ld (%lf)\n",
curnode, nodes[curnode].fir_prior,
smaller_node, nodes[curnode].fir_prior,
larger_node, nodes[curnode].fir_prior);
// Since we're not in the chain (and unfindable that way, we can
// now also be removed from the FIRLAT lookaside table.
// this is where we think we are - which might not be where we are.
if (nodes[curnode].firlat_slot > -1)
{
firlat_entry = nodes[curnode].firlat_slot;
if ( firlat[firlat_entry] == curnode)
firlat[firlat_entry] = 0;
}
// This is the slot where we once were. Fix it up.
//
firlat_entry = fir_2_slot ( nodes[curnode].fir_prior, firlatlen);
if (firlat[firlat_entry] == curnode)
{
long correct_node;
perfect_fir = slot_2_fir ( firlat_entry, firlatlen);
correct_node = firlat_find_closest_node
( nodes, nodeslen, firlat, firlatlen, perfect_fir);
if (fir_2_slot(nodes[correct_node].fir_prior, firlatlen) == firlat_entry)
{
firlat[firlat_entry] = correct_node;
if (internal_trace)
fprintf (stderr, "FIRtable insert node %ld at slot %ld\n",
correct_node, firlat_entry);
};
};
if (internal_trace)
firlat_significant (nodes, firlat, firlatlen);
}
// insert a node into the FIR chain and firlat
static void firlat_insert_node
( ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen,
long *firlat,
long firlatlen,
long curnode)
{
long smaller_node, larger_node;
long current_occupant, firlat_slot;
double perfect_fir_value, old_fir_error, new_fir_error;
if ( nodes[curnode].fir_smaller > 0
|| nodes[curnode].fir_larger > 0)
fprintf (stderr, "BLARG !!! We're reinserting node %ld twice!\n",
curnode);
if (internal_trace)
firlat_significant (nodes, firlat, firlatlen);
// Get the next larger node
larger_node = firlat_find_smallest_larger
( nodes, nodeslen, firlat, firlatlen, nodes[curnode].fir_prior);
// and the smaller node
smaller_node = nodes[larger_node].fir_smaller;
if (internal_trace)
fprintf (stderr,
"FIRchain insert between node S: %ld (%f) and L: %ld (%f) \n",
smaller_node, nodes[smaller_node].fir_prior,
larger_node, nodes[larger_node].fir_prior);
// Plug ourselves into the fir chain
nodes[larger_node].fir_smaller = curnode;
nodes[curnode].fir_larger = larger_node;
nodes[smaller_node].fir_larger = curnode;
nodes[curnode].fir_smaller = smaller_node;
if (internal_trace)
fprintf
( stderr,
"FIRchain done - inserted %ld (%lf) between %ld (%lf) and %ld (%lf)\n",
curnode, nodes[curnode].fir_prior,
smaller_node, nodes[smaller_node].fir_prior,
larger_node, nodes[larger_node].fir_prior);
// and put ourselves into the FIRLat if appropriate.
firlat_slot = fir_2_slot (nodes[curnode].fir_prior, firlatlen);
current_occupant = firlat[firlat_slot ];
if (current_occupant == 0)
{
// No current occupant in the FIRLAT- put us in.
firlat [firlat_slot] = curnode;
nodes[curnode].firlat_slot = firlat_slot;
if (internal_trace)
fprintf (stderr,
"Inserted node %ld into empty slot at %ld (%f)\n",
curnode, firlat_slot, nodes[curnode].fir_prior);
}
else
{
// There's a prior occupant- need to see who's more accurate.
perfect_fir_value = slot_2_fir ( firlat_slot, firlatlen);
old_fir_error = fabs (nodes[current_occupant].fir_prior
- perfect_fir_value);
new_fir_error = fabs (nodes[curnode].fir_prior
- perfect_fir_value);
if (new_fir_error < old_fir_error)
{
firlat[firlat_slot] = curnode;
nodes[curnode].firlat_slot = firlat_slot;
if (internal_trace)
fprintf (stderr,
"Insert over %ld at slot %ld (%f) with node %ld\n",
current_occupant, firlat_slot,
nodes[curnode].fir_prior, curnode);
};
};
if (internal_trace)
firlat_significant (nodes, firlat, firlatlen);
}
//
// Routine to dump what's in the node set and lookaside table
//
static void dump_bit_entropy_data (
ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen,
long *firlat,
long firlatlen)
{
int i;
fprintf (stderr,
" node A0 c A1 c PFIR FIR< FIR> \n");
for (i = 0; i < nodeslen && nodes[i].fir_prior > -1.0; i++)
{
fprintf (stderr,
"%5d %5ld %5ld %5ld %5ld %5.3f %5ld %5ld \n",
i,
nodes[i].abet[0].nextcell,
nodes[i].abet[0].count,
nodes[i].abet[1].nextcell,
nodes[i].abet[1].count,
nodes[i].fir_prior,
nodes[i].fir_smaller,
nodes[i].fir_larger);
};
fprintf (stderr, "FIRlat contents:");
for (i = 0; i < firlatlen; i++)
{
if (firlat[i] > 0 )
{
fprintf (stderr, "\n %d %ld (perf: %f actual: %f) ",
i, firlat[i],
((i + 0.5) / firlatlen),
nodes[firlat[i]].fir_prior);
}
}
fprintf (stderr, "\n");
}
// Create a T tall, W wide shuffle network in node memory.
// This is destructive; note that nodes get their backchannel
// set correctly, but their FIRs are not changed, and they
// are NOT inserted into the FIRlat.
//
// Note also that this starts the shuffle block at the node
// pointed to by "firstnode", and returns the index of the first
// node NOT part of the shuffle block, which (handily) is the
// start of the nominal freelist.
//
static long nodes_init_shufflenet (
ENTROPY_FEATUREBUCKET_STRUCT *nodes,
long nodeslen,
long height,
long width,
long firstnode)
{
long w, h, a ;
long node, target, f;
// long trow, tcol;
f = firstnode ; // because firstnode is too long!
for (w = 0; w < width; w++) // zero to width-1
for (h = 0; h < height; h++) // zero to height-1
{
// What is our "from" node?
node = f + h + (w * height);
// Mark these nodes "not yet in the FIR chain".
nodes[node].fir_smaller = -1;
nodes[node].fir_larger = -1;
// set each member of the alphabet to the shuffle of the
// next column (modulo width, so the whole thing loops)
for (a = 0; a < ENTROPY_ALPHABET_SIZE; a++)
{
// zero the count
nodes[node].abet[a].count = 0;
//tcol = (w + 1) % width;
//trow = ((h * ENTROPY_ALPHABET_SIZE) + a) % height;
//target = f
// + tcol * height
// + trow;
// where does this node.abet point?
target = f // start of shuffle
+ ((w+1) % width) * height // go to the next column
+ (((h*ENTROPY_ALPHABET_SIZE) + a) % height) ; // within col
//
//fprintf (stderr, "\nlinking from node %ld slot %ld to %ld",
// node, a, target);
// aim our current node.abet there.
nodes[node].abet[a].nextcell = target;
// and initialize the prior
nodes[node].fir_prior = -1.0;
}
}
// fprintf (stderr, "\n");
// dump_bit_entropy_data (nodes, nodeslen, NULL, 0);
return (height * width + firstnode);
};
//
// How to learn Bit Entropic style
//
int crm_expr_bit_entropy_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
// learn the osb_bayes transform spectrum of this input window as
// belonging to a particular type.
// learn <flags> (classname) /word/
//
long i, j, k;
ENTROPY_FEATUREBUCKET_STRUCT *nodes; // the node array (after mapping!)
long nodeslen = 0; // how many nodes do we have?
long nodebytes; // how many bytes of nodes?
long *firlat; // the FIR prior lookaside table
ENTROPY_HEADER_STRUCT *headers; // the what-is-where headers.
long *fmap; // catcher for where the file gets mapped.
long firlatlen = -1; // how long is the FIR lookaside table?
long firlatbytes; // how many bytes of firlat
long long *totalbits; // How many total feature bits in this file
double localfir; // the FIR value we've accumulated on this path
long curnode; // the current node in our search
// Text to be learned stuff:
long textoffset;
long textmaxoffset;
long bitnum;
long sense;
long crosslink;
double crosslink_thresh, crosslink_mincount;
char clerrtxt[MAX_PATTERN];
long clerrlen;
double bitweight;
long fev; // fev is for fatal errors
// node file details
char htext[MAX_PATTERN]; // the node filename working buffer
long hlen;
char *learnfilename; // the real node file name
long hfsize; // size of the node file
struct stat statbuf; // for statting the node file
long made_new_file;
long unique_states = 0;
if (internal_trace)
fprintf (stderr, "executing a bit-entropic LEARN\n");
// extract the node file name
crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
hlen = apb->p1len;
hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN);
// set our cflags, if needed. The defaults are
// "case" and "affirm", (both zero valued).
// and "crosslink" disabled.
sense = 1;
if (apb->sflags & CRM_REFUTE)
{
sense = -sense;
if (user_trace)
fprintf (stderr, " refuting bit-entropic learning\n");
};
crosslink = 0;
if (apb->sflags & CRM_CROSSLINK)
{
crosslink = 1;
if (user_trace)
fprintf (stderr, " enabling crosslinking.\n");
};
unique_states = 0;
if (apb->sflags & CRM_UNIQUE)
{
unique_states = 1;
if (user_trace)
fprintf (stderr, " enabling unique states.\n");
};
//
// grab the filename, and stat the file
// note that neither "stat", "fopen", nor "open" are
// fully 8-bit or wchar clean...
i = 0;
while (htext[i] < 0x021) i++;
j = i;
while (htext[j] >= 0x021) j++;
// filename starts at i, ends at j. null terminate it.
htext[j] = '\000';
learnfilename = strdup ( &(htext[i] ));
// and stat it to get it's length
k = stat (learnfilename, &statbuf);
made_new_file = 0;
// quick check- does the file even exist?
if (k != 0)
{
// file didn't exist... create it
FILE *f;
if (user_trace)
{
fprintf (stderr, "\nHad to create new BEN file %s\n", &htext[i]);
fprintf (stderr, "Opening %s for BEN file creation\n", &htext[i]);
};
f = fopen (&htext[i], "wb");
if (!f)
{
fprintf (stderr,
"\n Couldn't open your new BEN file %s for writing; errno=%d .\n",
&htext[i], errno);
if (engine_exit_base != 0)
{
exit (engine_exit_base + 23);
}
else
exit (EXIT_FAILURE);
};
// did we get a value for sparse_spectrum_file_length?
// (yes, even though we aren't using "sparse spectra", we'll
// use the same -S or -s parameter. If we're in unique_states
// mode, then use the DEFAULT_ENTROPY_FILE_LENGTH. Otherwise,
// only allocate enough nodes for the header, the firlat, and
// the standard toroid (plus a little because we don't trust
// ourselves).
if (sparse_spectrum_file_length == 0 )
{
if (unique_states)
{
sparse_spectrum_file_length =
DEFAULT_BIT_ENTROPY_FILE_LENGTH;
}
else
{
sparse_spectrum_file_length =
( BIT_ENTROPIC_SHUFFLE_HEIGHT * BIT_ENTROPIC_SHUFFLE_WIDTH )
+ 1000;
};
};
// How many bytes of nodes do we need?
nodeslen = sparse_spectrum_file_length;
nodebytes = nodeslen
* (sizeof (ENTROPY_FEATUREBUCKET_STRUCT));
firlatlen = sparse_spectrum_file_length
* BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION;
firlatbytes = firlatlen * sizeof (long);
if (user_trace)
fprintf (stderr,
"No such file, must create it, length %ld nodes, %ld lats ( %ld nodebytes, %ld latbytes)\n",
nodeslen,
firlatlen,
nodebytes,
firlatbytes);
// Write them bytes, all NULs. (the 1024 is just some padding)
//
for (j = 0; j < (1024 + firlatbytes + nodebytes +
(ENTROPY_RESERVED_HEADER_LEN * sizeof (long))); j++)
fputc ('\000', f);
made_new_file = 1;
fclose (f);
// and reset the statbuf to be correct
k = stat (learnfilename, &statbuf);
};
//
hfsize = statbuf.st_size;
//
// map the bit-entropy file into memory
//
fmap = ( long *) crm_mmap_file (&(htext[i]),
0, hfsize,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
if (fmap == MAP_FAILED)
{
fev = fatalerror5 ("Couldn't get to the bit-entropic file named: ",
&htext[i], CRM_ENGINE_HERE);
return (fev);
};
headers = (ENTROPY_HEADER_STRUCT *) fmap;
/////////////////////////////////////////////////////////////////
//
// Now the file is memory mapped - if we just made it, we must
// put the right format into it.
if ( made_new_file )
{
int i;
// Start with initializing the headers.
//
headers->firlatstart = ENTROPY_RESERVED_HEADER_LEN; //Start of the FIRlat
//
firlat = (long *) & fmap[headers->firlatstart];
//
headers->firlatlen = firlatlen;
//
// the actual nodes start after the FIRlat
headers->nodestart = headers->firlatstart + firlatlen;
//
// and the actual address of the first node is:
nodes = (ENTROPY_FEATUREBUCKET_STRUCT *) & (fmap[headers->nodestart]);
//
// and the number of nodes is...
headers->nodeslen = nodeslen;
//
// headers [4] and [5] are for the long long totalbits.
headers->totalbits = 0;
// initialize the FIR lookaside to contain the metanode (node 0) as
// a sentinel value, except for the two extremum nodes.
for (i = 0; i < firlatlen; i++)
firlat[i] = 0;
// initialize the metanode (node 0) which points to the
// start and end of the FIR chains, and to the free list.
nodes[0].fir_smaller = 3; // yes, the initial DB looks like an
nodes[0].fir_larger = 2; // infinity sign, with node 0 "twisted"
// the start node is node 1, at node 0, alphabet 1 -> next, == 1
nodes[0].abet[1].nextcell = 1;
if (! unique_states )
{
// put in a shuffled initial state.
long first_freenode;
long width, height;
// width should be the square root of the size, rounded down
// to a multiple of 8 (assuming byte-wise orientation)
width = ((int) (sqrt (sparse_spectrum_file_length)) / 8 ) * 8;
// height should be as many as possible with complete rows
height = sparse_spectrum_file_length / width;
if (user_trace)
fprintf (stderr, "New toroid. width: %ld, height %ld\n",
width, height);
first_freenode = nodes_init_shufflenet
(nodes, nodeslen,
height, width,
1); // the 1 means "node 1 is first node!
// mark node 1 as the start of the actual Markov chain
nodes[0].abet[1].nextcell = 1;
// chain the rest of the nodes into a freelist.
for (i = first_freenode; i < nodeslen; i++)
{
nodes[i].abet[0].nextcell = i+1 ;
nodes[i].fir_prior = -1.0;
}
// mark the last node in the freelist as the end of the node
// freelist.
nodes[0].abet[0].nextcell = first_freenode;
nodes[nodeslen-1].abet[0].nextcell = 0;
}
else
{// Else very basic init - nodes 1, 2, and 3 in a vee, with
// nodes 2 and 3 with no sucessors whatsoever.
// initialize the START node (node 1) to point to node 2 on a 0
nodes[1].abet[0].count = 0;
nodes[1].abet[0].nextcell = 2;
// initialize the START node (node 1) to point to node 3 on a 1
nodes[1].abet[1].count = 0;
nodes[1].abet[1].nextcell = 3;
// initialize node 2 (that is, first bit is zero)
// initialize the outoput on a 0 to be "nowhere"
nodes[2].abet[0].count = 0;
nodes[2].abet[0].nextcell = 0;
// initialize the output on 1 to be "nowhere"
nodes[2].abet[1].count = 0;
nodes[2].abet[1].nextcell = 0;
// initialize node 3 (that is, first bit is 1)
// initialize the outoput on a 0 to be "nowhere"
nodes[3].abet[0].count = 0;
nodes[3].abet[0].nextcell = 0;
// initialize the output on 1 to be "nowhere"
nodes[3].abet[1].count = 0;
nodes[3].abet[1].nextcell = 0;
// initialize nodes 4 through n into a free list. We don't care
// about any other field than the next node field while on the
// free list.
for ( i = 4; i < nodeslen; i++)
{
nodes[i].abet[0].nextcell = i+1 ;
nodes[i].fir_prior = -1.0;
}
nodes[0].abet[0].nextcell = 4;
// mark the last node in the freelist as the end of the node
// freelist.
nodes[nodeslen-1].abet[0].nextcell = 0;
};
// Either way, we have to initialize the start of the FIR system
// initialize the START node (node 1) into the FIR system
nodes[1].fir_prior = 0.5;
nodes[1].fir_smaller = 2;
nodes[1].fir_larger = 3;
//firlat[fir_2_slot (0.5, firlatlen)] = 1;
//nodes[1].firlat_slot = fir_2_slot(0.5, firlatlen);
// initialize node 2 into the FIR system
nodes[2].fir_prior = 0.0 ;
nodes[2].fir_smaller = 0;
nodes[2].fir_larger = 1;
//firlat[fir_2_slot (0.0, firlatlen)] = 2;
//nodes[2].firlat_slot = fir_2_slot(0.0, firlatlen);
// initialize node 3 into the FIR system
nodes[3].fir_prior = 1.0;
nodes[3].fir_smaller = 1;
nodes[3].fir_larger = 0;
//firlat[fir_2_slot (1.0, firlatlen)] = 3;
//nodes[3].firlat_slot = fir_2_slot(1.0, firlatlen);
}
// If this isn't a new file, grab stuff out of the header:
firlat = (long *) & (fmap [headers->firlatstart]);
firlatlen = headers->firlatlen;
firlatbytes = firlatlen * (sizeof (long));
nodes = (ENTROPY_FEATUREBUCKET_STRUCT *) & (fmap[headers->nodestart]);
nodeslen = headers->nodeslen;
nodebytes = nodeslen * (sizeof (ENTROPY_FEATUREBUCKET_STRUCT));
// headers [4] and [5] are for the long long totalbits.
// totalbits = (long long *) &headers[4];
totalbits = & (headers->totalbits);
// In this format, the first BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION
// longwords contain the FIR prior lookaside table and the
// remainder contain actual nodes, and start immediately after
// the FIR lookaside table.
//
if (internal_trace)
fprintf (stderr,
"Firlatlen = %ld (%ld bytes), nodeslen = %ld (%ld bytes) \n",
firlatlen, firlatbytes, nodeslen, nodebytes);
if (user_trace)
fprintf (stderr, "Bit-entropic file %s has length %ld nodes\n",
&htext[i], nodeslen);
// If this is a new file, we have to write in the FIR lookaside
// table and the free node list. Otherwise, the free list will
// recurse and GCing will not work. Additionally, we need to
// initialize node 0 (the metanode) and node 1 (the start node);
// we'll also initialize nodes 2 and 3 to be the 0- and 1-successor
// of node 1.
//
////////////////////////////////////////////////////////////////
// Crosslink Threshold is dependent on the number of nodes...
// crosslink_thresh = 1.0 / nodeslen;
crosslink_thresh = 0.5 / nodeslen;
// crosslink_thresh = 0.25 / nodeslen;
// crosslink_thresh = 0.1 / nodeslen;
// crosslink_thresh = 0.05 / nodeslen;
// crosslink_thresh = 0.01 / nodeslen;
// crosslink_thresh = 0.003 / nodeslen;
// crosslink_thresh = 0.001 / nodeslen;
////////////////////////////////////////////////////////////
// Crosslink Mincount is the minimum number of good bits before we
// allow a crosslink
crosslink_mincount = 2;
// Running 1 megaslot at 1E-7 thresh overflows TREC06 public.
// Running 2 megaslot at 1E-7 thresh uses up 76% of TREC06 public, but
// with lousy TER (36/10000, 24/1000) and 5140 total errors
// even better might be an _adaptive_ threshold, that looks at the
// available freelist length and modulates according to that.
// Boundary conditions: with empty file, threshold is very small.
// and with a full file, threshold is 10/nodeslen.
// (we use the parameterized form of a line here - the first term can
// be dropped, because it's always times zero. But we leave it here
// in case we actually want a nonzero initial threshold.)
// If user specified an overriding value, use that.
if (apb->s2len > 0)
{
crm_get_pgm_arg (clerrtxt, MAX_PATTERN, apb->s2start, apb->s2len);
clerrlen = apb->s2len;
clerrlen = crm_nexpandvar (clerrtxt, clerrlen, MAX_PATTERN);
crosslink_thresh = strtod (clerrtxt, NULL) * crosslink_thresh;
};
////////////////////////////////////////////////////////////////
//
// Setup is complete. We now just churn through the
// incoming bitstream, incrementing and allocating new cells
// as needed. Magic Merging and Cloning action may happen as well.
//
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
// we'll number bits from 7 to 0, with 7 being most significant (and
// also, in most ASCII text, it's zero)
bitnum = 8;
// The current node starts at the START node, which is the
// the abet[1] link of the metanode (node 0)
curnode = nodes[0].abet[1].nextcell;
bitweight = BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT;
// localfir is the value of the FIR summary as we sit on
// the current node. Thus, it contains the influence of the
// most recent step that got us here, but NOT the effect of the current
// alph.
//
// Of course, the starting is DEFINED as the FIR of the
// START node.
localfir = nodes[curnode].fir_prior;
if (internal_trace)
fprintf (stderr, "Going to do %ld characters.\n",
(unsigned long) textmaxoffset - (unsigned long) textoffset);
while (textoffset + 1 < textmaxoffset || bitnum > 0)
{
unsigned short thischar;
unsigned short thisalph;
unsigned short nextalph;
unsigned short nextnextalph;
unsigned short nextnextnextalph;
double newnodefir;
// get the next alph member.
bitnum = bitnum - ENTROPY_CHAR_SIZE;
if (bitnum < 0)
{
bitnum = 7;
textoffset ++;
};
thischar = txtptr [ textoffset ];
thisalph = ( thischar >> bitnum ) & ENTROPY_CHAR_BITMASK;
if (internal_trace)
firlat_sanity_scan (firlat, firlatlen, nodes, nodeslen);
// nextalph is a lookahead one bit. We use it for smarter
// crosslinking.
nextalph = ( thischar >> (bitnum - 1) ) & ENTROPY_CHAR_BITMASK;
if (textoffset < textmaxoffset && bitnum == 0)
nextalph = ((txtptr [textoffset+1]) >> 7) & ENTROPY_CHAR_BITMASK;
// nextnextalph is lookahead two bits. Yet smarter crosslinking
nextnextalph = ( thischar >> (bitnum - 2) ) & ENTROPY_CHAR_BITMASK;
if (textoffset < textmaxoffset && bitnum == 0)
nextnextalph = txtptr [textoffset+1] >> 6 & ENTROPY_CHAR_BITMASK;
if (textoffset < textmaxoffset && bitnum == 1)
nextnextalph = txtptr [textoffset+1] >> 7 & ENTROPY_CHAR_BITMASK;
// nextnextnextalph is lookahead three bits. Even more smarter
nextnextnextalph = ( thischar >> (bitnum - 3) ) & ENTROPY_CHAR_BITMASK;
if (textoffset < textmaxoffset && bitnum == 0)
nextnextalph = txtptr [textoffset+1] >> 5 & ENTROPY_CHAR_BITMASK;
if (textoffset < textmaxoffset && bitnum == 1)
nextnextnextalph = txtptr [textoffset+1] >> 6 & ENTROPY_CHAR_BITMASK;
if (textoffset < textmaxoffset && bitnum == 2)
nextnextnextalph = txtptr [textoffset+1] >> 7 & ENTROPY_CHAR_BITMASK;
if (internal_trace)
fprintf (stderr, "Working char '%c' (%ld untouched), bit %ld.\n",
thischar,
(unsigned long) textmaxoffset - (unsigned long) textoffset -1,
bitnum);
if (internal_trace)
fprintf (stderr,
"\nNow at node %ld, alph %d, (bit %ld) localFIR %f\n",
curnode, thisalph, bitnum, localfir);
// remove this node from the FIRlat and the fir chain
if (internal_trace)
fprintf (stderr, "Removing node %ld from FIRlat \n",
curnode);
firlat_remove_node (nodes, nodeslen, firlat, firlatlen, curnode);
///////////////////////////////////////////////////////////////
//
// calculate node's new FIR and where it will go into the FIRlat
//
{
long totalcount, i;
totalcount = 0;
for (i = 0; i < ENTROPY_ALPHABET_SIZE; i++)
totalcount += nodes[curnode].abet[i].count;
newnodefir =
( localfir + (totalcount * nodes[curnode].fir_prior) )
/ ( totalcount + 1.0 );
};
nodes[curnode].fir_prior = newnodefir;
//////////////////////////////////////////////////////
//
// Increment our exit path from curnode; remember that we
// might be "unlearning" so use "sense" rather than just incrementing
// Of course, this doesn't completely unlearn, as there's always the
// chance we allocated a new node, or did a merge or clone on
// the initial learn and there's no way to "undo" that for certain.
//
nodes[curnode].abet[thisalph].count += sense;
// and keep track of the total visits this file has seen.
*totalbits += sense;
// GROT GROT GROT
// on an UNLEARN, we might actually decrement total_count to
// zero, meaning that this node never actually got hit. In
// which case, smoe gyrations are appropriate. But for now, let's
// ignore the problem.
////////////////////////////////////////////////////////////
//
// Now put the node back where it belongs.
//
if (internal_trace)
{
fprintf (stderr,
"Current Status before replacement at node %ld\n", curnode);
dump_bit_entropy_data (nodes, nodeslen, firlat, firlatlen);
};
firlat_insert_node (nodes, nodeslen, firlat, firlatlen, curnode);
if (internal_trace)
{
fprintf (stderr,
"Current Status after replacement at node %ld\n", curnode);
dump_bit_entropy_data (nodes, nodeslen, firlat, firlatlen);
};
//////////////
// we've fixed up the FIR chain;
// we've fixed up the FIR lookaside table.
// It's now time to move on to the next node.
// If there is no next node, the fun begins and we need to
// allocate yet another node.
/////////////////////////////////////////////////////
//
// Calculate the new local FIR for this path (NOT the
// node, but for the current path we will have taken at
// the next step in the node model.
//
localfir = ( thisalph * BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT)
+ ( localfir * ( 1.0 - BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT ));
if (localfir > 1.0000) localfir = 1.00;
if (localfir < 0.0000) localfir = 0.00;
//////////////////////////////////////////////////////////
// Time to move to the next node.
//
// do we have a real cell as a successor already (i.e. have
// we been here before?) Or is this the first time?
//
if (nodes[curnode].abet[thisalph].nextcell > 0)
{
// yes, there's a node already for this node's successor
// and this particular alph element.
// Make that the current node, and move along!
curnode = nodes[curnode].abet[thisalph].nextcell;
#ifdef BEN_GRAPHIC
fprintf (stderr, ".");
#endif
if (internal_trace)
fprintf (stderr, "next old cell %ld\n", curnode);
}
else
{
//
// no successor on this alphabet slot yet, so we need a new one.
//
// If we have no free nodes, we have no choice but to crosslink.
// If we have free nodes, and crosslinking isn't enabled, we
// just allocate a new node.
// If crosslinking is enabled, we see what the preferred
// new FIR will be, and see what node will work for that.
//
// Note that this is a lattice - try A, then B, then C.
// so we have to do it in two phases.
long further_node, i, do_crosslink;
double crosslink_err;
//fprintf (stderr, "-");
// Sentinel to know we need to calculate a crosslink
further_node = -10001;
// fprintf (stderr,
// "Ran off the end of entropy model at node %ld\n",
// curnode);
// Assume no crosslink unless we find out otherwise.
do_crosslink = 0;
// Is crosslink forced because we're out of nodes?
if (nodes[0].abet[0].nextcell == 0)
{
do_crosslink = 1;
// fprintf (stderr, "$");
if (internal_trace)
fprintf (stderr, "Empty freelist, forced crosslink\n");
};
// do we possibly want to crosslink anyway? At least,
// we may need to calculate the best next node, which is
// the one with the closest FIR to what our nextnode FIR is.
if ( crosslink || do_crosslink )
{
further_node = firlat_find_closest_node
( nodes,
nodeslen,
firlat,
firlatlen,
localfir);
crosslink_err = fabs (localfir - nodes[further_node].fir_prior);
if (
crosslink_err < crosslink_thresh
// this node OK?
&& nodes[further_node].abet[nextalph].count > crosslink_mincount
// One node lookahead
&& nodes[further_node].abet[nextalph^0x1].count == 0
&& nodes[further_node].abet[nextalph].nextcell > 0
// Two nodes lookahead
&& nodes[nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph].count > 0
&& nodes [nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph^0x1].count == 0
// THREE nodes lookahead
&& nodes[nodes[nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph].nextcell].abet[nextnextnextalph].count > 0
&& nodes[nodes[nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph].nextcell].abet[nextnextnextalph^0x1].count == 0
)
{
do_crosslink = 1;
#ifdef BEN_GRAPHIC
fprintf (stderr, "!");
#endif
if (internal_trace)
fprintf (stderr, "Opportunistic crosslink accepted %lf\n",
crosslink_err);
}
else
// try one higher in the FIR stream...
{
further_node = nodes[further_node].fir_larger;
crosslink_err = fabs
(localfir - nodes[further_node].fir_prior);
if (crosslink_err < crosslink_thresh
&& nodes[further_node].abet[nextalph].count > 0
&& nodes[further_node].abet[nextalph^0x1].count == 0
)
{
do_crosslink = 1;
#ifdef BEN_GRAPHIC
fprintf (stderr, "!^");
#endif
if (internal_trace)
fprintf (stderr,
"Opportunistic up-crosslink accepted %lf\n",
crosslink_err);
}
else
// try one lower in the FIR stream....
{
further_node = nodes[further_node].fir_smaller;
further_node = nodes[further_node].fir_smaller;
crosslink_err = fabs
(localfir - nodes[further_node].fir_prior);
if (crosslink_err < crosslink_thresh
&& nodes[further_node].abet[nextalph].count > 0
&& nodes[further_node].abet[nextalph^0x1].count == 0
)
{
do_crosslink = 1;
#ifdef BEN_GRAPHIC
fprintf (stderr, "!v");
#endif
if (internal_trace)
fprintf (stderr,
"Opportunistic down-crosslink accepted %lf\n",
crosslink_err);
}
}
}
// Note that further_node is set either way, so we don't
// need to recalculate it.
};
// Are we crosslinking?
if (do_crosslink)
{
// Enforce sanity on further_node before crosslink
if (further_node < 1)
{
if (further_node < 1)
fprintf (stderr,
"Bogus crosslink %ld! Branching back to node 1.\n",
further_node);
further_node = 1;
fprintf (stderr, "?");
};
// Put furnode in as the next node for this alph
nodes[curnode].abet[thisalph].nextcell = further_node;
}
else
{
// Time for a new node. We link it in with the
// current alph's slot in curnode..
#ifdef BEN_GRAPHIC
fprintf (stderr, "@");
#endif
further_node = nodes[0].abet[0].nextcell;
if (internal_trace)
fprintf (stderr, "Allocating new cell %ld\n",
further_node);
nodes[curnode].abet[thisalph].nextcell = further_node;
// cut this node out of the front of the free list.
nodes[0].abet[0].nextcell =
nodes[further_node].abet[0].nextcell;
// wipe out any old data that might be there.
for (i = 0; i < ENTROPY_ALPHABET_SIZE ; i++)
{
nodes[further_node].abet[i].count = 0;
nodes[further_node].abet[i].nextcell = 0;
};
// initialize the new node with the current FIR prior
nodes[further_node].fir_prior = localfir;
// hook the new node into the FIR chain
firlat_insert_node
(nodes, nodeslen, firlat, firlatlen, further_node);
}
if (internal_trace)
fprintf (stderr,
"Now moving from node %ld to %ld (nfir %f, pfir %f)\n",
curnode, further_node,
nodes[further_node].fir_prior, localfir);
curnode = further_node;
}
}
// All done munging the markov chain.
// Time to clean up and go home.
// and remember to let go of the mmap and the pattern bufffer
crm_force_munmap_addr ((void *) fmap);
#ifdef POSIX
// Because mmap/munmap doesn't set atime, nor set the "modified"
// flag, some network filesystems will fail to mark the file as
// modified and so their cacheing will make a mistake.
//
// The fix is to do a trivial read/write on the .css ile, to force
// the filesystem to repropagate it's caches.
//
{
int hfd; // hashfile fd
char foo;
hfd = open (learnfilename, O_RDWR);
read (hfd, &foo, sizeof(foo));
lseek (hfd, 0, SEEK_SET);
write (hfd, &foo, sizeof(foo));
close (hfd);
}
#endif
free (learnfilename);
return (0);
}
int crm_expr_bit_entropy_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
// fprintf (stderr, "Sorry, Not Yet Implemented\n");
//return (0);
// classify the sparse spectrum of this input window
// as belonging to a particular type.
//
// This code should look very familiar- it's cribbed from
// the code for LEARN
//
long i, j, k;
// the statistics file names
char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN];
long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN;
long hlen;
// the match statistics variable
char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)];
long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100);
long slen;
char svrbl[MAX_PATTERN]; // the match statistics text buffer
long svlen;
long fnameoffset;
char fname[MAX_FILE_NAME_LEN];
long crosslink;
struct stat statbuf; // for statting the hash file
long totalfeatures = 0; // total features
double tprob; // total probability in the "success" domain.
// variables used when tracing the markov graph
long curnode;
double total_entropy[MAX_CLASSIFIERS];
// Variables for calculating entropy
double ptc[MAX_CLASSIFIERS]; // current running probability of this class
double renorm = 0.0;
double entropy_renorm = 0.0;
double entropy_sum = 0.0;
// int hfds[MAX_CLASSIFIERS];
ENTROPY_FEATUREBUCKET_STRUCT *nodestarts[MAX_CLASSIFIERS];
ENTROPY_FEATUREBUCKET_STRUCT *nodes;
ENTROPY_HEADER_STRUCT *headers[MAX_CLASSIFIERS]; // pointers in the files.
long *fmaps[MAX_CLASSIFIERS];
long nodelens[MAX_CLASSIFIERS];
long nodeslen;
long *firlats[MAX_CLASSIFIERS];
long firlatlens[MAX_CLASSIFIERS];
char *hashname[MAX_CLASSIFIERS];
long hashlens[MAX_CLASSIFIERS];
long firjumps[MAX_CLASSIFIERS];
long long *totalbits[MAX_CLASSIFIERS];
long succhash;
long vbar_seen; // did we see '|' in classify's args?
long maxhash;
long fnstart, fnlen;
long fn_start_here;
long textoffset;
long textmaxoffset;
long bestseen;
long thistotal;
if (internal_trace)
fprintf (stderr, "executing a bit-entropic CLASSIFY\n");
// extract the hash file names
crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len);
hlen = apb->p1len;
hlen = crm_nexpandvar (htext, hlen, htext_maxlen);
// extract the optional "match statistics" variable
//
crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
svlen = apb->p2len;
svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN);
{
long vstart, vlen;
crm_nextword (svrbl, svlen, 0, &vstart, &vlen);
memmove (svrbl, &svrbl[vstart], vlen);
svlen = vlen;
svrbl[vlen] = '\000';
};
// status variable's text (used for output stats)
//
stext[0] = '\000';
slen = 0;
// set our flags, if needed. The defaults are
// "case"
// Crosslink, in this case, means to allow
// FIR jumping whenever we've not seen this
// situation before.
crosslink = 0;
if (apb->sflags & CRM_CROSSLINK)
{
crosslink = 1;
if (user_trace)
fprintf (stderr, " enabling crosslinking.\n");
};
// Now, the loop to open the files.
bestseen = 0;
thistotal = 0;
// initialize our arrays for N .css files
for (i = 0; i < MAX_CLASSIFIERS; i++)
{
ptc[i] = 0.5; // priori probability
};
// Our evaluator is to simply traverse the Markov chain that
// starts at nodes[0].abet[0].nextcell, and add together all of
// the entropies - that is, the -log2(prob) of each node. Of
// course, we have to nerf the probabilities away from 0, as
// that would yield a log2(0.0) which is infinite and thus not
// well-represented in IEEE floating point.
//
// For our purposes, we will use:
// probability = this_count + BIT_ENTROPIC_PROBABILITY_NERF
// / total_count
//
// If we run off the "end" of the Markov chain, we use the FIRs
// to quickly find another node that's "really similar" to this
// node. (this can only happen if the initialized Markov
// contained open ends; this doesn't happen when we initialize
// with a perfect shuffle graph which is toroidal and has no
// "end" to run off of.
//
vbar_seen = 0;
maxhash = 0;
succhash = 0;
fnameoffset = 0;
// now, get the file names and mmap each file
// get the file name (grody and non-8-bit-safe, but doesn't matter
// because the result is used for open() and nothing else.
// GROT GROT GROT this isn't NULL-clean on filenames. But then
// again, stdio.h itself isn't NULL-clean on filenames.
if (user_trace)
fprintf (stderr, "Classify list: -%s- \n", htext);
fn_start_here = 0;
fnlen = 1;
while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1)))
{
crm_nextword (htext,
hlen, fn_start_here,
&fnstart, &fnlen);
if (fnlen > 0)
{
strncpy (fname, &htext[fnstart], fnlen);
fn_start_here = fnstart + fnlen + 1;
fname[fnlen] = '\000';
if (user_trace)
fprintf (stderr, "Classifying with file -%s- "\
"succhash=%ld, maxhash=%ld\n",
fname, succhash, maxhash);
if ( fname[0] == '|' && fname[1] == '\000')
{
if (vbar_seen)
{
nonfatalerror5 ("Only one ' | ' allowed in a CLASSIFY. \n" ,
"We'll ignore it for now.", CRM_ENGINE_HERE);
}
else
{
succhash = maxhash;
};
vbar_seen ++;
}
else
{
// be sure the file exists
// stat the file to get it's length
k = stat (fname, &statbuf);
// quick check- does the file even exist?
if (k != 0)
{
nonfatalerror5 ("Nonexistent Classify table named: ",
fname, CRM_ENGINE_HERE);
}
else
{
// file exists - do the open/process/close
//
hashlens[maxhash] = statbuf.st_size;
// mmap the hash file into memory so we can bitwhack it
fmaps[maxhash] =
crm_mmap_file ( fname,
0, hashlens[maxhash],
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
headers[maxhash] = (ENTROPY_HEADER_STRUCT *)
fmaps[maxhash];
if (headers[maxhash] == MAP_FAILED )
{
nonfatalerror5 ("Couldn't memory-map the table file",
fname, CRM_ENGINE_HERE);
}
else
{
//
// Check to see if this file is the right version
//
//long fev;
//if (hashes[maxhash][0].hash != 0 ||
// hashes[maxhash][0].key != 0)
//{
// fev =fatalerror ("The .css file is the wrong version! Filename is: ",
// fname);
// return (fev);
//};
// Pull the file info out of the header (the
// first 1024 bytes is the header)
//firlats[maxhash] = (long *)
// & headers[maxhash][headers[maxhash][0]];
firlats[maxhash]= (long *)
& (fmaps[maxhash] [headers[maxhash]->firlatstart]);
//firlatlens[maxhash] = headers[maxhash][1];
firlatlens[maxhash] = headers[maxhash]->firlatlen;
//nodestarts[maxhash] = (ENTROPY_FEATUREBUCKET_STRUCT *)
// & (headers[maxhash][headers[maxhash][2]]);
nodestarts[maxhash] = (ENTROPY_FEATUREBUCKET_STRUCT *)
& (fmaps[maxhash] [headers[maxhash]->nodestart]);
//nodelens[maxhash] = headers[maxhash][3];
nodelens[maxhash] = headers[maxhash]->nodeslen;
// totalbits[maxhash] = (long long *) &headers[4];
totalbits[maxhash] = (long long *)
&(headers[maxhash]->totalbits);
if (internal_trace)
fprintf (stderr,
"File #%ld firlat %lx len %ld and nodes %lx\n",
maxhash,
(unsigned long ) firlats[maxhash],
firlatlens [maxhash],
(unsigned long )nodestarts[maxhash]);
// Keep a copy of the data filename for later.
hashname[maxhash] = (char *) malloc (fnlen+10);
if (!hashname[maxhash])
untrappableerror(
"Couldn't malloc hashname[maxhash]\n",
"We need that part later, so we're stuck. Sorry.");
strncpy(hashname[maxhash],fname,fnlen);
hashname[maxhash][fnlen]='\000';
maxhash++;
};
};
};
if (maxhash > MAX_CLASSIFIERS-1)
nonfatalerror5 ("Too many classifier files.",
"Some may have been disregarded",
CRM_ENGINE_HERE);
};
};
//
// If there is no '|', then all files are "success" files.
if (succhash == 0)
succhash = maxhash;
// a CLASSIFY with no arguments is always a "success".
if (maxhash == 0)
return (0);
// sanity checks... Uncomment for super-strict CLASSIFY.
//
// do we have at least 1 valid .css files?
if (maxhash == 0)
{
fatalerror5 ("Couldn't open at least 2 .css files for classify().", "",
CRM_ENGINE_HERE);
};
// do we have at least 1 valid .css file at both sides of '|'?
//if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2))
// {
// nonfatalerror (
// "Couldn't open at least 1 .ben file per SUCC | FAIL classes "
// " for classify().\n","Hope you know what are you doing.");
// };
// now all of the files are mmapped into memory,
// and we can start following the chains and adding up entropy.
i = 0;
j = 0;
k = 0;
thistotal = 0;
//////////////////////////////////////////////////////////////
//
// The actual graph-following starts here. To maximize locality
// of reference, we do the graph-following first for one graph,
// then for another, then for another. That way, we tend to stay
// in the CPU cache better.
//
// Note to self - doing this may or may not speed up the other
// classifiers- but because they tend to use less memory it may
// make less of a difference.
{
long c; // c is going to be our classifier counter
unsigned short thischar;
unsigned short thisalph;
double localfir;
long bitnum;
long nextnode;
// zero out our entropy accumulators
//
for (c = 0; c < MAX_CLASSIFIERS; c++)
{
total_entropy[c] = 0.0;
firjumps[c] = 0;
}
for (c = 0; c < maxhash; c++)
{
if (internal_trace)
fprintf (stderr, "Now running against stats file %ld\n", c);
// initialize our per-graph-following stuff:
totalfeatures = 0;
nodes = nodestarts[c];
nodeslen = nodelens[c];
// initialize our starting conditions.
curnode = nodes[0].abet[1].nextcell;
localfir = nodes[nodes[0].abet[1].nextcell].fir_prior;
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
bitnum = 8;
// Now the big loop to follow the graph
while (textoffset + 1 < textmaxoffset || bitnum > 0)
{
long nodetotcount, itc;
float add_entropy;
bitnum = bitnum - ENTROPY_CHAR_SIZE;
if (bitnum < 0)
{
bitnum = 7;
textoffset ++;
}
thischar = txtptr [textoffset];
thisalph = ( thischar >> bitnum ) & ENTROPY_CHAR_BITMASK;
totalfeatures++;
if (internal_trace)
firlat_sanity_scan (firlats[c], firlatlens[c], nodes, nodeslen);
// update our local FIR to what it will be when we're
// at the next node. (this is effective not at the
// current node, but one step further into the future.)
localfir = thisalph * BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT
+ localfir * (1.0 - BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT);
// and go where it told us. Note that we may want to
// FIR jump if we've never been here before, or there is
// no "next node", or there _is_ no next node.
// we *have* been to before.
nodetotcount = 0;
for (itc = 0; itc < ENTROPY_ALPHABET_SIZE; itc++)
nodetotcount += nodes[curnode].abet[itc].count;
if (internal_trace)
fprintf (stderr, "%ld/%ld ",
nodes[curnode].abet[thisalph].count, nodetotcount);
if (
( nodetotcount < 1)
|| nodes[curnode].abet[thisalph].count < 1
|| nodes[curnode].abet[thisalph].nextcell < 1
)
{
// Either no corpus history, or
// this cell has never gone here before, or
// we're off the edge of the model completely
// In any case, we're better off FIR-jumping no
// matter what because we'll never be back "in corpus"
double error;
nextnode = firlat_find_closest_node
(nodes, nodeslen, firlats[c], firlatlens[c], localfir);
error = localfir - nodes[nextnode].fir_prior;
if (internal_trace)
fprintf (stderr,
"FIR jumping, error = %12lg, new node %ld \n",
error, nextnode);
if (error < 0.00) error = - error;
// and add the error entropy estimate
// due to the FIR jump; since this is knowable
// due to it being "no choice", the entropy is very small.
add_entropy =
// pow (2.0, (error / BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT));
// stats_2_entropy (1, nodelens[c]) ;
// the following one seems to work well.. entirely
// unjustifiably, but it works. ROC=0.128
// stats_2_entropy (1,10);
stats_2_entropy (1,1000);
// This one is horrible. 5% error rates...
// stats_2_entropy (nodes[nextnode].total_count,
// nodes[0].total_count);
//stats_2_entropy (0, nodetotcount);
//0.5;
// This one is justifiable, and is very close to
// the unjustifiable one above, but not quite as good.
// GROT GROT GROT
//stats_2_entropy (1, nodetotcount + 1);
total_entropy[c] = total_entropy[c] + add_entropy;
curnode = nextnode;
firjumps[c]++;
}
else
// else no FIRjump, things look good.
// we're still "in the corpus" or we've not enabled
// FIR jumping.
{
add_entropy =
stats_2_entropy ( nodes[curnode].abet[thisalph].count,
nodetotcount );
total_entropy[c] =
total_entropy[c] +
add_entropy;
// and move to the next node.
curnode = nodes[curnode].abet[thisalph].nextcell;
// go back to the start cell if we're off the end and
// we've not enabled crosslinked FIRjumping
if (curnode < 1)
{
fprintf (stderr, "X");
curnode = nodes[0].abet[0].nextcell;
};
};
if (internal_trace)
fprintf (stderr, "%f \n", add_entropy);
}
}
// Test Results on TREC public testset
// No node-merge or node-clone
// Unique states
// TOE - 1000000 nodes - 6339/92189, ~356/144/64 min (filled@ ~10K)
// 128 x 128 lattice:
// no FIR-hop
// TOE
// 256 x 256 lattice:
// no FIR-hop
// FIR-hopping
// TOE shuffle - 3342/91996, time 199/71/15 minutes
// 512 x 512 lattice:
// no FIR-hop
// TOE - 3035/92189 ( ~3 % ) time: 233 / 74 / 25 min
// TEFT - 8609/75962 and 9508/92189, 272 minutes
// 1024x1024 lattice:
// FIR-hop
// TOE 7914/92189 and ?/96/60 minutes
// Fixed 5109/92189 and 378/153/63 minutes (with some suspiciously
// long delays during a few messages)
// 2048x2048 lattice:
// no FIR-hop
// TOE 3780 / 92189 318/87/60 min
if (internal_trace)
{
for (k = 0; k < maxhash; k++)
fprintf (stderr, "entropy for classifier %ld is %f\n",
k, total_entropy[k]);
};
////////////////////////////////////////////////////
//
// Now build the output report (if desired, that is, svlen > 0
//
// How we convert entropy into probability and pR:
//
// Basically, we cheat a lot. If you consider the definition of
// entropy, it's the log of probability. The probability of a
// particular document being generated by a particular markov
// model is equal to the product of the probabilities of each
// particular transition. The entropy is the sum of the log of
// the individual transition probabilities, and (remembering
// that multiplying in the real domain is equivalent to summing
// in the log domains) we see that the absolute probabililities
// (that is, the monkeys-on-typewriters probability) of each
// document are just 1 / (2 ^ entropy), and the relative
// probabilities can then be scaled out by summing and dividing
// in the real domain.
//
// However, those probabilities are REALLY SMALL - well below
// the lower limit of IEEE floating point numbers, and so we can
// renormalize them to close to 1 by addition of an arbitrary
// constant (which is equivalent to multiplying the
// probabilities by an arbitrary constant), thus converting them
// to relative rather than absolute probabilities (oh- we should
// do a renormalization so they really do add to 1.000).
//
// Once we have relative probabilities, we can then jump right
// back into the standard summing code to decide on success or
// failure.
// Step 1: find the minimum needed to add to start renormalization
entropy_renorm = total_entropy[0];
entropy_sum = 0.0;
for (k = 0; k < maxhash; k++)
{
if (internal_trace)
fprintf (stderr, "entropy class %ld is %f\n", k, total_entropy[k]);
if (total_entropy[k] < entropy_renorm )
{
entropy_renorm = total_entropy [k];
}
entropy_sum += total_entropy[k];
};
if (internal_trace)
fprintf (stderr, "entropy_renorm = %f\n", entropy_renorm);
// Step 2: add entropy renorm constant, and convert to unnormalized P
// Note that this is 2 raised to very small powers, and often
// returns zero, so we have to nerf it with DBL_MIN.
for (k = 0; k < maxhash; k++)
{
// Bad formula - technically accurate but pR maxes out
// at 304 for nearly all texts
//ptc [k] = 1 /
// ( pow (2.00, (total_entropy[k] - entropy_renorm)) + 1000 * DBL_MIN);
// better formula - probs very close to 0.5
// ptc [k] = (entropy_sum - total_entropy[k] ) / entropy_sum;
//
// try a much smaller exponential to bring things back into line:
// Note: the 1.1 below is _empirical_, chosen to normalize this
// classifier to a +/1 10 pR units for good thick threshold training.
// It has no other meaning beyond that.
ptc [k] = 1 /
(pow (1.1, (total_entropy[k] - entropy_renorm)) + (1000 * DBL_MIN)) ;
if (internal_trace)
fprintf (stderr, "class %ld ptc %f\n", k, ptc[k]);
};
// Step 3: renormalize the probabilities to sum to 1.000
renorm = 0.0;
for (k = 0; k < maxhash; k++)
renorm = renorm + ptc[k];
for (k = 0; k < maxhash; k++)
{
ptc[k] = (ptc[k] / renorm) + 1000 * DBL_MIN ;
if (internal_trace)
fprintf (stderr, "class %ld RENORM ptc %f\n", k, ptc[k]);
};
// Now we're in probability space in the ptc array. Away we go!
tprob = 0.0;
for (k = 0; k < succhash; k++)
tprob = tprob + ptc[k];
if (svlen > 0)
{
char buf[1024];
double accumulator;
double remainder;
double overall_pR;
long m;
buf [0] = '\000';
accumulator = 1000 * DBL_MIN;
for (m = 0; m < succhash; m++)
{
accumulator = accumulator + ptc[m];
};
remainder = 1000 * DBL_MIN;
for (m = succhash; m < maxhash; m++)
if (bestseen != m)
{
remainder = remainder + ptc[m];
};
overall_pR = (log10 (accumulator) - log10 (remainder));
// note also that strcat _accumulates_ in stext.
// There would be a possible buffer overflow except that _we_ control
// what gets written here. So it's no biggie.
if (tprob > 0.5000)
{
sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR );
}
else
{
sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR );
};
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
bestseen = 0;
for (k = 0; k < maxhash; k++)
if (ptc[k] > ptc[bestseen] ) bestseen = k;
remainder = 1000 * DBL_MIN;
for (m = 0; m < maxhash; m++)
if (bestseen != m)
{
remainder = remainder + ptc[m];
};
sprintf (buf, "Best match to file #%ld (%s) " \
"prob: %6.4f pR: %6.4f \n",
bestseen,
hashname[bestseen],
ptc[bestseen],
(log10(ptc[bestseen]) - log10(remainder)));
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
sprintf (buf, "Total features in input file: %ld\n", totalfeatures);
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
for (k = 0; k < maxhash; k++)
{
long m;
float pctused;
remainder = 1000 * DBL_MIN;
for (m = 0; m < maxhash; m++)
if (k != m)
{
remainder = remainder + ptc[m];
};
nodes = nodestarts[k];
// percent usage- hack due to "0" being a sentinel for no space
pctused = nodes[0].abet[0].nextcell;
if (nodes[0].abet[0].nextcell == 0) pctused = nodelens[k];
pctused = 100.0 * pctused / nodelens[k];
sprintf (buf,
"#%ld (%s):" \
" features: %lld (%.0f%%), entropy: %8f, jumps: %ld, prob: %3.2e, pR: %6.2f \n",
k,
hashname[k],
// nodes[0].total_count,
*totalbits[k],
pctused,
total_entropy[k],
firjumps[k],
ptc[k],
(log10 (ptc[k]) - log10 (remainder) ) );
// strcat (stext, buf);
if (strlen(stext)+strlen(buf) <= stext_maxlen)
strcat (stext, buf);
};
// check here if we got enough room in stext to stuff everything
// perhaps we'd better rise a nonfatalerror, instead of just
// whining on stderr
if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0)
{
nonfatalerror5( "WARNING: not enough room in the buffer to create "
"the statistics text. Perhaps you could try bigger "
"values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?",
" ", CRM_ENGINE_HERE);
};
crm_destructive_alter_nvariable (svrbl, svlen,
stext, strlen (stext));
};
};
// cleanup time!
// remember to let go of the fd's and mmaps
for (k = 0; k < maxhash; k++)
{
// close (hfds [k]);
crm_munmap_file ((void *) fmaps[k]);
};
//
// Free the hashnames, to avoid a memory leak.
//
for (i = 0; i < maxhash; i++) {
///////////////////////////////////////
// ! XXX SPAMNIX HACK!
//! -- by Barry Jaspan
//
//! Without the statement "k = i" (which should have no effect),
//! the for statement crashes on MacOS X when compiled with gcc
//! -O3. I've examined the pointers being freed, and they appear
//! valid. I've run this under Purify on Windows, valgrind on
//! Linux, and efence on MacOS X; none report a problem here
//! (though valgrind reports umrs in the VHT code; see my post to
//! crm114-developers). I've also examined the assembler produced
//! with various changes here and, though I don't speak PPC, w/o
//! the k = i it is qualitatively different.
//!
//! For now, I'm concluding it is an optimizer bug, and fixing it
//! with the "k = i" statement. This occurs on MacOS X 10.2 with
//! Apple Computer, Inc. GCC version 1175, based on gcc version
//! 3.1 20020420 (prerelease).
//
k = i;
free (hashname[i]);
}
if (tprob <= 0.5000)
{
if (user_trace)
fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n");
// and do what we do for a FAIL here
csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
return (0);
};
//
// all done... if we got here, we should just continue execution
if (user_trace)
fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
return (0);
};
syntax highlighted by Code2HTML, v. 0.9.1