// crm_bit_entropy.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2006 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; // The following mumbo-jumbo needed for BSD to compile cleanly, because // BSD's logl function is not defined in all builds! What a crock! #ifdef logl #define crm_logl(x) logl((x)) #else #define crm_logl(x) log((x)) #endif #ifndef sqrtf #define sqrtf(x) sqrt((x)) #endif // End BSD crapola. ////////////////////////////////////////////////////////////////// // // Entropic Classification Basics // // This is an implementation of an entropic bitwise lattice Markov // classifier. This is very roughly based on some ideas in the DMC // (Dynamic Markov Compressor) data compression technique of Gordon // Cormack et al and inspired by Andrej Bratko's use of it in spam // filtering ( Gordon originally proposed DMC for compression and // then Andrej used it for spam filtering. Matthew Young-Lai // then proposed a technique for merging nodes which is limited // to re-merging the "cloned" nodes in Gordon's work. Our merging // method here has no such limitation.) // // This particular implementation is entirely separate, similar in // tht it uses a Markov chain, but does not use cloning. Rather it // uses cross- linking, in a minimum-entropy format, and supports // "open-ended" Markov chains, rather than the toroidal lattices // required in Cormack's DMC. // // For those of you into references as "closest prior", see: // 1) Cormack, Gordon V. et al, Data Compression using Dynamic Markov // Modeling (Computer Journal #6, 1987 // 2) Bratko, Andrej, Markov Modeling for Spam Filtering (?) TREC // 2005 // 3) Matthew Young-Lai, Adding state merging to the DMC data // compression algorithm, Information Processing Letters 70 // (1999) 223–228. // // // The basic concept is to construct a Markov chain that describes // the sequential bit pattern of a message, and then use that chain // to compress the incoming unknown text; the smaller the entropy // the better the quality of match. Yes, this sounds like // a horrible idea, but Andrej Bratko at TREC 2005 has shown that // it does work. One advantage is that it can work on files that // have no obvious tokenizations, like JPGs. // // Like every other Markov chain, there's a start node, and then // for each observed 1 or 0 there's an observed freqency of 1 or 0, // and the associated next node for a 1 or 0. The quality of match // is just the sums of the entropy (*) of each path taken. // // More correctly, entropy is the weighted sum for an ensemble of // signals, and is -Psig * log2 (Psig), and the sum over an // ensemble of possible texts yields the entropy which is the // average bit rate. Whichever Markov chain shows less entropy // is the one that matches best. However, in this case we're // only given one example, which is the text in consideration, // and so the probability of each transition is 1.0000 exactly // (as we have only a single sample of the unknown). Thus, we // actually sum the bits required; that is, log2 (Psig). // // Nota Bene 1: in order to make this code a little bit more // general, we have each node accept an alphabet of N different // symbols, where N defaults to 2 (that is, bitwise) but we'll // write the code to be more general than that; it's a compile-time // change rather than a hack-the-code change. States *are* always // numbered starting at zero, though, so consider the alphabet to // always be an ENUM. // // Nota Bene 2: If you happen to be familiar with the other CRM114 // classifiers, note that this classifier does NOT use the tokenizer // regex system, nor the SBPH or the OSB feature set. Instead, // all that state information is captured by where you are in the // Markov chain, and the concept of "token" is replaced by the // concept of "alphabet". By default, right now the "alphabet" // is single bits. // // Nota Bene 3: As currently designed, the learning algorithm // defaults to initialize with a toroidal perfect shuffle- thus the // default model always takes constant memory and you can't run off // the "edge" of the model. We still need to fix it so that // "crosslink" will allow a jump to another more appropriate (lower // entropy) part of the braid; that's not there yet. // // Nota Bene 4: As the toroidal shuffle didn't give extremely good // results, there's also a "unique state" classifier, set by the // flag < unique >. This generates a *loopless* chain where each // text learned produces a single linear chain; this means we // follow whatever model has already had the same bit pattern, but // whenever we diverge, we start allocating new bit nodes // immediately. This works decently but uses up a lot of nodespace // quickly. It also means that you can run off the "edge" of the // model because it's not closed like the toroidal shuffle. By // default, running off the edge causes a restart at the START node // (node 1, as it turns out; node 0 is a // bookkeeping/anchor/sentinel node). This works, but doesn't give // very high accuracy as the reSTARTs are not time-aligned with the // model; they might even occur in the middle of a byte. // // Nota Bene 5: In the quest for better accuracy, instead of doing // a reSTART when we're classifying and fall off the edge of a // 'unique' model, we branch to a node that has a very similar // prior history. How this is done is seen below but it boils down // to finding a node that already exists that has had a very // similar prior history bit pattern. This can actually be done in // near-constant (and pretty small!) time; the magic to do this is // all described below. // // Nota Bene 6: In the quest for smaller (and faster!) models and // the ability to not run out of nodes, we have the // flag, which allows the LEARNing process to do the same branching // that is done when we run off the end of the model chain during // classification. More specifically, before allocating a new // node, we look for the closest prior history in already-allocated // nodes, and if it's "close enough" (as set by a threshold), we // make that the next node in our chain. This produces Markov // models with both acyclic joins and repeating loops, and // effectively prevents the model from growing without bound, as // the model reuses nodes quite effectively (sometimes more than a // 300:1 reuse!). It also produces a model that is quite accurate // when trained with SSTTT. Although we can argue that ROCAC isn't // the best possible figure of merit, SSTTT can yield very good // ROCAC values (0.03 per cent); the terminal error rate on the // TREC06 public corpus can be as low as 3 errors per 10,000 (that // is, three errors in the final ten thousand messages, of the // 92Kmsgs corpus.) // /////////////////////////////////////////////////////////////////// // // Algorithm for Classification: // // Follow the Markov chain from START with total entropy of 0 to // begin, then walk the Markov chain taking each branch as // instructed by the successive bits of incoming text. Along the // path, accumulate the compressed size as defined by - log2 (Pt) // for the current transition. Here, we define probability for a // transition Pt = Nt + c / (Nt + N~t + 2c). Note that for c = 0 // this can yield probabilities == 0.00 exactly, which will yield // an infinite entropy (log of zero is... not a good thing). // Gordon Cormack's paper indicates a "small" value of c is good // for shorter texts; unfortunately there's no mention of what // "small" means. // // If you run off the end of the chain (some learning algorithms // generate terminating chains, others always generate loops) then // do one of : (1) stop accumulating entropy; you can't go any // further; (2) go back to the START node and keep accumulating, or // (3) jump to the node with the closest prior history and keep // accumulating. // // Do this on each bitwise markov file; the file which accumulates // the lowest compressed size is the best matching file. // /////////////////////////////////////////////////////////////////// // // Algorithm for Learning - Overview // // Before you learn in the first text, initialize the data // structure with a START node with NULL exit nodes for each of the // possible exits. If you want, you can put something bigger or // more complex in, but it's not necessary. If you're running with // constant storage, allocate the rest of the storage, preformat // them as a linked list of free nodes, and off you go. // // Follow the chain, incrementing "seen" counts as you go. If // you're learning a file, and you end up at a node exit that's // a NULL (meaning no node is yet linked in for the current bit // pattern) then grab a node off the free list (or malloc one). // See below under "advanced methods" for how to get more // nodes by merging very similar nodes together. // // Whenever a node is seen with both a sufficiently large number of // seens, and a sufficiently balanced number of differing symbols, // "clone" the node so that the Markov chain branches on the // previous node. This is to split the chain where the current // state is insufficient to model the underlying process. This // only can work if the Markov model somehow contains a merge; // otherwise each node can only be reached from a single other node // (and so on, all the way back to the START node). // // In Cormack's version, when you run out of free nodes, you must // reinitialize the storage and rebuild the model (using only the // last part of the corpus data), but there are ways to merge // similar nodes (see "advanced methods" below) to get more free // nodes without throwing away data. // // Note - there is a "classical" method for building a hidden // Markov model, but it's O(n^2) in the number of nodes in the // output model! It basically uses hillclimbing techniques to form // the convex hull and then climbing to the local top of the hull // repeatedly. That's fine for small HMMs but here, when we may // have a million nodes, that's way too slow. // // Method 1 - for both initial-loop or terminated Markov chains // // Follow the Markov chain from START, incrementing the seen-0 // and seen-1 bits of the Markov chain. When you hit a 0 or 1 // choice pointing to NULL, grab an unused node off of the free // list. If the free list is empty, use an advanced method to // get more. // //////////////////////////////////////////////////////////////////// // // Advanced Method - Cloning a Node // // If a node has more than one infeeder, there is always the // possibility that the node really should be two nodes. This // could happen either because of a prior merge, a prior clone, or // because the initial node setup had a loop in it (which it may or // may not; it can work both ways.) // // A node should be cloned when there is a sufficient number of // incoming events to be statistically significant, and the // outgoing events are sufficiently balanced (i.e. the current node // is of sufficiently low predictive value - that is, it's high // entropy no matter what you do ) to lend credence to the idea // that this one node really represents two states and splitting // the node is a good idea. // // Of course, there's no reason (or way!) to clone a node that has // only one infeeder even if the node state itself has low // predictive power; in this case what's really happening is that // the real corpus' hidden Markov model is diverging at a prior // node. There simply isn't any predictive state available. // // Topic for further work - given only a single infeeder, maybe we // should back propagate the "split" up the chain of single priors // until a state is found with at least two infeeders, and clone // *that* node and all successor nodes down to the current node. // Note that if we hit the START node we don't do this as there's // no way to clone START. // // The simple and obvious way to clone a node of low predictive // power is to keep track of the previous node while doing a // training traverse, and only clone off the path from the previous // node. This means creating a new node with the same successor // nodes as the current node (or not; it depends on whether the // current node has a successor or is "end of the line"), and has // only a single infeeder, which we know is at best a single line // of prior states of arbitrary length, and at worst a series of // nodes that all looked similar at some time in the past. // ////////////////////////////////////////////////////////////////// // // Advanced methods - getting free nodes by merging nodes // // NOTA BENE: the code to implement merging is not debugged because the // open-graph training with crosslinking works so well. Consider // this stuff more as "design notes for the future". // // // merging nodes - in order to represent loops in the markov model, // it's sometimes necessary to loop back into prior Markov states. // Finding the appropriate prior state can be a hassle. Cormack and // Lai both postulate a "way" to find the appropriate prior state, // but don't describe an efficient way to do so. // // Here's a shortcut, using arithmetic to get an infinite impulse // response out of a floating point number; the already established // node with the closest floating point number to the current // generated floating point number is the "closest node". This is // a different version of "arithmetical coding", but it can be // lossy (in a controlled way). See the Guazzo coding method in // Cormack's paper - note that we do NOT use Guazzo coding for // generating output but rather just for quickly finding the node // whth the closest prior state for us to jump to when we run // off the edge of a model. // // Clearly, one could weight the prior bit by a factor of 1/2 // (really 1/(2^1)), and weight the prior float by 1 - (1/2) in // which case the prior float's mantissa is just the last N bits, // in reverse order. Unfortunately, this also gaurantees that // prior history utterly disappears as soon as it rolls out of the // number of bits in the matissa of the local floating point // operation. // // More interesting is larger values of the exponent; for example a // value of 2 yields a half-bit fractional representation, so an // error of 1 bit seen N steps in the past is overrideable by two // correct bits at N+1 and N+2 bits in the past. For example, the // with an exponent of 2, the most recent bit has significance // 1/4 = 0.250, while the second-most-recent is 3/4 * 1/4 = 3/16 = // .1875, and the bit before that is 3/4 * 3/16 = 9/64 = .04687, // and so on. // // Higher exponents just stretch out this series even further. // There is no need to assume only an integer exponent. Indeed, // it's only because of the finite precision of the computer that // this a finite impulse response (FIR) filter; infinite precision // would give infinite impulse response and "total recall" of prior // states in the Markov process - but also the ability for some // long-ago series of perfectly correct steps to blow away the more // recent entropy. For this reason, we usually use an exponent of // 1, which yields a prior bit weight of 1/2 (whch is just the // floating-point mantissa as described above). (NB: as of 20070101, // the "floating point" has been replaced by a "long long" with 64 bits // of precision. This uses less space) // // Values of the exponent less than one yield a situation where the // range of possible outcome FIR numbers becomes discontinuous; // that is, significant chunks of the range become unreachable by // any possible prior bit series. Consider the FIR value 0.5000 // with exponent of 0.5. The most recent bit then has weighting of // 0.70711, with only .29289 of weight available for all following // bits. The maximum value achieveable for a pattern 0111111.. is // then .29289, while the minimum value achieveable for pattern // 100000... is 0.70711, so the range of possible FIR numbers does // not contain any value between .29289 and .70711. Recursively, // similar subranges cease to be accessible as well. // // The exponent for "exact ambivalence" between an error in state N // versus two errors in state N-1 and N-2 is around 1.385; the // exponent for ambivalence between an error in state N versus // errors in states N-1, N-2, and N-3 is about 1.13 . Note that both of // these numbers yield rather short "memories" in the FIR filter // before the Nth prior bit underflows and is lost completely. // // By using the FIR coding of prior state, we need only keep a // table of the FIR path floating point number generated on the // path that first generated this node (which can be sorted, // yielding a _very_ fast node merging capability; just make one // pass through the sorted table looking for the two closest // floating point numbers.) Those close pairings of nodes are good // merge candidates. // // It's probably worth checking these merge candidates to see if // they have similar local probabilities - that is, if they have similar // 0-counts and 1-counts. If one node goes 0 mostly and the other // goes 1 mostly, then there's a good chance that this isn't a good // merge. How to weight FIR prior history match versus the local // outbound probability is a good question. // // Another merge candidate is to look at nodes which have identical // 0 or 1 destinations. However, finding this requires either an // extensible list of back pointers or brute-force search (albeit // over a very small subset of the entire Marov Chain). // // If we're running in mode (that is, nodes are allocated // on each unique path, and no node has ancestors with any // possibility of significantly deviating FIR values, We can also // back-calculate the approximate "feeder" node FIR values that // could lead to a particular node, given the decreasing exponent. // For example, if the current node's FIR is .3700000, then it can // only have been reached by a 0-branch from a node whose prior was // (2 * .3700000 = .7400000 ) plus or minus any crosslink-error // threshold (which is typically under 1 in a million or less). // So, we can just use our FIR lookaside table to find all possible // ancestors to this node. // // When merging two nodes, we (speaking in the 0/1 tense; larger // alphabets are an obvious extension) we perform the following steps: // // * update the to-be-kept node with the sums of the outbound 0-seen // and 1-seen counts. Without any loss, we can choose the // to-be-kept node is the node with the lower index number. // * we keep whichever output chain had the highest counts of each // of the 0-seen and 1-seen categories // * we set the FIR prior history to the weighted average of the // two prior weighted averages // * for the two output nodes which were _not_ kept (a 0-chain and // a 1-chain) mark these chains for eligibility for garbage // if we choose active collection; // * put the merged-away node on the free list. // * make an update pass thorugh the nodeset; any node outlet pointing to // the merged-away node is reset to point to the to-be-kept node. // // We may choose to merge more than one node when we encounter // an empty free list. Each merge gives us one node for sure, plus // the possibility of many others. // // At some point, a garbage collection is necessary to reclaim nodes // that no longer have a prior node. We can do this either by: // * reference counts - each node can keep an accurate reference count // of nodes that can transfer to this node. Since there are only // three events that can cause the reference count to change (those // being "allocate from free list == refcount = 1", "clone node // refcount = split incomings, and "merge with another node // == refcount1 + refcount2" then the refcounts can be maintained at // relatively low cost. // * we can run a "classic" mark and sweep. However, there's a // better way... // * we could add back pointers in each node. Unfortunately, that // means the size of a node could grow rather large. There is // still a better way.... // * We could add chaining pointers on the _output_ of each seen // slot. The "to" node gets a single slot added, to point to // the first node that can transfer in; each such node points // to the next node that can also transfer to the target node. // Preferably, the insertions are done in order so the list // remains sorted. // // When learning a new text, we also compute the running FIR prior // floating point value, and on each node we encounter in our path, // we update the local prior state FIR value according to the // weighted average of the new text (with weight 1) and the prior // texts (with weight = total times this node has been visited in // the past = sum of the 0-seen and 1-seen output values ). For // this reason, node FIR prior encodings may change so it's necessary // for the sorted FIR vector to contain not just the FIR values, but // also backpointers to the FIR nodes that held them. // // Improvement: rather than a separate sorted list of FIR values, // embed the FIR value in the cell, along with the "next less" // and "next greater" indices. Thus, each cell contains within it // pointers to it's most likely merge candidates. // // Secondary improvement - when allocating a new node from the // free list, how does one efficiently link the new node into the // next-less and next-greater lists? Sequential search clearly // works, but is inefficient. Clearly a binary (or near-binary) // tree would assist this but it's a programming hassle to // maintain the two data structures in synchrony. // // Alternatively, a fixed-size table of nodes at or close to // various "thresholds" can be maintained; when a node changes // it's FIR value, it only takes constant time to round the FIR to // a table entry value and check to see if the new value is closer // to the "perfect" value of FIR for that table entry versus the // node that is currently in that slot. If it's better (not just // as good, "better", to minimize thrashing) then the new node // index is written into the table; otherwise the old node stays. // // Note that nothing prevents a node from being in the table more // than once, nor is there any correctness "need" for the table to // be exactly right. Instead, the lookaside table is just a // speedup that helps a lot most of the time. When a node changes // value, it will move (some) so incorrectness is part of the // game. If you really want, you can move the node to a new // lookaside table slot, and then look at the previous next-less // and next-greater nodes, and choose one of them to go into // the table, replacing the (now moved) node. (NB: in implementation, // it turns out that the FIR lookaside table is *critical* for // reasonable performance; any bug in that code that causes // weak inital guesses will slow the classifier down by a factor // of 10 or more. // //////////////////////////////////////////////////////////////////////// // // Futher Hacks and Improvements: resuming a CLASSIFY past NULL. // // If the classifier runs off the end of the known Markov chains, // what do we do? Just drop the following text? Possibly - but // there is an alternative. // // We can use the FIR arithmetical coding to resume the Markov // chain at a near-optimal position. The difference between where // we are and where we resume should be as small as possible; // fortunately the FIR-next-least and FIR-next-greater indices in the // current node give the two nearest side-steps at the current Markov // node; we can then explore up and down the FIR chain to find a // preexisting node that actually has a further non-NULL chain. // The entropy increase due to this side-step outside the Markov // chain is not entirely computable from the FIR values, because // the prior information that's already rolled off the least significant // bit of the FIR calculation. // // (note: this is currently the default action) //////////////////////////////////////////////////////////////////// // // IMPLEMENTATION AND STORAGE DETAILS // // Data storage format for bitwise Markov classifiers. Note that // we always use indices rather than pointers as we want the data // structure to be stored and reloaded from disk via mmap. // // We have several "global static" values, which are stored in // the meta-start node which is always node 0: // // * the index of the START node (should always be 1) - stored // in alphabet-zero slot. // // * the index of the node with the lowest FIR value - node 0 is // * always "lower than the lowest" // // * the index of the node with the highest FIR value - node 0 is // * also always the node "higher than the highest". // // If we have a FIR lookaside speedup table, that also needs to // have space allocated. // // Each normal node contains a header containing: // // * the floating-point FIR value that codes the prior states // // * the index to the node with the next larger FIR value // // * the index to the node with the next smallest FIR value // // These FIR prior values are updated whenever a node is // visited during learning. This may mean minor motion up and // down the FIR chain. // // Each node also contains BITMARKOV_ALPHABET_SIZE slots; each slot // contains the following (so there's one such slot for each // member of the alphabet A). There are two kinds of data stored in // each slot; they are related but not interdependent. The forward // markov slot data is used during classification; the backward slots // are used only during maintenance. // // Forward Markov slots: // // * how many times this alphabet member was seen as the input at // this node. // // * when it's seen, what the index of the next node to transition to // is (index, not pointer, for position-independent code) // // Backward slots for maintenance (clone and merge) // // * index of the first incoming node that can transfer to this // node when that prior node saw this member of the alphabet A // (note that this is NOT the alphabet member that the current // node is seeing. This just gives an array of // backlinks for each member of the alphabet). NULL if there's // no known encounters of an incoming with the most recent prior // for this alphabet member. // // * index iof the _next_ incoming node that can transfer to the // same outgoing node as this node. In combination with the // prior index above, this forms a linked list of all of the // nodes that can reach a particular node. As we don't need // to traverse this list in any particular order, it's strictly // a "stack" (newest added at the front). // // // Using 64-bit storage for the floating-point FIR values and // for the visit counts, and 32-bit storage for all indices, then // assuming we're using a binary alphabet, we have: // // Per-cell // FIR prior float = 8 bytes // FIR up and down links - 2 * 4 bytes = 8 bytes // Slot within cell // forward count - 8 bytes * 2 = 16 bytes // forward index - 4 bytes * 2 = 8 bytes // backward first - 4 bytes * 2 = 8 bytes // backward next - 4 bytes * 2 = 8 bytes // // Total - 56 bytes storage per nodem, 18K nodes per megabyte. // Note that the absolute minimum size is 24 bytes/node with no // advanced features, so we're about a factor of two worse in // space, and for that we get near-linear in size for both learning // and classification. // // If we add in a 0.1% overhead size for the FIR prior lookaside // accelleration table (containing 1 index per 4-byte word) then // for every 4 bytes (1 index) of FIR prior lookaside there are // 4Kbytes (about 73) FIR value cells, so with the FIR prior // lookaside and an assumption of roughly uniform FIR prior values, // we need to traverse an average of 1/4 of 73 or about 18 markov // chain cells. With a 1.0% overhead (1 byte per 100 bytes, or 1 // index slot per 400 bytes) then there is 1 index slot per 7 // markov chain cells, and (again assuming a uniform distribution // of FIR priors) an average of just 1.8 markov chain cells visited // per FIR lookup. // // If the distribution of FIR values is too nonuniform, an adaptive // grid method may be used to smooth it out. But we'll climb that // mountain when we come to it. // // Note to self: don't use the mathlib LOG function! // It's far too slow. We'll custom craft our own that uses // linear interpolation over a lookup table. // /////////////////////////////////////////////////////////////////// // // An interesting information-theoretic philosophical point - if // we only have a million nodes in the database, then at most we // encode 20 bits worth of state. A 500-megabyte file like Andrej // uses would have at most 62 million nodes (12 bytes/node not // with short counts), or 41 million nodes = 27 bits to define // state. Since the FIR-prior value encodes at least 64 bits, can // go to 96 bits (GCC 4.0 "long double") and possibly 128 (if your // compiler supports "-mlong-double-128") shouldn't we be sticking // with the FIR-prior and simply *ignoring* the actual links, as // they (and the state nodes) encode far less information? // // Submitted as an "ultimate" pare-down - the memory contains // ONLY the times-previously-seen information. It's a big array, // indexed by the most significant N bits of the prior, with the // high-order bit being the transition now being executed. All // that's there in the array is the number of times this transition // has been executed in LEARN mode; everything else is embedded // in the state (which _is_ the index). // // That optimization would concieveably allow one to write an // embedded form of bit-entropic classification in as little as // two to four megabytes of space per class; in short, something // that would fit on an embedded system. // ////////////////////////////////////////////////////////////////// // // File Structure and Layout for Bit Entropy // // The file starts with a header block, of size 4 kybtes; the first // four longwords give the indexes (in longwords, for 32-bit alignment) // of the start and length of the FIR lookaside table and the node list. // ///////////////////////////////////////////////////////////////// // // **** NOTE - THIS IS ALL UNNECESSARY AND DEPRECATED IN THE CURRENT CODE! // // Per-node memory layout // // We have three pointers in the node slot; each of these is replicated // for each member of the alphabet in the Markov graph. // * The .nextcell says what the next cell is when we see this alph // * The .backfirst cell says what the first cell is that could // get us here when it saw an alph // * The .samenext cell says what the next cell that also gets you // you to the same nextcell when you see the same alph. // // Yes, this is slightly redundant, but it's very fast. (but note the // approximation above that would save us all this memory!) // // A diagram here helps to understand the slotting structure // // node // .nextcell >------------------->--> node // .backfirst <---\ />------->/ .nextcell >----------> // .samenext -| \---------------------< .backfirst // | / // node <-| / // .nextcell >---/ // // // Note- "backfirst" and "samenext" are redundant given the FIRlat // system; we can always find referrers to the current node by // looking at the possible origins - a node of FIR value 0.260 can // only come from the alpha link of a node of FIR value 0.520 ; a // node of value 0.779 can only have come from the 1-link of a FIR // of .558 so we can just search the FIR around there. // // // // Helper function to calculate entropy from a probability // // We speed this up with a lookup table for relatively small numbers #define ENT_CACHE_MAXCOUNT 256 #define ENT_CACHE_MAXTOTAL 256 static double stats_2_entropy (long count, long total) { static long init = 0; static double loglookups[ENT_CACHE_MAXCOUNT][ENT_CACHE_MAXTOTAL]; double value; // if no prior information, this is 1 bit exactly of data. if (total < 0.5) return (1.00); if (count > total) return (0.0); // value = ( - (crm_logl ( // (count + BIT_ENTROPIC_PROBABILITY_NERF) // / (total + BIT_ENTROPIC_PROBABILITY_NERF))) // / 0.69314718 ); // return (value); // Do we need to initialize the cache to "no data", which we // code for as a -1 as the entropy. if (init == 0) { long i, j; init = 1; for (i = 0; i < ENT_CACHE_MAXCOUNT; i++) for (j = 0; j < ENT_CACHE_MAXTOTAL; j++) loglookups[i][j] = -1; } // See if we've already cached this log probability value? if ( count < ENT_CACHE_MAXCOUNT && total < ENT_CACHE_MAXTOTAL) { value = loglookups[count][total]; if (value > 0.0) return value; } // Nope. Nothing in the cache. Calculate it and cache it. // // "correct" entropy is factored by the unknown prior: // value = ( - ( ( count + BIT_ENTROPIC_PROBABILITY_NERF) // / ( total + BIT_ENTROPIC_PROBABILITY_NERF)) // * (crm_logl ( // (count + BIT_ENTROPIC_PROBABILITY_NERF) // / (total + BIT_ENTROPIC_PROBABILITY_NERF))) // / 0.69314718 ); // But here, we know the event has come to pass and so the // prior is 1.0 (the event itself is a certainty at this point; // we are now just counting bits needed to encode it ! ). value = ( - crm_logl ( (count + BIT_ENTROPIC_PROBABILITY_NERF) / (total + BIT_ENTROPIC_PROBABILITY_NERF))) / 0.69314718 ; // if it fits, put it in the cache if ( count < ENT_CACHE_MAXCOUNT && total < ENT_CACHE_MAXTOTAL) loglookups[count][total] = value; // and we're done. return (value); } // Helper functions for dealing with the FIRlat // Given a FIR value, what slot would this reside in? static long fir_2_slot (double fir, long firlatlen) { long ifir; ifir = (fir * (firlatlen - 1.00)); if (ifir < 0) ifir = 0; if (ifir >= firlatlen) ifir = firlatlen - 1; return (ifir); } static double slot_2_fir (long slot, long firlatlen) { double outval; outval = (slot + 0.5) / (firlatlen - 1); if (outval > 1.00) outval = 1.00; if (outval < 0.0) outval = 0.0; return (outval); } // Dump just the significant entries in the FIRlat static void firlat_significant (ENTROPY_FEATUREBUCKET_STRUCT *nodes, long *firlat, long firlatlen) { long i; fprintf (stderr, "**** FIRLAT significant scan *****\n"); fprintf (stderr, "root node low %ld high %ld\n", nodes[0].fir_smaller, nodes[0].fir_larger); for (i = 0; i < firlatlen; i++) { if (firlat[i] > 0) { fprintf (stderr, "FIRLAT slot %ld pval %f node %ld fv %f down %ld up %ld\n", i, slot_2_fir(i, firlatlen), firlat[i], nodes[firlat[i]].fir_prior, nodes[firlat[i]].fir_smaller, nodes[firlat[i]].fir_larger); }; }; } static void firlat_sanity_scan (long *firlat, long firlatlen, ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen) { long i; long stepcounter; // Scan the FIRLat for errors (optional) // for (i = 0; i < firlatlen; i++) { if (firlat[i] > nodeslen || firlat[i] < 0 ) fprintf (stderr, "Internal FIRLAT error: slot %ld claims OOB node %ld\n", i, firlat[i]); }; for (i = 0; i < nodeslen; i++) { if ( nodes[i].fir_smaller < -1 || nodes[i].fir_larger < -1 || nodes[i].fir_smaller > nodeslen || nodes[i].fir_larger > nodeslen ) fprintf (stderr, "Internal FIRchain error at node %ld (%f) smaller: %ld larger %ld\n", i, nodes[i].fir_prior, nodes[i].fir_smaller, nodes[i].fir_larger); }; // // Now verify the FIRLAT chain integrity upward // stepcounter = 0; i = nodes[0].fir_larger; while (stepcounter < nodeslen + 1 && nodes[i].fir_larger > 0 && nodes[i].fir_larger < nodeslen) { stepcounter++; i = nodes[i].fir_larger; }; if (stepcounter > nodeslen+1) fprintf (stderr, "ERROR: the FIR chain is figure-6ed upward\n"); if (nodes[i].fir_larger != 0) fprintf (stderr, "ERROR: the FIR chain goes off to node %ld\n", nodes[i].fir_larger); // and again, check chain integrity downward stepcounter = 0; i = nodes[0].fir_smaller; while (stepcounter < nodeslen + 1 && nodes[i].fir_smaller > 0 && nodes[i].fir_smaller < nodeslen) { stepcounter++; i = nodes[i].fir_smaller; }; if (stepcounter > nodeslen+1) fprintf (stderr, "ERROR: the FIR chain is figure-6ed downward\n"); if (nodes[i].fir_smaller != 0) fprintf (stderr, "ERROR: the FIR chain goes off to node %ld\n", nodes[i].fir_smaller); } // // Helper function to find the smallest node that's larger than // the called FIR. // // Originally both this routine and find_closest returned exact // answers- but a bug caused the hash-table to find exact entry // points to overload buckets representing fairly common similar // states, and so we'd end up with exponentially increasing runtime // during the sequential search part of the run (measured at over // 2000 steps in sequential search average, when the average number // of steps should have been 5 (five). // // The intermediate fix is to only call this (exact) function when // you need to move a node - that is, during training. Otherwise, // "close enough is close enough", and use the "close" call down // below. But the _real_ bug has now been fixed, and the "close" // routine is now just an efficiency firewall. It never should // have been; but hey... it's here. // static long firlat_find_smallest_larger ( ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen, long *firlat, long firlatlen, double my_fir) { long hit_node; long firlat_entry; long step_count; // Start out with FIRlat option. if (my_fir < 0.00 || my_fir >= 1.00 || firlatlen < 10 || firlatlen > 200000) fprintf (stderr, "My FIR is outrageous (value: %e, firlatlen %ld)\n", my_fir, firlatlen); firlat_entry = fir_2_slot ( my_fir, firlatlen); if (internal_trace) fprintf (stderr, "FFSL: Searching for FIR: %f slot %ld \n", my_fir, firlat_entry); if ((firlat_entry < 0) || (firlat_entry >= firlatlen)) fprintf (stderr, "FIRLAT is very hosed (myfir: %e, entry number %ld)!\n", my_fir, firlat_entry); if (internal_trace) fprintf (stderr, "FFSL search: FIR: %f, slot: %ld node: %ld, smaller: %ld, larger:%ld\n", (float) my_fir, firlat_entry, firlat[firlat_entry], nodes[firlat[firlat_entry]].fir_smaller, nodes[firlat[firlat_entry]].fir_larger ); if (firlat[firlat_entry] > nodeslen || nodes[firlat[firlat_entry]].fir_smaller < 0 || nodes[firlat[firlat_entry]].fir_larger < 0 ) fprintf (stderr, "Internal FIR chain error at slot %ld node %ld (%f)\n", firlat_entry, firlat[firlat_entry], my_fir); // Move down in the firlat till we find a live node. while ( firlat_entry > 0 && firlat[firlat_entry] <= 0) { // fprintf (stderr, // "firlat_entry: %ld points to node %ld so stepdown \n", // firlat_entry, firlat[firlat_entry]); firlat_entry--; if (internal_trace) fprintf (stderr, "_"); } // Do a little metering to see if we're getting firlat entries? //if (firlat_entry == initial_firlat_entry) // { // fprintf (stderr, "`"); // } //else // { // fprintf (stderr, " %ld", firlat_entry - initial_firlat_entry); // }; // Stupidity check - are we running down off the end of the FIRLAT? if (firlat_entry == 0) { // Yes. Resort to sequential search hit_node = nodes[0].fir_larger; if (internal_trace) fprintf (stderr, "FFSL: Need to start from rootptr- next node is %ld ( %f) for fir %f \n", hit_node, nodes[hit_node].fir_prior, my_fir); } else { // This non-null entry is where we start to look. hit_node = firlat[firlat_entry]; if (internal_trace) fprintf (stderr, "FFSL: non-null ENTRY at slot %ld node: %ld prior: %f down %ld up %ld\n", firlat_entry, hit_node, nodes[hit_node].fir_prior, nodes[hit_node].fir_smaller, nodes[hit_node].fir_larger); }; // Now we're on the closest "less than" firlat entry (which, if we // had a firlat hit, may still be above our desired place. // Part 1 - find a place where we are clearly smaller than // the node above us. step_count = 0; while ( hit_node > 0 && my_fir > nodes[hit_node].fir_prior ) { hit_node = nodes[hit_node].fir_larger; step_count++; if (internal_trace) fprintf (stderr, "+"); }; // Part 2 - assure that (because of FIRlat issues) we are not "above" // yet other nodes that we are smaller than. while ( hit_node > 0 && my_fir < nodes[hit_node].fir_prior ) { hit_node = nodes[hit_node].fir_smaller; step_count++; if (internal_trace) fprintf (stderr, "-"); }; // and again, try going up one last time. while ( hit_node > 0 && my_fir > nodes[hit_node].fir_prior ) { hit_node = nodes[hit_node].fir_larger; step_count++; if (internal_trace) fprintf (stderr, "*"); }; // hit_node is now the smallest node that's larger than us if (internal_trace) fprintf (stderr, "FFSL result node: %ld with FIR %f \n", hit_node, nodes[hit_node].fir_prior); if (internal_trace) fprintf (stderr, "Steps used: %ld\n", step_count); return (hit_node); } static long firlat_find_closest_node ( ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen, long *firlat, long firlatlen, double currentfir) { long larger; long smaller; long closest; double error_larger, error_smaller; #define COMPLEX_FIND_CLOSEST #ifdef COMPLEX_FIND_CLOSEST // Complex find closest code. The idea here is to do // increasing radius searches to find the closest nearby // strings in Hamming distance. To generate this, we first // search at +/- one LSB in the floating point mantissa, and // then successively double it, which sweeps through the // single-bit errors. We can then repeat for double-bit // errors, triple-bit errors, and so on... which leads to a // huge number of possible next states. For IEEE floating // point, the limiting case is an underflowing double-bit // error representation. Of course, the value of this varies // with the local FIR value, but for a FIR value of 1.0 the // double that touches only the lowest-order mantissa of a // double float is hex 0x3bf0000000000000 (approximately // 5.421011E-20). For a FIR of 0.50000, the value is 1/2 // that, and so on. However, this is really not // computationally tractable right now. #endif // return the index of the node that's closest in // FIRLAT value to our current node. // assume larger is closer; larger = firlat_find_smallest_larger (nodes, nodeslen, firlat, firlatlen, currentfir); error_larger = nodes[larger].fir_prior - currentfir; closest = larger; // now make sure that smaller isn't better. smaller = nodes[larger].fir_smaller; error_smaller = currentfir - nodes[smaller].fir_prior; if (error_smaller < error_larger) closest = smaller; return (closest); } // How we remove a node from the FIR chain and firlat static void firlat_remove_node ( ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen, long *firlat, long firlatlen, long curnode) { // cut it out of the chain first long smaller_node, larger_node; long firlat_entry; double perfect_fir; // set up larger and smaller nodes smaller_node = nodes[curnode].fir_smaller; larger_node = nodes[curnode].fir_larger; { if (internal_trace) { fprintf (stderr, "About to FIR chop node %ld (%ld %ld)\n", curnode, smaller_node, larger_node); } if (smaller_node < 0 && larger_node < 0) { // what do we do if the node ain't in the FIR chain - nuthin! // we might be initiated on a shuffle or somesuch. // fprintf (stderr, " BLARG!!!! removing a non-FIRchained node!\n"); } else { // cut the node out of the FIR chain. nodes[smaller_node].fir_larger = larger_node; nodes[larger_node].fir_smaller = smaller_node; } } // Now we're definitely out of the chain. Mark us so.... nodes[curnode].fir_smaller = -1; nodes[curnode].fir_larger = -1; if (internal_trace) fprintf ( stderr, "FIRchain delete %ld (%lf) between %ld (%lf) and %ld (%lf)\n", curnode, nodes[curnode].fir_prior, smaller_node, nodes[curnode].fir_prior, larger_node, nodes[curnode].fir_prior); // Since we're not in the chain (and unfindable that way, we can // now also be removed from the FIRLAT lookaside table. // this is where we think we are - which might not be where we are. if (nodes[curnode].firlat_slot > -1) { firlat_entry = nodes[curnode].firlat_slot; if ( firlat[firlat_entry] == curnode) firlat[firlat_entry] = 0; } // This is the slot where we once were. Fix it up. // firlat_entry = fir_2_slot ( nodes[curnode].fir_prior, firlatlen); if (firlat[firlat_entry] == curnode) { long correct_node; perfect_fir = slot_2_fir ( firlat_entry, firlatlen); correct_node = firlat_find_closest_node ( nodes, nodeslen, firlat, firlatlen, perfect_fir); if (fir_2_slot(nodes[correct_node].fir_prior, firlatlen) == firlat_entry) { firlat[firlat_entry] = correct_node; if (internal_trace) fprintf (stderr, "FIRtable insert node %ld at slot %ld\n", correct_node, firlat_entry); }; }; if (internal_trace) firlat_significant (nodes, firlat, firlatlen); } // insert a node into the FIR chain and firlat static void firlat_insert_node ( ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen, long *firlat, long firlatlen, long curnode) { long smaller_node, larger_node; long current_occupant, firlat_slot; double perfect_fir_value, old_fir_error, new_fir_error; if ( nodes[curnode].fir_smaller > 0 || nodes[curnode].fir_larger > 0) fprintf (stderr, "BLARG !!! We're reinserting node %ld twice!\n", curnode); if (internal_trace) firlat_significant (nodes, firlat, firlatlen); // Get the next larger node larger_node = firlat_find_smallest_larger ( nodes, nodeslen, firlat, firlatlen, nodes[curnode].fir_prior); // and the smaller node smaller_node = nodes[larger_node].fir_smaller; if (internal_trace) fprintf (stderr, "FIRchain insert between node S: %ld (%f) and L: %ld (%f) \n", smaller_node, nodes[smaller_node].fir_prior, larger_node, nodes[larger_node].fir_prior); // Plug ourselves into the fir chain nodes[larger_node].fir_smaller = curnode; nodes[curnode].fir_larger = larger_node; nodes[smaller_node].fir_larger = curnode; nodes[curnode].fir_smaller = smaller_node; if (internal_trace) fprintf ( stderr, "FIRchain done - inserted %ld (%lf) between %ld (%lf) and %ld (%lf)\n", curnode, nodes[curnode].fir_prior, smaller_node, nodes[smaller_node].fir_prior, larger_node, nodes[larger_node].fir_prior); // and put ourselves into the FIRLat if appropriate. firlat_slot = fir_2_slot (nodes[curnode].fir_prior, firlatlen); current_occupant = firlat[firlat_slot ]; if (current_occupant == 0) { // No current occupant in the FIRLAT- put us in. firlat [firlat_slot] = curnode; nodes[curnode].firlat_slot = firlat_slot; if (internal_trace) fprintf (stderr, "Inserted node %ld into empty slot at %ld (%f)\n", curnode, firlat_slot, nodes[curnode].fir_prior); } else { // There's a prior occupant- need to see who's more accurate. perfect_fir_value = slot_2_fir ( firlat_slot, firlatlen); old_fir_error = fabs (nodes[current_occupant].fir_prior - perfect_fir_value); new_fir_error = fabs (nodes[curnode].fir_prior - perfect_fir_value); if (new_fir_error < old_fir_error) { firlat[firlat_slot] = curnode; nodes[curnode].firlat_slot = firlat_slot; if (internal_trace) fprintf (stderr, "Insert over %ld at slot %ld (%f) with node %ld\n", current_occupant, firlat_slot, nodes[curnode].fir_prior, curnode); }; }; if (internal_trace) firlat_significant (nodes, firlat, firlatlen); } // // Routine to dump what's in the node set and lookaside table // static void dump_bit_entropy_data ( ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen, long *firlat, long firlatlen) { int i; fprintf (stderr, " node A0 c A1 c PFIR FIR< FIR> \n"); for (i = 0; i < nodeslen && nodes[i].fir_prior > -1.0; i++) { fprintf (stderr, "%5d %5ld %5ld %5ld %5ld %5.3f %5ld %5ld \n", i, nodes[i].abet[0].nextcell, nodes[i].abet[0].count, nodes[i].abet[1].nextcell, nodes[i].abet[1].count, nodes[i].fir_prior, nodes[i].fir_smaller, nodes[i].fir_larger); }; fprintf (stderr, "FIRlat contents:"); for (i = 0; i < firlatlen; i++) { if (firlat[i] > 0 ) { fprintf (stderr, "\n %d %ld (perf: %f actual: %f) ", i, firlat[i], ((i + 0.5) / firlatlen), nodes[firlat[i]].fir_prior); } } fprintf (stderr, "\n"); } // Create a T tall, W wide shuffle network in node memory. // This is destructive; note that nodes get their backchannel // set correctly, but their FIRs are not changed, and they // are NOT inserted into the FIRlat. // // Note also that this starts the shuffle block at the node // pointed to by "firstnode", and returns the index of the first // node NOT part of the shuffle block, which (handily) is the // start of the nominal freelist. // static long nodes_init_shufflenet ( ENTROPY_FEATUREBUCKET_STRUCT *nodes, long nodeslen, long height, long width, long firstnode) { long w, h, a ; long node, target, f; // long trow, tcol; f = firstnode ; // because firstnode is too long! for (w = 0; w < width; w++) // zero to width-1 for (h = 0; h < height; h++) // zero to height-1 { // What is our "from" node? node = f + h + (w * height); // Mark these nodes "not yet in the FIR chain". nodes[node].fir_smaller = -1; nodes[node].fir_larger = -1; // set each member of the alphabet to the shuffle of the // next column (modulo width, so the whole thing loops) for (a = 0; a < ENTROPY_ALPHABET_SIZE; a++) { // zero the count nodes[node].abet[a].count = 0; //tcol = (w + 1) % width; //trow = ((h * ENTROPY_ALPHABET_SIZE) + a) % height; //target = f // + tcol * height // + trow; // where does this node.abet point? target = f // start of shuffle + ((w+1) % width) * height // go to the next column + (((h*ENTROPY_ALPHABET_SIZE) + a) % height) ; // within col // //fprintf (stderr, "\nlinking from node %ld slot %ld to %ld", // node, a, target); // aim our current node.abet there. nodes[node].abet[a].nextcell = target; // and initialize the prior nodes[node].fir_prior = -1.0; } } // fprintf (stderr, "\n"); // dump_bit_entropy_data (nodes, nodeslen, NULL, 0); return (height * width + firstnode); }; // // How to learn Bit Entropic style // int crm_expr_bit_entropy_learn (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { // learn the osb_bayes transform spectrum of this input window as // belonging to a particular type. // learn (classname) /word/ // long i, j, k; ENTROPY_FEATUREBUCKET_STRUCT *nodes; // the node array (after mapping!) long nodeslen = 0; // how many nodes do we have? long nodebytes; // how many bytes of nodes? long *firlat; // the FIR prior lookaside table ENTROPY_HEADER_STRUCT *headers; // the what-is-where headers. long *fmap; // catcher for where the file gets mapped. long firlatlen = -1; // how long is the FIR lookaside table? long firlatbytes; // how many bytes of firlat long long *totalbits; // How many total feature bits in this file double localfir; // the FIR value we've accumulated on this path long curnode; // the current node in our search // Text to be learned stuff: long textoffset; long textmaxoffset; long bitnum; long sense; long crosslink; double crosslink_thresh, crosslink_mincount; char clerrtxt[MAX_PATTERN]; long clerrlen; double bitweight; long fev; // fev is for fatal errors // node file details char htext[MAX_PATTERN]; // the node filename working buffer long hlen; char *learnfilename; // the real node file name long hfsize; // size of the node file struct stat statbuf; // for statting the node file long made_new_file; long unique_states = 0; if (internal_trace) fprintf (stderr, "executing a bit-entropic LEARN\n"); // extract the node file name crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, MAX_PATTERN); // set our cflags, if needed. The defaults are // "case" and "affirm", (both zero valued). // and "crosslink" disabled. sense = 1; if (apb->sflags & CRM_REFUTE) { sense = -sense; if (user_trace) fprintf (stderr, " refuting bit-entropic learning\n"); }; crosslink = 0; if (apb->sflags & CRM_CROSSLINK) { crosslink = 1; if (user_trace) fprintf (stderr, " enabling crosslinking.\n"); }; unique_states = 0; if (apb->sflags & CRM_UNIQUE) { unique_states = 1; if (user_trace) fprintf (stderr, " enabling unique states.\n"); }; // // grab the filename, and stat the file // note that neither "stat", "fopen", nor "open" are // fully 8-bit or wchar clean... i = 0; while (htext[i] < 0x021) i++; j = i; while (htext[j] >= 0x021) j++; // filename starts at i, ends at j. null terminate it. htext[j] = '\000'; learnfilename = strdup ( &(htext[i] )); // and stat it to get it's length k = stat (learnfilename, &statbuf); made_new_file = 0; // quick check- does the file even exist? if (k != 0) { // file didn't exist... create it FILE *f; if (user_trace) { fprintf (stderr, "\nHad to create new BEN file %s\n", &htext[i]); fprintf (stderr, "Opening %s for BEN file creation\n", &htext[i]); }; f = fopen (&htext[i], "wb"); if (!f) { fprintf (stderr, "\n Couldn't open your new BEN file %s for writing; errno=%d .\n", &htext[i], errno); if (engine_exit_base != 0) { exit (engine_exit_base + 23); } else exit (EXIT_FAILURE); }; // did we get a value for sparse_spectrum_file_length? // (yes, even though we aren't using "sparse spectra", we'll // use the same -S or -s parameter. If we're in unique_states // mode, then use the DEFAULT_ENTROPY_FILE_LENGTH. Otherwise, // only allocate enough nodes for the header, the firlat, and // the standard toroid (plus a little because we don't trust // ourselves). if (sparse_spectrum_file_length == 0 ) { if (unique_states) { sparse_spectrum_file_length = DEFAULT_BIT_ENTROPY_FILE_LENGTH; } else { sparse_spectrum_file_length = ( BIT_ENTROPIC_SHUFFLE_HEIGHT * BIT_ENTROPIC_SHUFFLE_WIDTH ) + 1000; }; }; // How many bytes of nodes do we need? nodeslen = sparse_spectrum_file_length; nodebytes = nodeslen * (sizeof (ENTROPY_FEATUREBUCKET_STRUCT)); firlatlen = sparse_spectrum_file_length * BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION; firlatbytes = firlatlen * sizeof (long); if (user_trace) fprintf (stderr, "No such file, must create it, length %ld nodes, %ld lats ( %ld nodebytes, %ld latbytes)\n", nodeslen, firlatlen, nodebytes, firlatbytes); // Write them bytes, all NULs. (the 1024 is just some padding) // for (j = 0; j < (1024 + firlatbytes + nodebytes + (ENTROPY_RESERVED_HEADER_LEN * sizeof (long))); j++) fputc ('\000', f); made_new_file = 1; fclose (f); // and reset the statbuf to be correct k = stat (learnfilename, &statbuf); }; // hfsize = statbuf.st_size; // // map the bit-entropy file into memory // fmap = ( long *) crm_mmap_file (&(htext[i]), 0, hfsize, PROT_READ | PROT_WRITE, MAP_SHARED, NULL); if (fmap == MAP_FAILED) { fev = fatalerror5 ("Couldn't get to the bit-entropic file named: ", &htext[i], CRM_ENGINE_HERE); return (fev); }; headers = (ENTROPY_HEADER_STRUCT *) fmap; ///////////////////////////////////////////////////////////////// // // Now the file is memory mapped - if we just made it, we must // put the right format into it. if ( made_new_file ) { int i; // Start with initializing the headers. // headers->firlatstart = ENTROPY_RESERVED_HEADER_LEN; //Start of the FIRlat // firlat = (long *) & fmap[headers->firlatstart]; // headers->firlatlen = firlatlen; // // the actual nodes start after the FIRlat headers->nodestart = headers->firlatstart + firlatlen; // // and the actual address of the first node is: nodes = (ENTROPY_FEATUREBUCKET_STRUCT *) & (fmap[headers->nodestart]); // // and the number of nodes is... headers->nodeslen = nodeslen; // // headers [4] and [5] are for the long long totalbits. headers->totalbits = 0; // initialize the FIR lookaside to contain the metanode (node 0) as // a sentinel value, except for the two extremum nodes. for (i = 0; i < firlatlen; i++) firlat[i] = 0; // initialize the metanode (node 0) which points to the // start and end of the FIR chains, and to the free list. nodes[0].fir_smaller = 3; // yes, the initial DB looks like an nodes[0].fir_larger = 2; // infinity sign, with node 0 "twisted" // the start node is node 1, at node 0, alphabet 1 -> next, == 1 nodes[0].abet[1].nextcell = 1; if (! unique_states ) { // put in a shuffled initial state. long first_freenode; long width, height; // width should be the square root of the size, rounded down // to a multiple of 8 (assuming byte-wise orientation) width = ((int) (sqrt (sparse_spectrum_file_length)) / 8 ) * 8; // height should be as many as possible with complete rows height = sparse_spectrum_file_length / width; if (user_trace) fprintf (stderr, "New toroid. width: %ld, height %ld\n", width, height); first_freenode = nodes_init_shufflenet (nodes, nodeslen, height, width, 1); // the 1 means "node 1 is first node! // mark node 1 as the start of the actual Markov chain nodes[0].abet[1].nextcell = 1; // chain the rest of the nodes into a freelist. for (i = first_freenode; i < nodeslen; i++) { nodes[i].abet[0].nextcell = i+1 ; nodes[i].fir_prior = -1.0; } // mark the last node in the freelist as the end of the node // freelist. nodes[0].abet[0].nextcell = first_freenode; nodes[nodeslen-1].abet[0].nextcell = 0; } else {// Else very basic init - nodes 1, 2, and 3 in a vee, with // nodes 2 and 3 with no sucessors whatsoever. // initialize the START node (node 1) to point to node 2 on a 0 nodes[1].abet[0].count = 0; nodes[1].abet[0].nextcell = 2; // initialize the START node (node 1) to point to node 3 on a 1 nodes[1].abet[1].count = 0; nodes[1].abet[1].nextcell = 3; // initialize node 2 (that is, first bit is zero) // initialize the outoput on a 0 to be "nowhere" nodes[2].abet[0].count = 0; nodes[2].abet[0].nextcell = 0; // initialize the output on 1 to be "nowhere" nodes[2].abet[1].count = 0; nodes[2].abet[1].nextcell = 0; // initialize node 3 (that is, first bit is 1) // initialize the outoput on a 0 to be "nowhere" nodes[3].abet[0].count = 0; nodes[3].abet[0].nextcell = 0; // initialize the output on 1 to be "nowhere" nodes[3].abet[1].count = 0; nodes[3].abet[1].nextcell = 0; // initialize nodes 4 through n into a free list. We don't care // about any other field than the next node field while on the // free list. for ( i = 4; i < nodeslen; i++) { nodes[i].abet[0].nextcell = i+1 ; nodes[i].fir_prior = -1.0; } nodes[0].abet[0].nextcell = 4; // mark the last node in the freelist as the end of the node // freelist. nodes[nodeslen-1].abet[0].nextcell = 0; }; // Either way, we have to initialize the start of the FIR system // initialize the START node (node 1) into the FIR system nodes[1].fir_prior = 0.5; nodes[1].fir_smaller = 2; nodes[1].fir_larger = 3; //firlat[fir_2_slot (0.5, firlatlen)] = 1; //nodes[1].firlat_slot = fir_2_slot(0.5, firlatlen); // initialize node 2 into the FIR system nodes[2].fir_prior = 0.0 ; nodes[2].fir_smaller = 0; nodes[2].fir_larger = 1; //firlat[fir_2_slot (0.0, firlatlen)] = 2; //nodes[2].firlat_slot = fir_2_slot(0.0, firlatlen); // initialize node 3 into the FIR system nodes[3].fir_prior = 1.0; nodes[3].fir_smaller = 1; nodes[3].fir_larger = 0; //firlat[fir_2_slot (1.0, firlatlen)] = 3; //nodes[3].firlat_slot = fir_2_slot(1.0, firlatlen); } // If this isn't a new file, grab stuff out of the header: firlat = (long *) & (fmap [headers->firlatstart]); firlatlen = headers->firlatlen; firlatbytes = firlatlen * (sizeof (long)); nodes = (ENTROPY_FEATUREBUCKET_STRUCT *) & (fmap[headers->nodestart]); nodeslen = headers->nodeslen; nodebytes = nodeslen * (sizeof (ENTROPY_FEATUREBUCKET_STRUCT)); // headers [4] and [5] are for the long long totalbits. // totalbits = (long long *) &headers[4]; totalbits = & (headers->totalbits); // In this format, the first BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION // longwords contain the FIR prior lookaside table and the // remainder contain actual nodes, and start immediately after // the FIR lookaside table. // if (internal_trace) fprintf (stderr, "Firlatlen = %ld (%ld bytes), nodeslen = %ld (%ld bytes) \n", firlatlen, firlatbytes, nodeslen, nodebytes); if (user_trace) fprintf (stderr, "Bit-entropic file %s has length %ld nodes\n", &htext[i], nodeslen); // If this is a new file, we have to write in the FIR lookaside // table and the free node list. Otherwise, the free list will // recurse and GCing will not work. Additionally, we need to // initialize node 0 (the metanode) and node 1 (the start node); // we'll also initialize nodes 2 and 3 to be the 0- and 1-successor // of node 1. // //////////////////////////////////////////////////////////////// // Crosslink Threshold is dependent on the number of nodes... // crosslink_thresh = 1.0 / nodeslen; crosslink_thresh = 0.5 / nodeslen; // crosslink_thresh = 0.25 / nodeslen; // crosslink_thresh = 0.1 / nodeslen; // crosslink_thresh = 0.05 / nodeslen; // crosslink_thresh = 0.01 / nodeslen; // crosslink_thresh = 0.003 / nodeslen; // crosslink_thresh = 0.001 / nodeslen; //////////////////////////////////////////////////////////// // Crosslink Mincount is the minimum number of good bits before we // allow a crosslink crosslink_mincount = 2; // Running 1 megaslot at 1E-7 thresh overflows TREC06 public. // Running 2 megaslot at 1E-7 thresh uses up 76% of TREC06 public, but // with lousy TER (36/10000, 24/1000) and 5140 total errors // even better might be an _adaptive_ threshold, that looks at the // available freelist length and modulates according to that. // Boundary conditions: with empty file, threshold is very small. // and with a full file, threshold is 10/nodeslen. // (we use the parameterized form of a line here - the first term can // be dropped, because it's always times zero. But we leave it here // in case we actually want a nonzero initial threshold.) // If user specified an overriding value, use that. if (apb->s2len > 0) { crm_get_pgm_arg (clerrtxt, MAX_PATTERN, apb->s2start, apb->s2len); clerrlen = apb->s2len; clerrlen = crm_nexpandvar (clerrtxt, clerrlen, MAX_PATTERN); crosslink_thresh = strtod (clerrtxt, NULL) * crosslink_thresh; }; //////////////////////////////////////////////////////////////// // // Setup is complete. We now just churn through the // incoming bitstream, incrementing and allocating new cells // as needed. Magic Merging and Cloning action may happen as well. // textoffset = txtstart; textmaxoffset = txtstart + txtlen; // we'll number bits from 7 to 0, with 7 being most significant (and // also, in most ASCII text, it's zero) bitnum = 8; // The current node starts at the START node, which is the // the abet[1] link of the metanode (node 0) curnode = nodes[0].abet[1].nextcell; bitweight = BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT; // localfir is the value of the FIR summary as we sit on // the current node. Thus, it contains the influence of the // most recent step that got us here, but NOT the effect of the current // alph. // // Of course, the starting is DEFINED as the FIR of the // START node. localfir = nodes[curnode].fir_prior; if (internal_trace) fprintf (stderr, "Going to do %ld characters.\n", (unsigned long) textmaxoffset - (unsigned long) textoffset); while (textoffset + 1 < textmaxoffset || bitnum > 0) { unsigned short thischar; unsigned short thisalph; unsigned short nextalph; unsigned short nextnextalph; unsigned short nextnextnextalph; double newnodefir; // get the next alph member. bitnum = bitnum - ENTROPY_CHAR_SIZE; if (bitnum < 0) { bitnum = 7; textoffset ++; }; thischar = txtptr [ textoffset ]; thisalph = ( thischar >> bitnum ) & ENTROPY_CHAR_BITMASK; if (internal_trace) firlat_sanity_scan (firlat, firlatlen, nodes, nodeslen); // nextalph is a lookahead one bit. We use it for smarter // crosslinking. nextalph = ( thischar >> (bitnum - 1) ) & ENTROPY_CHAR_BITMASK; if (textoffset < textmaxoffset && bitnum == 0) nextalph = ((txtptr [textoffset+1]) >> 7) & ENTROPY_CHAR_BITMASK; // nextnextalph is lookahead two bits. Yet smarter crosslinking nextnextalph = ( thischar >> (bitnum - 2) ) & ENTROPY_CHAR_BITMASK; if (textoffset < textmaxoffset && bitnum == 0) nextnextalph = txtptr [textoffset+1] >> 6 & ENTROPY_CHAR_BITMASK; if (textoffset < textmaxoffset && bitnum == 1) nextnextalph = txtptr [textoffset+1] >> 7 & ENTROPY_CHAR_BITMASK; // nextnextnextalph is lookahead three bits. Even more smarter nextnextnextalph = ( thischar >> (bitnum - 3) ) & ENTROPY_CHAR_BITMASK; if (textoffset < textmaxoffset && bitnum == 0) nextnextalph = txtptr [textoffset+1] >> 5 & ENTROPY_CHAR_BITMASK; if (textoffset < textmaxoffset && bitnum == 1) nextnextnextalph = txtptr [textoffset+1] >> 6 & ENTROPY_CHAR_BITMASK; if (textoffset < textmaxoffset && bitnum == 2) nextnextnextalph = txtptr [textoffset+1] >> 7 & ENTROPY_CHAR_BITMASK; if (internal_trace) fprintf (stderr, "Working char '%c' (%ld untouched), bit %ld.\n", thischar, (unsigned long) textmaxoffset - (unsigned long) textoffset -1, bitnum); if (internal_trace) fprintf (stderr, "\nNow at node %ld, alph %d, (bit %ld) localFIR %f\n", curnode, thisalph, bitnum, localfir); // remove this node from the FIRlat and the fir chain if (internal_trace) fprintf (stderr, "Removing node %ld from FIRlat \n", curnode); firlat_remove_node (nodes, nodeslen, firlat, firlatlen, curnode); /////////////////////////////////////////////////////////////// // // calculate node's new FIR and where it will go into the FIRlat // { long totalcount, i; totalcount = 0; for (i = 0; i < ENTROPY_ALPHABET_SIZE; i++) totalcount += nodes[curnode].abet[i].count; newnodefir = ( localfir + (totalcount * nodes[curnode].fir_prior) ) / ( totalcount + 1.0 ); }; nodes[curnode].fir_prior = newnodefir; ////////////////////////////////////////////////////// // // Increment our exit path from curnode; remember that we // might be "unlearning" so use "sense" rather than just incrementing // Of course, this doesn't completely unlearn, as there's always the // chance we allocated a new node, or did a merge or clone on // the initial learn and there's no way to "undo" that for certain. // nodes[curnode].abet[thisalph].count += sense; // and keep track of the total visits this file has seen. *totalbits += sense; // GROT GROT GROT // on an UNLEARN, we might actually decrement total_count to // zero, meaning that this node never actually got hit. In // which case, smoe gyrations are appropriate. But for now, let's // ignore the problem. //////////////////////////////////////////////////////////// // // Now put the node back where it belongs. // if (internal_trace) { fprintf (stderr, "Current Status before replacement at node %ld\n", curnode); dump_bit_entropy_data (nodes, nodeslen, firlat, firlatlen); }; firlat_insert_node (nodes, nodeslen, firlat, firlatlen, curnode); if (internal_trace) { fprintf (stderr, "Current Status after replacement at node %ld\n", curnode); dump_bit_entropy_data (nodes, nodeslen, firlat, firlatlen); }; ////////////// // we've fixed up the FIR chain; // we've fixed up the FIR lookaside table. // It's now time to move on to the next node. // If there is no next node, the fun begins and we need to // allocate yet another node. ///////////////////////////////////////////////////// // // Calculate the new local FIR for this path (NOT the // node, but for the current path we will have taken at // the next step in the node model. // localfir = ( thisalph * BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT) + ( localfir * ( 1.0 - BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT )); if (localfir > 1.0000) localfir = 1.00; if (localfir < 0.0000) localfir = 0.00; ////////////////////////////////////////////////////////// // Time to move to the next node. // // do we have a real cell as a successor already (i.e. have // we been here before?) Or is this the first time? // if (nodes[curnode].abet[thisalph].nextcell > 0) { // yes, there's a node already for this node's successor // and this particular alph element. // Make that the current node, and move along! curnode = nodes[curnode].abet[thisalph].nextcell; #ifdef BEN_GRAPHIC fprintf (stderr, "."); #endif if (internal_trace) fprintf (stderr, "next old cell %ld\n", curnode); } else { // // no successor on this alphabet slot yet, so we need a new one. // // If we have no free nodes, we have no choice but to crosslink. // If we have free nodes, and crosslinking isn't enabled, we // just allocate a new node. // If crosslinking is enabled, we see what the preferred // new FIR will be, and see what node will work for that. // // Note that this is a lattice - try A, then B, then C. // so we have to do it in two phases. long further_node, i, do_crosslink; double crosslink_err; //fprintf (stderr, "-"); // Sentinel to know we need to calculate a crosslink further_node = -10001; // fprintf (stderr, // "Ran off the end of entropy model at node %ld\n", // curnode); // Assume no crosslink unless we find out otherwise. do_crosslink = 0; // Is crosslink forced because we're out of nodes? if (nodes[0].abet[0].nextcell == 0) { do_crosslink = 1; // fprintf (stderr, "$"); if (internal_trace) fprintf (stderr, "Empty freelist, forced crosslink\n"); }; // do we possibly want to crosslink anyway? At least, // we may need to calculate the best next node, which is // the one with the closest FIR to what our nextnode FIR is. if ( crosslink || do_crosslink ) { further_node = firlat_find_closest_node ( nodes, nodeslen, firlat, firlatlen, localfir); crosslink_err = fabs (localfir - nodes[further_node].fir_prior); if ( crosslink_err < crosslink_thresh // this node OK? && nodes[further_node].abet[nextalph].count > crosslink_mincount // One node lookahead && nodes[further_node].abet[nextalph^0x1].count == 0 && nodes[further_node].abet[nextalph].nextcell > 0 // Two nodes lookahead && nodes[nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph].count > 0 && nodes [nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph^0x1].count == 0 // THREE nodes lookahead && nodes[nodes[nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph].nextcell].abet[nextnextnextalph].count > 0 && nodes[nodes[nodes[further_node].abet[nextalph].nextcell].abet[nextnextalph].nextcell].abet[nextnextnextalph^0x1].count == 0 ) { do_crosslink = 1; #ifdef BEN_GRAPHIC fprintf (stderr, "!"); #endif if (internal_trace) fprintf (stderr, "Opportunistic crosslink accepted %lf\n", crosslink_err); } else // try one higher in the FIR stream... { further_node = nodes[further_node].fir_larger; crosslink_err = fabs (localfir - nodes[further_node].fir_prior); if (crosslink_err < crosslink_thresh && nodes[further_node].abet[nextalph].count > 0 && nodes[further_node].abet[nextalph^0x1].count == 0 ) { do_crosslink = 1; #ifdef BEN_GRAPHIC fprintf (stderr, "!^"); #endif if (internal_trace) fprintf (stderr, "Opportunistic up-crosslink accepted %lf\n", crosslink_err); } else // try one lower in the FIR stream.... { further_node = nodes[further_node].fir_smaller; further_node = nodes[further_node].fir_smaller; crosslink_err = fabs (localfir - nodes[further_node].fir_prior); if (crosslink_err < crosslink_thresh && nodes[further_node].abet[nextalph].count > 0 && nodes[further_node].abet[nextalph^0x1].count == 0 ) { do_crosslink = 1; #ifdef BEN_GRAPHIC fprintf (stderr, "!v"); #endif if (internal_trace) fprintf (stderr, "Opportunistic down-crosslink accepted %lf\n", crosslink_err); } } } // Note that further_node is set either way, so we don't // need to recalculate it. }; // Are we crosslinking? if (do_crosslink) { // Enforce sanity on further_node before crosslink if (further_node < 1) { if (further_node < 1) fprintf (stderr, "Bogus crosslink %ld! Branching back to node 1.\n", further_node); further_node = 1; fprintf (stderr, "?"); }; // Put furnode in as the next node for this alph nodes[curnode].abet[thisalph].nextcell = further_node; } else { // Time for a new node. We link it in with the // current alph's slot in curnode.. #ifdef BEN_GRAPHIC fprintf (stderr, "@"); #endif further_node = nodes[0].abet[0].nextcell; if (internal_trace) fprintf (stderr, "Allocating new cell %ld\n", further_node); nodes[curnode].abet[thisalph].nextcell = further_node; // cut this node out of the front of the free list. nodes[0].abet[0].nextcell = nodes[further_node].abet[0].nextcell; // wipe out any old data that might be there. for (i = 0; i < ENTROPY_ALPHABET_SIZE ; i++) { nodes[further_node].abet[i].count = 0; nodes[further_node].abet[i].nextcell = 0; }; // initialize the new node with the current FIR prior nodes[further_node].fir_prior = localfir; // hook the new node into the FIR chain firlat_insert_node (nodes, nodeslen, firlat, firlatlen, further_node); } if (internal_trace) fprintf (stderr, "Now moving from node %ld to %ld (nfir %f, pfir %f)\n", curnode, further_node, nodes[further_node].fir_prior, localfir); curnode = further_node; } } // All done munging the markov chain. // Time to clean up and go home. // and remember to let go of the mmap and the pattern bufffer crm_force_munmap_addr ((void *) fmap); #ifdef POSIX // Because mmap/munmap doesn't set atime, nor set the "modified" // flag, some network filesystems will fail to mark the file as // modified and so their cacheing will make a mistake. // // The fix is to do a trivial read/write on the .css ile, to force // the filesystem to repropagate it's caches. // { int hfd; // hashfile fd char foo; hfd = open (learnfilename, O_RDWR); read (hfd, &foo, sizeof(foo)); lseek (hfd, 0, SEEK_SET); write (hfd, &foo, sizeof(foo)); close (hfd); } #endif free (learnfilename); return (0); } int crm_expr_bit_entropy_classify (CSL_CELL *csl, ARGPARSE_BLOCK *apb, char *txtptr, long txtstart, long txtlen) { // fprintf (stderr, "Sorry, Not Yet Implemented\n"); //return (0); // classify the sparse spectrum of this input window // as belonging to a particular type. // // This code should look very familiar- it's cribbed from // the code for LEARN // long i, j, k; // the statistics file names char htext[MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN]; long htext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*MAX_FILE_NAME_LEN; long hlen; // the match statistics variable char stext[MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)]; long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100); long slen; char svrbl[MAX_PATTERN]; // the match statistics text buffer long svlen; long fnameoffset; char fname[MAX_FILE_NAME_LEN]; long crosslink; struct stat statbuf; // for statting the hash file long totalfeatures = 0; // total features double tprob; // total probability in the "success" domain. // variables used when tracing the markov graph long curnode; double total_entropy[MAX_CLASSIFIERS]; // Variables for calculating entropy double ptc[MAX_CLASSIFIERS]; // current running probability of this class double renorm = 0.0; double entropy_renorm = 0.0; double entropy_sum = 0.0; // int hfds[MAX_CLASSIFIERS]; ENTROPY_FEATUREBUCKET_STRUCT *nodestarts[MAX_CLASSIFIERS]; ENTROPY_FEATUREBUCKET_STRUCT *nodes; ENTROPY_HEADER_STRUCT *headers[MAX_CLASSIFIERS]; // pointers in the files. long *fmaps[MAX_CLASSIFIERS]; long nodelens[MAX_CLASSIFIERS]; long nodeslen; long *firlats[MAX_CLASSIFIERS]; long firlatlens[MAX_CLASSIFIERS]; char *hashname[MAX_CLASSIFIERS]; long hashlens[MAX_CLASSIFIERS]; long firjumps[MAX_CLASSIFIERS]; long long *totalbits[MAX_CLASSIFIERS]; long succhash; long vbar_seen; // did we see '|' in classify's args? long maxhash; long fnstart, fnlen; long fn_start_here; long textoffset; long textmaxoffset; long bestseen; long thistotal; if (internal_trace) fprintf (stderr, "executing a bit-entropic CLASSIFY\n"); // extract the hash file names crm_get_pgm_arg (htext, htext_maxlen, apb->p1start, apb->p1len); hlen = apb->p1len; hlen = crm_nexpandvar (htext, hlen, htext_maxlen); // extract the optional "match statistics" variable // crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len); svlen = apb->p2len; svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN); { long vstart, vlen; crm_nextword (svrbl, svlen, 0, &vstart, &vlen); memmove (svrbl, &svrbl[vstart], vlen); svlen = vlen; svrbl[vlen] = '\000'; }; // status variable's text (used for output stats) // stext[0] = '\000'; slen = 0; // set our flags, if needed. The defaults are // "case" // Crosslink, in this case, means to allow // FIR jumping whenever we've not seen this // situation before. crosslink = 0; if (apb->sflags & CRM_CROSSLINK) { crosslink = 1; if (user_trace) fprintf (stderr, " enabling crosslinking.\n"); }; // Now, the loop to open the files. bestseen = 0; thistotal = 0; // initialize our arrays for N .css files for (i = 0; i < MAX_CLASSIFIERS; i++) { ptc[i] = 0.5; // priori probability }; // Our evaluator is to simply traverse the Markov chain that // starts at nodes[0].abet[0].nextcell, and add together all of // the entropies - that is, the -log2(prob) of each node. Of // course, we have to nerf the probabilities away from 0, as // that would yield a log2(0.0) which is infinite and thus not // well-represented in IEEE floating point. // // For our purposes, we will use: // probability = this_count + BIT_ENTROPIC_PROBABILITY_NERF // / total_count // // If we run off the "end" of the Markov chain, we use the FIRs // to quickly find another node that's "really similar" to this // node. (this can only happen if the initialized Markov // contained open ends; this doesn't happen when we initialize // with a perfect shuffle graph which is toroidal and has no // "end" to run off of. // vbar_seen = 0; maxhash = 0; succhash = 0; fnameoffset = 0; // now, get the file names and mmap each file // get the file name (grody and non-8-bit-safe, but doesn't matter // because the result is used for open() and nothing else. // GROT GROT GROT this isn't NULL-clean on filenames. But then // again, stdio.h itself isn't NULL-clean on filenames. if (user_trace) fprintf (stderr, "Classify list: -%s- \n", htext); fn_start_here = 0; fnlen = 1; while ( fnlen > 0 && ((maxhash < MAX_CLASSIFIERS-1))) { crm_nextword (htext, hlen, fn_start_here, &fnstart, &fnlen); if (fnlen > 0) { strncpy (fname, &htext[fnstart], fnlen); fn_start_here = fnstart + fnlen + 1; fname[fnlen] = '\000'; if (user_trace) fprintf (stderr, "Classifying with file -%s- "\ "succhash=%ld, maxhash=%ld\n", fname, succhash, maxhash); if ( fname[0] == '|' && fname[1] == '\000') { if (vbar_seen) { nonfatalerror5 ("Only one ' | ' allowed in a CLASSIFY. \n" , "We'll ignore it for now.", CRM_ENGINE_HERE); } else { succhash = maxhash; }; vbar_seen ++; } else { // be sure the file exists // stat the file to get it's length k = stat (fname, &statbuf); // quick check- does the file even exist? if (k != 0) { nonfatalerror5 ("Nonexistent Classify table named: ", fname, CRM_ENGINE_HERE); } else { // file exists - do the open/process/close // hashlens[maxhash] = statbuf.st_size; // mmap the hash file into memory so we can bitwhack it fmaps[maxhash] = crm_mmap_file ( fname, 0, hashlens[maxhash], PROT_READ | PROT_WRITE, MAP_SHARED, NULL); headers[maxhash] = (ENTROPY_HEADER_STRUCT *) fmaps[maxhash]; if (headers[maxhash] == MAP_FAILED ) { nonfatalerror5 ("Couldn't memory-map the table file", fname, CRM_ENGINE_HERE); } else { // // Check to see if this file is the right version // //long fev; //if (hashes[maxhash][0].hash != 0 || // hashes[maxhash][0].key != 0) //{ // fev =fatalerror ("The .css file is the wrong version! Filename is: ", // fname); // return (fev); //}; // Pull the file info out of the header (the // first 1024 bytes is the header) //firlats[maxhash] = (long *) // & headers[maxhash][headers[maxhash][0]]; firlats[maxhash]= (long *) & (fmaps[maxhash] [headers[maxhash]->firlatstart]); //firlatlens[maxhash] = headers[maxhash][1]; firlatlens[maxhash] = headers[maxhash]->firlatlen; //nodestarts[maxhash] = (ENTROPY_FEATUREBUCKET_STRUCT *) // & (headers[maxhash][headers[maxhash][2]]); nodestarts[maxhash] = (ENTROPY_FEATUREBUCKET_STRUCT *) & (fmaps[maxhash] [headers[maxhash]->nodestart]); //nodelens[maxhash] = headers[maxhash][3]; nodelens[maxhash] = headers[maxhash]->nodeslen; // totalbits[maxhash] = (long long *) &headers[4]; totalbits[maxhash] = (long long *) &(headers[maxhash]->totalbits); if (internal_trace) fprintf (stderr, "File #%ld firlat %lx len %ld and nodes %lx\n", maxhash, (unsigned long ) firlats[maxhash], firlatlens [maxhash], (unsigned long )nodestarts[maxhash]); // Keep a copy of the data filename for later. hashname[maxhash] = (char *) malloc (fnlen+10); if (!hashname[maxhash]) untrappableerror( "Couldn't malloc hashname[maxhash]\n", "We need that part later, so we're stuck. Sorry."); strncpy(hashname[maxhash],fname,fnlen); hashname[maxhash][fnlen]='\000'; maxhash++; }; }; }; if (maxhash > MAX_CLASSIFIERS-1) nonfatalerror5 ("Too many classifier files.", "Some may have been disregarded", CRM_ENGINE_HERE); }; }; // // If there is no '|', then all files are "success" files. if (succhash == 0) succhash = maxhash; // a CLASSIFY with no arguments is always a "success". if (maxhash == 0) return (0); // sanity checks... Uncomment for super-strict CLASSIFY. // // do we have at least 1 valid .css files? if (maxhash == 0) { fatalerror5 ("Couldn't open at least 2 .css files for classify().", "", CRM_ENGINE_HERE); }; // do we have at least 1 valid .css file at both sides of '|'? //if (!vbar_seen || succhash < 0 || (maxhash < succhash + 2)) // { // nonfatalerror ( // "Couldn't open at least 1 .ben file per SUCC | FAIL classes " // " for classify().\n","Hope you know what are you doing."); // }; // now all of the files are mmapped into memory, // and we can start following the chains and adding up entropy. i = 0; j = 0; k = 0; thistotal = 0; ////////////////////////////////////////////////////////////// // // The actual graph-following starts here. To maximize locality // of reference, we do the graph-following first for one graph, // then for another, then for another. That way, we tend to stay // in the CPU cache better. // // Note to self - doing this may or may not speed up the other // classifiers- but because they tend to use less memory it may // make less of a difference. { long c; // c is going to be our classifier counter unsigned short thischar; unsigned short thisalph; double localfir; long bitnum; long nextnode; // zero out our entropy accumulators // for (c = 0; c < MAX_CLASSIFIERS; c++) { total_entropy[c] = 0.0; firjumps[c] = 0; } for (c = 0; c < maxhash; c++) { if (internal_trace) fprintf (stderr, "Now running against stats file %ld\n", c); // initialize our per-graph-following stuff: totalfeatures = 0; nodes = nodestarts[c]; nodeslen = nodelens[c]; // initialize our starting conditions. curnode = nodes[0].abet[1].nextcell; localfir = nodes[nodes[0].abet[1].nextcell].fir_prior; textoffset = txtstart; textmaxoffset = txtstart + txtlen; bitnum = 8; // Now the big loop to follow the graph while (textoffset + 1 < textmaxoffset || bitnum > 0) { long nodetotcount, itc; float add_entropy; bitnum = bitnum - ENTROPY_CHAR_SIZE; if (bitnum < 0) { bitnum = 7; textoffset ++; } thischar = txtptr [textoffset]; thisalph = ( thischar >> bitnum ) & ENTROPY_CHAR_BITMASK; totalfeatures++; if (internal_trace) firlat_sanity_scan (firlats[c], firlatlens[c], nodes, nodeslen); // update our local FIR to what it will be when we're // at the next node. (this is effective not at the // current node, but one step further into the future.) localfir = thisalph * BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT + localfir * (1.0 - BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT); // and go where it told us. Note that we may want to // FIR jump if we've never been here before, or there is // no "next node", or there _is_ no next node. // we *have* been to before. nodetotcount = 0; for (itc = 0; itc < ENTROPY_ALPHABET_SIZE; itc++) nodetotcount += nodes[curnode].abet[itc].count; if (internal_trace) fprintf (stderr, "%ld/%ld ", nodes[curnode].abet[thisalph].count, nodetotcount); if ( ( nodetotcount < 1) || nodes[curnode].abet[thisalph].count < 1 || nodes[curnode].abet[thisalph].nextcell < 1 ) { // Either no corpus history, or // this cell has never gone here before, or // we're off the edge of the model completely // In any case, we're better off FIR-jumping no // matter what because we'll never be back "in corpus" double error; nextnode = firlat_find_closest_node (nodes, nodeslen, firlats[c], firlatlens[c], localfir); error = localfir - nodes[nextnode].fir_prior; if (internal_trace) fprintf (stderr, "FIR jumping, error = %12lg, new node %ld \n", error, nextnode); if (error < 0.00) error = - error; // and add the error entropy estimate // due to the FIR jump; since this is knowable // due to it being "no choice", the entropy is very small. add_entropy = // pow (2.0, (error / BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT)); // stats_2_entropy (1, nodelens[c]) ; // the following one seems to work well.. entirely // unjustifiably, but it works. ROC=0.128 // stats_2_entropy (1,10); stats_2_entropy (1,1000); // This one is horrible. 5% error rates... // stats_2_entropy (nodes[nextnode].total_count, // nodes[0].total_count); //stats_2_entropy (0, nodetotcount); //0.5; // This one is justifiable, and is very close to // the unjustifiable one above, but not quite as good. // GROT GROT GROT //stats_2_entropy (1, nodetotcount + 1); total_entropy[c] = total_entropy[c] + add_entropy; curnode = nextnode; firjumps[c]++; } else // else no FIRjump, things look good. // we're still "in the corpus" or we've not enabled // FIR jumping. { add_entropy = stats_2_entropy ( nodes[curnode].abet[thisalph].count, nodetotcount ); total_entropy[c] = total_entropy[c] + add_entropy; // and move to the next node. curnode = nodes[curnode].abet[thisalph].nextcell; // go back to the start cell if we're off the end and // we've not enabled crosslinked FIRjumping if (curnode < 1) { fprintf (stderr, "X"); curnode = nodes[0].abet[0].nextcell; }; }; if (internal_trace) fprintf (stderr, "%f \n", add_entropy); } } // Test Results on TREC public testset // No node-merge or node-clone // Unique states // TOE - 1000000 nodes - 6339/92189, ~356/144/64 min (filled@ ~10K) // 128 x 128 lattice: // no FIR-hop // TOE // 256 x 256 lattice: // no FIR-hop // FIR-hopping // TOE shuffle - 3342/91996, time 199/71/15 minutes // 512 x 512 lattice: // no FIR-hop // TOE - 3035/92189 ( ~3 % ) time: 233 / 74 / 25 min // TEFT - 8609/75962 and 9508/92189, 272 minutes // 1024x1024 lattice: // FIR-hop // TOE 7914/92189 and ?/96/60 minutes // Fixed 5109/92189 and 378/153/63 minutes (with some suspiciously // long delays during a few messages) // 2048x2048 lattice: // no FIR-hop // TOE 3780 / 92189 318/87/60 min if (internal_trace) { for (k = 0; k < maxhash; k++) fprintf (stderr, "entropy for classifier %ld is %f\n", k, total_entropy[k]); }; //////////////////////////////////////////////////// // // Now build the output report (if desired, that is, svlen > 0 // // How we convert entropy into probability and pR: // // Basically, we cheat a lot. If you consider the definition of // entropy, it's the log of probability. The probability of a // particular document being generated by a particular markov // model is equal to the product of the probabilities of each // particular transition. The entropy is the sum of the log of // the individual transition probabilities, and (remembering // that multiplying in the real domain is equivalent to summing // in the log domains) we see that the absolute probabililities // (that is, the monkeys-on-typewriters probability) of each // document are just 1 / (2 ^ entropy), and the relative // probabilities can then be scaled out by summing and dividing // in the real domain. // // However, those probabilities are REALLY SMALL - well below // the lower limit of IEEE floating point numbers, and so we can // renormalize them to close to 1 by addition of an arbitrary // constant (which is equivalent to multiplying the // probabilities by an arbitrary constant), thus converting them // to relative rather than absolute probabilities (oh- we should // do a renormalization so they really do add to 1.000). // // Once we have relative probabilities, we can then jump right // back into the standard summing code to decide on success or // failure. // Step 1: find the minimum needed to add to start renormalization entropy_renorm = total_entropy[0]; entropy_sum = 0.0; for (k = 0; k < maxhash; k++) { if (internal_trace) fprintf (stderr, "entropy class %ld is %f\n", k, total_entropy[k]); if (total_entropy[k] < entropy_renorm ) { entropy_renorm = total_entropy [k]; } entropy_sum += total_entropy[k]; }; if (internal_trace) fprintf (stderr, "entropy_renorm = %f\n", entropy_renorm); // Step 2: add entropy renorm constant, and convert to unnormalized P // Note that this is 2 raised to very small powers, and often // returns zero, so we have to nerf it with DBL_MIN. for (k = 0; k < maxhash; k++) { // Bad formula - technically accurate but pR maxes out // at 304 for nearly all texts //ptc [k] = 1 / // ( pow (2.00, (total_entropy[k] - entropy_renorm)) + 1000 * DBL_MIN); // better formula - probs very close to 0.5 // ptc [k] = (entropy_sum - total_entropy[k] ) / entropy_sum; // // try a much smaller exponential to bring things back into line: // Note: the 1.1 below is _empirical_, chosen to normalize this // classifier to a +/1 10 pR units for good thick threshold training. // It has no other meaning beyond that. ptc [k] = 1 / (pow (1.1, (total_entropy[k] - entropy_renorm)) + (1000 * DBL_MIN)) ; if (internal_trace) fprintf (stderr, "class %ld ptc %f\n", k, ptc[k]); }; // Step 3: renormalize the probabilities to sum to 1.000 renorm = 0.0; for (k = 0; k < maxhash; k++) renorm = renorm + ptc[k]; for (k = 0; k < maxhash; k++) { ptc[k] = (ptc[k] / renorm) + 1000 * DBL_MIN ; if (internal_trace) fprintf (stderr, "class %ld RENORM ptc %f\n", k, ptc[k]); }; // Now we're in probability space in the ptc array. Away we go! tprob = 0.0; for (k = 0; k < succhash; k++) tprob = tprob + ptc[k]; if (svlen > 0) { char buf[1024]; double accumulator; double remainder; double overall_pR; long m; buf [0] = '\000'; accumulator = 1000 * DBL_MIN; for (m = 0; m < succhash; m++) { accumulator = accumulator + ptc[m]; }; remainder = 1000 * DBL_MIN; for (m = succhash; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; overall_pR = (log10 (accumulator) - log10 (remainder)); // note also that strcat _accumulates_ in stext. // There would be a possible buffer overflow except that _we_ control // what gets written here. So it's no biggie. if (tprob > 0.5000) { sprintf (buf, "CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); } else { sprintf (buf, "CLASSIFY fails; success probability: %6.4f pR: %6.4f\n", tprob, overall_pR ); }; if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); bestseen = 0; for (k = 0; k < maxhash; k++) if (ptc[k] > ptc[bestseen] ) bestseen = k; remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (bestseen != m) { remainder = remainder + ptc[m]; }; sprintf (buf, "Best match to file #%ld (%s) " \ "prob: %6.4f pR: %6.4f \n", bestseen, hashname[bestseen], ptc[bestseen], (log10(ptc[bestseen]) - log10(remainder))); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); sprintf (buf, "Total features in input file: %ld\n", totalfeatures); if (strlen (stext) + strlen(buf) <= stext_maxlen) strcat (stext, buf); for (k = 0; k < maxhash; k++) { long m; float pctused; remainder = 1000 * DBL_MIN; for (m = 0; m < maxhash; m++) if (k != m) { remainder = remainder + ptc[m]; }; nodes = nodestarts[k]; // percent usage- hack due to "0" being a sentinel for no space pctused = nodes[0].abet[0].nextcell; if (nodes[0].abet[0].nextcell == 0) pctused = nodelens[k]; pctused = 100.0 * pctused / nodelens[k]; sprintf (buf, "#%ld (%s):" \ " features: %lld (%.0f%%), entropy: %8f, jumps: %ld, prob: %3.2e, pR: %6.2f \n", k, hashname[k], // nodes[0].total_count, *totalbits[k], pctused, total_entropy[k], firjumps[k], ptc[k], (log10 (ptc[k]) - log10 (remainder) ) ); // strcat (stext, buf); if (strlen(stext)+strlen(buf) <= stext_maxlen) strcat (stext, buf); }; // check here if we got enough room in stext to stuff everything // perhaps we'd better rise a nonfatalerror, instead of just // whining on stderr if (strcmp(&(stext[strlen(stext)-strlen(buf)]), buf) != 0) { nonfatalerror5( "WARNING: not enough room in the buffer to create " "the statistics text. Perhaps you could try bigger " "values for MAX_CLASSIFIERS or MAX_FILE_NAME_LEN?", " ", CRM_ENGINE_HERE); }; crm_destructive_alter_nvariable (svrbl, svlen, stext, strlen (stext)); }; }; // cleanup time! // remember to let go of the fd's and mmaps for (k = 0; k < maxhash; k++) { // close (hfds [k]); crm_munmap_file ((void *) fmaps[k]); }; // // Free the hashnames, to avoid a memory leak. // for (i = 0; i < maxhash; i++) { /////////////////////////////////////// // ! XXX SPAMNIX HACK! //! -- by Barry Jaspan // //! Without the statement "k = i" (which should have no effect), //! the for statement crashes on MacOS X when compiled with gcc //! -O3. I've examined the pointers being freed, and they appear //! valid. I've run this under Purify on Windows, valgrind on //! Linux, and efence on MacOS X; none report a problem here //! (though valgrind reports umrs in the VHT code; see my post to //! crm114-developers). I've also examined the assembler produced //! with various changes here and, though I don't speak PPC, w/o //! the k = i it is qualitatively different. //! //! For now, I'm concluding it is an optimizer bug, and fixing it //! with the "k = i" statement. This occurs on MacOS X 10.2 with //! Apple Computer, Inc. GCC version 1175, based on gcc version //! 3.1 20020420 (prerelease). // k = i; free (hashname[i]); } if (tprob <= 0.5000) { if (user_trace) fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n"); // and do what we do for a FAIL here csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1; csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1; return (0); }; // // all done... if we got here, we should just continue execution if (user_trace) fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n"); return (0); };