ports//mail/crm114/work/crm114-20070810-BlameTheSegfault.src/crm114

//  crm114_config.h  - Controllable Regex Mutilator base config, version X0.1
//  Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org .  Other licenses may be negotiated; contact the 
//  author for details.  
//
///////////////////////////////////////////////////////////////////
//
//    Configuration for CRM114.  Some things here you can change
//    with relative impunity.  Other things, not so much.  Where 
//    there are limiting factors noted, please obey them or you 
//    may break something important.  And, of course, realize that
//    this is GPLed software with NO WARRANTY - make any changes 
//    and that goes double. 
//
///////////////////////////////////////////////////////////////////
//
//   default size of the variables hashtable (a.k.a. the VHT)
#define DEFAULT_VHT_SIZE 4095
 
//   default limit on the control stack (for catching infinite loops,
//   not a preallocated variable)
#define DEFAULT_CSTK_LIMIT 1024

//   how many levels (pending operations) will we allow in 
//   math evaluations.  We _could_ have it be unlimited, but
//   this serves as an error catcher in runaway programs.
#define DEFAULT_MATHSTK_LIMIT 1024

//   default maximum number of lines in any program file
#define DEFAULT_MAX_PGMLINES 10000

//   define maximum number of INSERTs before we think we're in an
//   infinite loop...
#define DEFAULT_MAX_INSERTS 1024

//   default size of the data window: 8 megabytes.
#define DEFAULT_DATA_WINDOW  8388608
//#define DEFAULT_DATA_WINDOW 16777216
//#define DEFAULT_DATA_WINDOW 1048576

//    mmap cacheing length - only actually write out this often.
//     set to 0 to disable mmap cacheing and release files faster.
//      However, this has a negative speed impact. 
//#define UNMAP_COUNT_MAX 0
//#define UNMAP_COUNT_MAX 2
#define UNMAP_COUNT_MAX 1000

//    What's the smallest chunk we actually want to bother reclaiming
//    on the fly out of the isolated data area "tdw".  Set this to 1
//    for agressive compression; values like 100 to 10K can speed up
//    execution of things that thrash the tdw badly; set to larger
//    than the data window size to completely disable the on-the-fly
//    reclaimer.  Watch out though- values less than 1 can cause the
//    end of one variable to overlap the start of another; this causes
//    horrible problems.  FOR LATER IMPROVEMENT: Start with a
//    relatively large reclaimer value, then decrease slowly as memory
//    becomes more scarce.
#define MAX_RECLAIMER_GAP 5

//    How many regex compilations do we cache?  (this saves the time
//    to recompile regexes in a loop, but uses memory) Set to zero to
//    disable cacheing.  Note that we cache the actual regex, not the
//    source code line, so this happens *after* the regex text is var
//    expanded; two different expressions that evaluate to the same
//    actual regex will share the same cache slot, which is pretty
//    cool.
//
//    For programs that don't loop, or reuse the same regex a lot, 
//    performance is slightly better with cacheing disabled.  But if you 
//    do reuse the same regexes tens or hundreds of times (say, lots of 
//    LIAF-loops) then cacheing can accelerate your program significantly.
//
//#define CRM_REGEX_CACHESIZE 0
//#define CRM_REGEX_CACHESIZE 10
#define CRM_REGEX_CACHESIZE 1024
//
//    and how do we want the regex cache to work?  RANDOM_ACCESS can
//    keep more things around, but is only 1 LRU deep for each slot so
//    use plenty of slots, like 256 or more.  LINEAR_SEARCH is a
//    strict LRU cache but that's slower; don't use too many slots
//    with LINEAR_SEARCH or you'll spend more time searching the cache
//    than you would have spent just recompiling the regex.
//
//    Be sure to turn on ONLY ONE of these !!!!
//
#define REGEX_CACHE_RANDOM_ACCESS
//#define REGEX_CACHE_LINEAR_SEARCH


//    do we use Sparse Binary Polynomial Hashing (sensitive to both
//    sequence and spacing of individual words), Token Grab Bag, or
//    Token Sequence Sensitive?  Testing against the SpamAssassin
//    "hard" database shows that SBPH, TGB, and TGB2, are somewhat
//    more accurate than TSS, and about 50% more accurate than First
//    Order Only.  However, that's for English, and other natural
//    languages may show a different statistical distribution.
//
//    Choose ONE of the following:
//          SBPH, TGB2, TGB, TSS, or ARBITRARY_WINDOW_LEN:
//   
//    *** DANGER, WILL ROBINSON ***  You MUST rebuild your .css files from
//    samples of text if you change this.  
//
//
//     Sparse Binary Polynomial Hashing
#define SBPH
//
//     Token Grab Bag, noaliasing
//#define TGB2
//
//     Token Grab Bag, aliasing
//#define TGB
//
//     Token Sequence Sensitive
//#define TSS
//
//     First Order Only (i.e. single words, like SpamBayes)
//    Note- if you use FOO, you must turn off weights!!
//#define FOO
//
//     Generalized format for the window length.
//
//  DO NOT SET THIS TO MORE THAN 10 WITHOUT LENGTHENING hctable
//  the classifier modules !!!!!!  "hctable" contains the pipeline
//  hashing coefficients and needs to be extended to 2 * WINDOW_LEN
//
//     Generic window length code
//#define ARBITRARY_WINDOW_LENGTH
//
#define MARKOVIAN_WINDOW_LEN 5
//
#define OSB_BAYES_WINDOW_LEN 5
//
//      DO NOT set this to more than 5 without lengthening the
//      htup1 and htup2 tables in crm_unified_bayes.c
//
#define UNIFIED_BAYES_WINDOW_LEN 5
//
//      Unified tokenization pipeline length.
#define UNIFIED_WINDOW_LEN 32
////   
//         Winnow algorithm parameters here...  
//
#define OSB_WINNOW_WINDOW_LEN 5
#define OSB_WINNOW_PROMOTION 1.23
#define OSB_WINNOW_DEMOTION 0.83
//
//     Now, choose whether we want to use the "old" or the "new" local
//     probability calculation.  The "old" one works slightly better
//     for SBPH and much better for TSS, the "new" one works slightly
//     better for TGB and TGB2, and _much_ better for FOO
//
//     The current default (not necessarily optimal) 
//     is Markovian SBPH, STATIC_LOCAL_PROBABILITIES, 
//     LOCAL_PROB_DENOM = 16, and SUPER_MARKOV
//
//#define LOCAL_PROB_DENOM 2.0 
#define LOCAL_PROB_DENOM 16.0 
//#define LOCAL_PROB_DENOM 256.0 
#define STATIC_LOCAL_PROBABILITIES
//#define LENGTHBASED_LOCAL_PROBABILITIES
//
//#define ENTROPIC_WEIGHTS
//#define MARKOV_WEIGHTS
#define SUPER_MARKOV_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS
//#define BCS_MWS_WEIGHTS
//#define BCS_EXP_WEIGHTS 
//
//
//    Do we use learncount-based normalization in calculating probabilities?
#define OSB_LEARNCOUNTS
//
//    Do we want "compatibility mode" between .css files LEARNed under
//    Markovian versus those LEARNed with OSB or OSB/Unique?  Note that
//    the default is NOT compatible with Markov; the reason for this is
//    that Bill Y. screwed up his subscripts and caused an unintentional
//    yet hard to fix fork in the .css table hashing formula.  Both work,
//    both work exactly equally well, yet they are incompatible and you
//    must choose one or the other.  If you turn on OLD_MARKOV_COMPATIBILITY,
//    you can flip (mostly) back and forth between <microgroom> and
//    <osb microgroom>.  But if you've already built any OSB .css files,
//    you can't do this flip.  But if you have older OSB .css files,
//    you can't turn this on without having the data become inaccessible.
//    And there's no easy way to translate (due to the hashing obfuscation)
//    Anyway, it's better to rebuild from fresh spam anyway, so blame Bill.
//             - Bill
// #define OLD_MARKOV_COMPATIBILITY
//
//    Do we take only the maximum probability feature?
//
//#define USE_PEAK
//
//
//    Should we use stochastic microgrooming, or weight-distance microgrooming-
//    Make sure ONE of these is turned on.
//#define STOCHASTIC_AMNESIA
#define WEIGHT_DISTANCE_AMNESIA
//
//    define the default max chain length in a .css file that triggers
//    autogrooming, the rescale factor when we rescale, and how often
//    we rescale, and what chance (mask and key) for any particular 
//    slot to get rescaled when a rescale is triggered for that slot chain.
//#define MICROGROOM_CHAIN_LENGTH 1024
#define MICROGROOM_CHAIN_LENGTH 256
//#define MICROGROOM_CHAIN_LENGTH 64
#define MICROGROOM_RESCALE_FACTOR .75
#define MICROGROOM_STOCHASTIC_MASK 0x0000000F
#define MICROGROOM_STOCHASTIC_KEY  0x00000001
#define MICROGROOM_STOP_AFTER 32    //  maximum number of buckets groom-zeroed
                                    
#define FEATURE_HIT_INCREMENT_SIZE 7

//    define the "block ratio" of how of a memory data window we're
//    willing to suck in from a minion process before we block on
//    sucking; the un-sucked part just waits in the minion's stdout
//    buffer (and causes the minion to block on output).  Normally a
//    factor of 2 (1/4th of the size of a full memory window, or 2
//    megabytes in the default configuraton) is sufficient.
#define SYSCALL_WINDOW_RATIO 2

//   define default internal debug level
#define DEFAULT_INTERNAL_TRACE_LEVEL 0

//   define default user debug level
#define DEFAULT_USER_TRACE_LEVEL 0

//   define maximum number of parenthesized sub regexes we'll accept
#define MAX_SUBREGEX 256

//   define maximum bracket depth nesting we'll allow....
#define MAX_BRACKETDEPTH 256

//   define maximum number of iterations allowed for EVAL expansion
//#define MAX_EVAL_ITERATIONS 16384
//#define MAX_EVAL_ITERATIONS 1024
#define MAX_EVAL_ITERATIONS 4096

//   define maximum size of a pattern in bytes
#define MAX_PATTERN 16384

//    and how long can a variable name be
#define MAX_VARNAME 2048

//   define the default number of bytes in a learning file hash table
//   (note that this should be a prime number, or at least one with a 
//    lot of big factors)
//
//       this value (2097153) is one more than 2 megs, for a .css of 24 megs
//#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 2097153
//
//       this value (1048577) is one more than a meg, for a .css of 12 megs
//       for the Markovian, and half that for OSB classifiers
#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 1048577
#define DEFAULT_MARKOVIAN_SPARSE_SPECTRUM_FILE_LENGTH 1048577
#define DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH 524287 // Mersenne prime
#define DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH 1048577
//#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 2000000
#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 1000000

//    For the hyperspace matcher, we need to define a few things.
#define HYPERSPACE_MAX_FEATURE_COUNT 500000

//   Stuff for bit-entropic configuration
//   Define the size of our alphabet, and how many bits per alph.
#define ENTROPY_ALPHABET_SIZE 2
#define ENTROPY_CHAR_SIZE 1
#define ENTROPY_CHAR_BITMASK 0x1
//       What fraction of the nodes in a bit-entropic file should be
//       referenceable from the FIR prior arithmetical encoding
//       lookaside table?  0.01 is 1% == average of 100 steps to find
//       the best node.  0.2 is 20% or 5 steps to find the best node.
#define BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION 0.1
#define BIT_ENTROPIC_FIR_LOOKASIDE_STEP_LIMIT 128
#define BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT 0.5
#define BIT_ENTROPIC_SHUFFLE_HEIGHT 1024  //   was 256
#define BIT_ENTROPIC_SHUFFLE_WIDTH 1024   //   was 256
#define BIT_ENTROPIC_PROBABILITY_NERF 0.0000000000000000001

//    define the maximum length of a filename
// #define MAX_FILE_NAME_LEN 255

// defaults to system's, if any
#ifdef NAME_MAX
  #define MAX_FILE_NAME_LEN NAME_MAX+1
#else
  #ifdef FILENAME_MAX
    #define MAX_FILE_NAME_LEN FILENAME_MAX+1
  #else
    #define MAX_FILE_NAME_LEN 256
  #endif
#endif

//    define how many microseconds to sleep waiting for a minion process
//    to complete:
//#define MINION_SLEEP_USEC 1000000
//#define MINION_SLEEP_USEC 10000
//#define MINION_SLEEP_USEC 1000
//#define MINION_SLEEP_USEC 100
#define MINION_SLEEP_USEC 10

//    How many microseconds to sleep if we're looping on input WINDOW stmt.
//    try 1 millisecond for now
#define INPUT_WINDOW_SLEEP_USEC 1000

//     Maximum number of different .CSS files in a CLASSIFY
#define MAX_CLASSIFIERS 128

//     Maximum number of nonfatal errors we'll allow before tossing our
//     cookies on a fatal error
#define MAX_NONFATAL_ERRORS 100

//     How big is a feature bucket?  Is it a byte, a short, a long, 
//     a float, whatever.  :)
#define FEATUREBUCKET_TYPE FEATUREBUCKET_STRUCT
//#define FEATUREBUCKET_VALUE_MAX 32767
#define FEATUREBUCKET_VALUE_MAX 1000000000
#define FEATUREBUCKET_HISTOGRAM_MAX 4096
//#define FEATUREBUCKET_TYPE unsigned short

//    End of configurable parameters.
#define HERE (runtime system location: __FILE__ ( __LINE__ ) function __FUNCTION__ )
syntax highlighted by Code2HTML, v. 0.9.1