// crm_str_funcs.c - Controllable Regex Mutilator, version v1.0 // Copyright 2001-2006 William S. Yerazunis, all rights reserved. // // This software is licensed to the public under the Free Software // Foundation's GNU GPL, version 2. You may obtain a copy of the // GPL by visiting the Free Software Foundations web site at // www.fsf.org, and a copy is included in this distribution. // // Other licenses may be negotiated; contact the // author for details. // // include some standard files #include "crm114_sysincludes.h" // include any local crm114 configuration file #include "crm114_config.h" // include the crm114 data structures file #include "crm114_structs.h" // and include the routine declarations file #include "crm114.h" // the command line argc, argv extern int prog_argc; extern char **prog_argv; // the auxilliary input buffer (for WINDOW input) extern char *newinputbuf; // the globals used when we need a big buffer - allocated once, used // wherever needed. These are sized to the same size as the data window. extern char *inbuf; extern char *outbuf; extern char *tempbuf; /////////////////////////////////////////////////////////////////////////// // // This code section (from this comment block to the one declaring // "end of section dual-licensed to Bill Yerazunis and Joe // Langeway" is copyrighted and dual licensed by and to both Bill // Yerazunis and Joe Langeway; both have full rights to the code in // any way desired, including the right to relicense the code in // any way desired. // // Vectorized stringhashing - get a bunch of features in a nice // predigested form (a counted array of chars plus control params // go in, and a nice array of 32-bit ints come out. The idea is to // encapsulate tokenization/hashing into one function that all // CRM114 classifiers can use, and so improved tokenization raises // all boats equally, or something like that. 
//
//    If you need two sets of hashes, call this routine twice, with
//    different pipeline coefficient arrays.
//
//    If the features_out area becomes close to overflowing, then
//    crm_vector_stringhash will return with a value of next_offset <=
//    textlen.  If next_offset is > textlen, then there is nothing
//    more to hash.
//
//    The feature building is controlled via the pipeline coefficient
//    arrays as described in the paper "A Unified Approach To Spam
//    Filtration".  In short, each row of an array describes one
//    rendition of an arbitrarily long pipeline of hashed token
//    values; each row of the array supplies one output value.  Thus,
//    the 1x1 array {1} yields unigrams, the 5x6 array
//
//     {{ 1  3  0  0  0  0}
//      { 1  0  5  0  0  0}
//      { 1  0  0 11  0  0}
//      { 1  0  0  0 23  0}
//      { 1  0  0  0  0 47}}
//
//    yields "Classic CRM114" OSB features, and the 2x3 array
//
//     {{1 1 0}
//      {1 0 1}}
//
//    yields bigrams that are not position nor order sensitive, while
//
//     {{1 2 0}
//      {1 0 2}}
//
//    yields bigrams that are order sensitive, but not position sensitive.
//
//    Because the array elements are used as dot-product multipliers
//    on the hashed token value pipeline, there is a small advantage to
//    having the elements of the array being odd (low bit set) and
//    relatively prime, as it decreases the chance of hash collisions.
//
//    CURRENT STATE: the tokenize/hash loop described above is not
//    implemented in this function yet.  What the body below does:
//
//      - stores 0 into *features_out (no features are produced),
//      - stores textlen + 1 into *next_offset, i.e. the documented
//        "nothing more to hash" condition, so that a caller looping on
//        next_offset terminates,
//      - if a regex was supplied (non-NULL, regexlen > 0), compiles it
//        once to validate it and releases it again.
//
//    Returns 0 normally, or the nonzero status from crm_regcomp when the
//    supplied regex does not compile.  The features array is never
//    written.
//
///////////////////////////////////////////////////////////////////////////

long crm_vector_stringhash
(
   char *text,             // input string (null-safe!)
   long textlen,           //   how many bytes of input.
   long start_offset,      //     start tokenizing at this byte.
   char *regex,            // the parsing regex (might be ignored)
   long regexlen,          //   length of the parsing regex
   float *coeff_array,     // the pipeline coefficient control array
   long pipe_len,          //  how long a pipeline (== coeff_array row length)
   long pipe_iters,        //  how many rows are there in coeff_array
   long *features,         // where the output features go
   long featureslen,       //   how many output features (max)
   long *features_out,     // how many longs did we actually use up
   long *next_offset       // next invocation should start at this offset
 )
{
  regex_t regcb;                   // the compiled regex
  int regcomp_status = 0;          // status handed back by crm_regcomp

  //    These inputs are only needed by the (not yet written) tokenizing
  //    loop; reference them so their non-use is visibly intentional.
  //    TODO: that loop will also need a hash pipeline of
  //    UNIFIED_WINDOW_LEN longs and a regmatch_t array for the matches.
  (void) text;
  (void) start_offset;
  (void) coeff_array;
  (void) pipe_len;
  (void) pipe_iters;
  (void) features;
  (void) featureslen;

  //    Zero the caller's counter.  This has to write THROUGH the pointer;
  //    assigning 0 to features_out itself would only null the local copy
  //    and leave the caller's count untouched.
  if (features_out)
    *features_out = 0;

  //    Nothing is tokenized here, so tell the caller there is nothing
  //    more to hash (next_offset > textlen).
  //    NOTE(review): placeholder until the real loop computes the true
  //    resume offset.
  if (next_offset)
    *next_offset = textlen + 1;

  //    The regex "might be ignored": only compile when one was supplied.
  if (regex && regexlen > 0)
    {
      regcomp_status = crm_regcomp (&regcb, regex, regexlen, REG_EXTENDED);

      //    NOTE(review): assumes crm_regcomp follows the POSIX regcomp
      //    convention (0 == success) - confirm against its definition.
      if (regcomp_status != 0)
        return (regcomp_status);

      //    The compiled pattern is not used yet; release it so repeated
      //    calls do not accumulate compiled regexes.
      crm_regfree (&regcb);
    }

  return (0);
}

///////////////////////////////////////////////////////////////////////////
//
//    End of code section dual-licensed to Bill Yerazunis and Joe Langeway.
//
////////////////////////////////////////////////////////////////////////////