//  crm_str_funcs.c  - Controllable Regex Mutilator,  version v1.0
//  Copyright 2001-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact the 
//  author for details.  
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//    the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//    the auxiliary input buffer (for WINDOW input)
extern char *newinputbuf;

//    the globals used when we need a big buffer  - allocated once, used 
//    wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;


///////////////////////////////////////////////////////////////////////////
//
//    This code section (from this comment block to the one declaring
//    "end of section dual-licensed to Bill Yerazunis and Joe
//    Langeway") is copyrighted and dual licensed by and to both Bill
//    Yerazunis and Joe Langeway; both have full rights to the code in
//    any way desired, including the right to relicense the code in
//    any way desired.
//
//    Vectorized stringhashing - get a bunch of features in a nice
//    predigested form (a counted array of chars plus control params
//    goes in, and a nice array of 32-bit ints comes out).  The idea is
//    to encapsulate tokenization/hashing into one function that all
//    CRM114 classifiers can use, and so improved tokenization raises
//    all boats equally, or something like that.
//
//    If you need two sets of hashes, call this routine twice, with
//    different pipeline coefficient arrays.
//
//    If the features_out area becomes close to overflowing, then
//    crm_vector_stringhash will return with a value of next_offset <=
//    textlen.  If next_offset is > textlen, then there is nothing
//    more to hash.
//
//    The feature building is controlled via the pipeline coefficient
//    arrays as described in the paper "A Unified Approach To Spam
//    Filtration".  In short, each row of an array describes one
//    rendition of an arbitrarily long pipeline of hashed token
//    values; each row of the array supplies one output value.  Thus,
//    the 1x1 array {1} yields unigrams, the 5x6 array
//
//     {{ 1 3 0 0 0 0}
//      { 1 0 5 0 0 0}
//      { 1 0 0 11 0 0}
//      { 1 0 0 0 23 0}
//      { 1 0 0 0 0 47}}
//
//    yields "Classic CRM114" OSB features, and the 2x3 array 
//
//     {{1 1 0}
//      {1 0 1}}
//
//    yields bigrams that are not position nor order sensitive, while
//
//     {{1 2 0}
//      {1 0 2}}
//
//    yields bigrams that are order sensitive, but not position sensitive.
// 
//    Because the array elements are used as dot-product multipliers
//    on the hashed token value pipeline, there is a small advantage to
//    having the elements of the array being odd (low bit set) and
//    relatively prime, as it decreases the chance of hash collisions.
//
///////////////////////////////////////////////////////////////////////////

//    crm_vector_stringhash - tokenize TEXT and hash the tokens into
//    FEATURES under control of the pipeline coefficient array.
//
//    NOTE: the tokenizing/hashing loop is not implemented yet.  As it
//    stands this routine compiles the parsing regex, reports zero
//    features through *features_out, and reports "nothing more to
//    hash" through *next_offset (a value > textlen), so that a caller
//    that re-invokes until the text is exhausted will stop.
//
//    Returns 0 on success, or -1 if the parsing regex did not compile.
//
long crm_vector_stringhash 
(
   char *text,             // input string (null-safe!)
   long textlen,           //   how many bytes of input.
   long start_offset,      //     start tokenizing at this byte.
   char *regex,            // the parsing regex (might be ignored)
   long regexlen,          //   length of the parsing regex
   float *coeff_array,     // the pipeline coefficient control array
   long pipe_len,          //  how long a pipeline (== coeff_array row length)
   long pipe_iters,        //  how many rows are there in coeff_array
   long *features,         // where the output features go
   long featureslen,       //   how many output features (max)
   long *features_out,     // how many longs did we actually use up
   long *next_offset       // next invocation should start at this offset
   )
{
  long keepgoing;                   // the loop controller
  regex_t regcb;                    // the compiled regex
  int regcomp_status;               // result of compiling the regex

  //    Give both outputs defined values before anything can fail.
  //    The counter must be zeroed through the pointer; assigning to
  //    the pointer itself would only null our local copy and leave
  //    the caller's counter untouched.
  if (features_out)
    *features_out = 0;

  //    No tokens are consumed yet, so tell the caller there is
  //    nothing more to hash (next_offset > textlen).
  if (next_offset)
    *next_offset = textlen + 1;

  regcomp_status = crm_regcomp (&regcb, regex, regexlen, REG_EXTENDED);

  //    Presumably crm_regcomp follows POSIX regcomp and returns
  //    nonzero on failure - TODO confirm against its definition.
  if (regcomp_status != 0)
    return (-1);

  //    NOTE(review): regcb is not released on this path.  Once the
  //    loop below actually uses it, every exit path must free it with
  //    the project's regex-free wrapper.

  keepgoing = 1;
  while (keepgoing)
    {
      //    TODO: match tokens with regcb, push their hashes down the
      //    pipeline, and emit one feature per coeff_array row.
      //    Until that exists, stop at once so the call returns
      //    instead of spinning forever.
      keepgoing = 0;
    }
  return (0);
}

///////////////////////////////////////////////////////////////////////////
//
//   End of code section dual-licensed to Bill Yerazunis and Joe Langeway.
//
////////////////////////////////////////////////////////////////////////////


//  syntax highlighted by Code2HTML, v. 0.9.1