// crm_str_funcs.c - Controllable Regex Mutilator, version v1.0
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may obtain a copy of the
// GPL by visiting the Free Software Foundation's web site at
// www.fsf.org, and a copy is included in this distribution.
//
// Other licenses may be negotiated; contact the
// author for details.
//
// include some standard files
#include "crm114_sysincludes.h"
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
// Globals shared with the rest of CRM114. They are only declared here
// (extern); the storage is defined in another translation unit.
//
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxiliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
///////////////////////////////////////////////////////////////////////////
//
// This code section (from this comment block to the one declaring
// "end of section dual-licensed to Bill Yerazunis and Joe
// Langeway") is copyrighted and dual licensed by and to both Bill
// Yerazunis and Joe Langeway; both have full rights to the code in
// any way desired, including the right to relicense the code in
// any way desired.
//
// Vectorized stringhashing - get a bunch of features in a nice
// predigested form (a counted array of chars plus control params
// go in, and a nice array of 32-bit ints comes out). The idea is to
// encapsulate tokenization/hashing into one function that all
// CRM114 classifiers can use, and so improved tokenization raises
// all boats equally, or something like that.
//
// If you need two sets of hashes, call this routine twice, with
// different pipeline coefficient arrays.
//
// If the features_out area becomes close to overflowing, then
// vector_stringhash will return with a value of next_offset <=
// textlen. If next_offset is > textlen, then there is nothing
// more to hash.
//
// The feature building is controlled via the pipeline coefficient
// arrays as described in the paper "A Unified Approach To Spam
// Filtration". In short, each row of an array describes one
// rendition of an arbitrarily long pipeline of hashed token
// values; each row of the array supplies one output value. Thus,
// the 1x1 array {1} yields unigrams, the 5x6 array
//
// {{ 1 3 0 0 0 0}
// { 1 0 5 0 0 0}
// { 1 0 0 11 0 0}
// { 1 0 0 0 23 0}
// { 1 0 0 0 0 47}}
//
// yields "Classic CRM114" OSB features, and the 2x3 array
//
// {{1 1 0}
// {1 0 1}}
//
// yields bigrams that are not position nor order sensitive, while
//
// {{1 2 0}
// {1 0 2}}
//
// yields bigrams that are order sensitive, but not position sensitive.
//
// Because the array elements are used as dot-product multipliers
// on the hashed token value pipeline, there is a small advantage to
// having the elements of the array being odd (low bit set) and
// relatively prime, as it decreases the chance of hash collisions.
//
///////////////////////////////////////////////////////////////////////////
//   crm_vector_stringhash - turn a counted text buffer into an array
//   of hashed features under control of a pipeline coefficient array.
//
//   Current state: the tokenize/hash/pipeline loop has not been written
//   yet.  Until it is, this routine compiles the parsing regex (so a
//   bad regex is still reported), stores a feature count of zero, and
//   sets *next_offset past the end of the text so that a caller which
//   re-invokes while next_offset <= textlen stops instead of spinning.
//
//   Returns 0 on success, -1 if the parsing regex failed to compile.
long crm_vector_stringhash
(
  char *text,           // input string (null-safe!)
  long textlen,         // how many bytes of input.
  long start_offset,    // start tokenizing at this byte.
  char *regex,          // the parsing regex (might be ignored)
  long regexlen,        // length of the parsing regex
  float *coeff_array,   // the pipeline coefficient control array
  long pipe_len,        // how long a pipeline (== coeff_array row length)
  long pipe_iters,      // how many rows are there in coeff_array
  long *features,       // where the output features go
  long featureslen,     // how many output features (max)
  long *features_out,   // how many longs did we actually use up
  long *next_offset     // next invocation should start at this offset
)
{
  regex_t regcb;        // the compiled regex
  int regcomp_status;   // result of compiling the parsing regex

  //   Store the count THROUGH the pointer.  Assigning to the pointer
  //   itself only changes our local copy, so the caller would never
  //   see a count at all.
  if (features_out)
    *features_out = 0;

  //   No tokenizer yet, so no more features can come from this text;
  //   a value > textlen is the "nothing more to hash" signal.
  if (next_offset)
    *next_offset = textlen + 1;

  //   NOTE(review): assumes crm_regcomp follows the POSIX regcomp
  //   convention of returning 0 on success - confirm against its
  //   definition.
  regcomp_status = crm_regcomp (&regcb, regex, regexlen, REG_EXTENDED);
  if (regcomp_status != 0)
    return (-1);

  //   TODO: the tokenizing loop goes here.  It will need a pipeline of
  //   UNIFIED_WINDOW_LEN hashes, a regmatch_t array (only the
  //   outermost match matters), and must update *features_out and
  //   *next_offset as it consumes text starting at start_offset.  The
  //   previous placeholder was an empty "while (keepgoing)" loop that
  //   never cleared its own controller, so it could never finish.
  //
  //   NOTE(review): regcb is never released.  The matching release
  //   routine for crm_regcomp is not visible from this file; call it
  //   here once it is confirmed.

  return (0);
}
///////////////////////////////////////////////////////////////////////////
//
// End of code section dual-licensed to Bill Yerazunis and Joe Langeway.
//
////////////////////////////////////////////////////////////////////////////
// syntax highlighted by Code2HTML, v. 0.9.1