ports//mail/crm114/work/crm114-20070810-BlameTheSegfault.src/crm_osbf

//  crm_osbf_maintenance_.c  - Controllable Regex Mutilator,  version v1.0
//  Copyright 2004-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact the 
//  author for details.  
//
//  OBS: CSS header structure and pruning method modified for OSBF classifier.
//       See functions crm_osbf_microgroom and crm_osbf_create_cssfile, below,
//       for details.  -- Fidelis Assis - 2004/10/20
//
//  include some standard files

#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

#include "crm114_osbf.h"

/* Version names */
char *CSS_version_name[] = {
  "SBPH-Markovian",
  "OSB-Bayes",
  "Correlate",
  "Neural",
  "OSB-Winnow",
  "OSBF-Bayes",
  "Unknown"
};

//    the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//    the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;

//    the globals used when we need a big buffer  - allocated once, used 
//    wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;

//    microgroom flag for osbf
static int osbf_microgroom;
// turn microgroom on (1) or off (0)
void crm_osbf_set_microgroom(int value)
{
  osbf_microgroom = value;
}


//
//     How to microgroom a .css file that's getting full
//
//     NOTA BENE NOTA BENE NOTA BENE NOTA BENE
//      
//         This whole section of code is under intense develoment; right now
//         it "works" but not any better than nothing at all.  Be warned
//         that any patches issued on it may well never see the light of
//         day, as intense testing and comparison may show that the current
//         algorithms are, well, suckful.
//
//
//     There are two steps to microgrooming - first, since we know we're
//     already too full, we execute a 'zero unity bins'.
//
void
crm_osbf_microgroom (OSBF_FEATURE_HEADER_STRUCT * header,
		     unsigned long hindex)
{
  long i, j, k;
  static long microgroom_count = 0;
  long packstart;
  long packlen;
  long zeroed_countdown, max_zeroed_buckets;
  long min_value, min_value_any, distance, max_distance;
  int groom_any = 0;
  OSBF_FEATUREBUCKET_STRUCT *h;

  // if not set by command line, use default
  if (microgroom_chain_length == 0)
    microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;
  if (microgroom_stop_after == 0)
    microgroom_stop_after = OSBF_MICROGROOM_STOP_AFTER;


  // make h point to the first feature bucket
  h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;

  zeroed_countdown = microgroom_stop_after;
  i = j = k = 0;
  microgroom_count++;

  if (user_trace)
    {
      if (microgroom_count == 1)
	fprintf (stderr, "CSS file too full: microgrooming this css chain: ");
      fprintf (stderr, " %ld ", microgroom_count);
    };

//   micropack - start at initial chain start, move to back of 
//   chain that overflowed, then scale just that chain.

  i = j = hindex % header->buckets;
  min_value = OSBF_FEATUREBUCKET_VALUE_MAX;
  min_value_any = GET_BUCKET_VALUE (h[i]);
  while (BUCKET_IN_CHAIN (h[i]))
    {
      if (GET_BUCKET_VALUE (h[i]) < min_value && !BUCKET_IS_LOCKED (h[i]))
	min_value = GET_BUCKET_VALUE (h[i]);
      if (GET_BUCKET_VALUE (h[i]) < min_value_any)
	min_value_any = GET_BUCKET_VALUE (h[i]);
      if (i == 0)
	i = header->buckets - 1;
      else
	i--;
      if (i == j)
	break;			// don't hang if we have a 100% full .css file
      // fprintf (stderr, "-");
    }

  if (min_value == OSBF_FEATUREBUCKET_VALUE_MAX)
    {
      /* no unlocked bucket avaiable so groom any */
      groom_any = 1;
      min_value = min_value_any;
    }

  //     now, move our index to the first bucket in this chain.
  i++;
  if (i >= header->buckets)
    i = 0;
  packstart = i;

  /* i = j = hindex % header->buckets; */
  while (BUCKET_IN_CHAIN (h[i]))
    {
      i++;
      if (i == header->buckets)
	i = 0;
      if (i == packstart)
	break;		// don't hang if we have a 100% full .cfc file
    }
  //     now, our index is right after the last bucket in this chain.

  /* if there was a wraparound, full .cfc file,    */
  /* i == packstart and packlen == header->buckets */
  if (i > packstart) 
  packlen = i - packstart;
  else
    packlen = header->buckets + i - packstart;

//   This pruning method zeroes buckets with minimum count in the chain.
//   It tries first buckets with minimum distance to their right position,
//   to increase the chance of zeroing older buckets first. If none with
//   distance 0 is found, the distance is increased until at least one
//   bucket is zeroed.
//
//   We keep track of how many buckets we've zeroed and we stop
//   zeroing additional buckets after that point.   NO!  BUG!  That 
//   messes up the tail length, and if we don't repack the tail, then
//   features in the tail can become permanently inaccessible!   Therefore,
//   we really can't stop in the middle of the tail (well, we could 
//   stop zeroing, but we need to pass the full length of the tail in.
// 
//   Note that we can't do this "adaptively" in packcss, because zeroes
//   there aren't necessarily overflow chain terminators (because -we-
//   might have inserted them here.
//
//   GROT GROT GROT  Note that the following algorithm does multiple 
//   passes to find the lowest-valued features.  In fact, that's 
//   actually rather slow; a better algorithm would keep track of
//   the N least-valued features in the chain in ONE pass and zero 
//   those.
//  
//   --
//   I'm not sure if it's worth working on a better algoritm for this:
//
//   This is a statistics report of microgroomings for 4147 messages
//   of the SpamAssassin corpus. It shows that 77% is done in a single
//   pass, 95.2% in 1 or 2 passes and 99% in at most 3 passes.
//
//   # microgrommings   passes   %    accum. %
//        232584           1    76.6   76.6
//         56396           2    18.6   95.2
//         11172           3     3.7   98.9
//          2502           4     0.8   99.7
//           726           5     0.2   99.9
//           ...
//   -----------
//        303773
//
//   If we consider only the last 100 microgroomings, when the css
//   file is full, we'll have the following numbers showing that most
//   microgroomings (61%) are still done in a single pass, almost 90%
//   is done in 1 or 2 passes and 97% are done in at most 3 passes:
//
//   # microgrommings   passes   %    accum. %
//          61             1    61      61
//          27             2    27      88
//           9             3     9      97
//           3             4     3     100
//         ---
//         100
//
//   So, it's not so slow. Anyway, a better algorithm could be
//   implemented using 2 additional arrays, with MICROGROOM_STOP_AFTER
//   positions each, to store the indexes of the candidate buckets
//   found with distance equal to 1 or 2 while we scan for distance 0.
//   Those with distance 0 are zeroed immediatelly. If none with
//   distance 0 is found, we'll zero the indexes stored in the first
//   array. Again, if none is found in the first array, we'll try the
//   second one. Finally, if none is found in both arrays, the loop
//   will continue until one bucket is zeroed.
//
//   But now comes the question: do the numbers above justify the
//   additional code/work? I'll try to find out the answer
//   implementing it :), but this has low priority for now.
//
//   -- Fidelis Assis
//
  
  // try features in their right place first
  max_distance = 1;

  /* zero up to 50% of packlen */
  /* max_zeroed_buckets = (long) (0.5 * packlen + 0.5); */
  max_zeroed_buckets =  microgroom_stop_after;
  zeroed_countdown = max_zeroed_buckets;

  /*fprintf(stderr, "packstart: %ld,  packlen: %ld, max_zeroed_buckets: %ld\n",
      packstart, packlen, max_zeroed_buckets); */

  // while no bucket is zeroed...
  while (zeroed_countdown == max_zeroed_buckets)
    {
      /*
         fprintf(stderr, "Start: %ld, stop_after: %ld, max_distance: %ld\n", packstart, microgroom_stop_after, max_distance);
       */
      i = packstart;
      while (BUCKET_IN_CHAIN (h[i]) && zeroed_countdown > 0)
	{
	  // check if it's a candidate
	  if (GET_BUCKET_VALUE (h[i]) == min_value &&
	      (!BUCKET_IS_LOCKED (h[i]) || (groom_any != 0)))
	    {
	      // if it is, check the distance
	      distance = i - BUCKET_HASH (h[i]) % header->buckets;
	      if (distance < 0)
	        distance += header->buckets;
	      if (distance < max_distance)
	        {
		  BUCKET_RAW_VALUE (h[i]) = 0;
	          zeroed_countdown--;
	        }
	    }
	  i++;
	  if (i >= header->buckets)
	    i = 0;
	}

//  if none was zeroed, increase the allowed distance between the
//  candidade's position and its right place.

      if (zeroed_countdown == max_zeroed_buckets)
	max_distance++;
    }

  /*
     fprintf (stderr, "Leaving microgroom: %ld buckets zeroed at distance %ld\n", microgroom_stop_after - zeroed_countdown, max_distance - 1);
   */

  //   now we pack the buckets
  crm_osbf_packcss (header, packstart, packlen);

}

void
crm_osbf_packcss (OSBF_FEATURE_HEADER_STRUCT * header,
		  unsigned long packstart, unsigned long packlen)
{

//    How we pack...
//   
//    We look at each bucket, and attempt to reinsert it at the "best"
//    place.  We know at worst it will end up where it already is, and
//    at best it will end up lower (at a lower index) in the file, except
//    if it's in wraparound mode, in which case we know it will not get
//    back up past us (since the file must contain at least one empty)
//    and so it's still below us in the file.


  OSBF_FEATUREBUCKET_STRUCT *h;

  // make h point to the first feature bucket
  h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;

  if (packstart + packlen <= header->buckets)	//  no wraparound in this case
    {
      crm_osbf_packseg (header, packstart, packlen);
    }
  else		//  wraparound mode - do it as two separate repacks
    {
      crm_osbf_packseg (header, packstart, (header->buckets - packstart));
      crm_osbf_packseg (header, 0, (packlen - (header->buckets - packstart)));
    };
}

void
crm_osbf_packseg (OSBF_FEATURE_HEADER_STRUCT * header,
		  unsigned long packstart, unsigned long packlen)
{
  unsigned long ifrom, ito;
  unsigned long thash, tkey;
  OSBF_FEATUREBUCKET_STRUCT *h;

  // make h point to the first feature bucket
  h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;

  if (internal_trace)
    fprintf (stderr, " < %ld %ld >", packstart, packlen);

  // Our slot values are now somewhat in disorder because empty
  // buckets may now have been inserted into a chain where there used
  // to be placeholder buckets.  We need to re-insert slot data in a
  // bucket where it will be found.

  for (ifrom = packstart; ifrom < packstart + packlen; ifrom++)
    {
      //    Now find the next bucket to place somewhere 
      thash = BUCKET_HASH (h[ifrom]);
      tkey = BUCKET_KEY (h[ifrom]);

      if (GET_BUCKET_VALUE (h[ifrom]) == 0)
	{
	  if (internal_trace)
	    fprintf (stderr, "X");
	}
      else
	{
	  ito = thash % header->buckets;
	  // fprintf (stderr, "a %ld", ito);

	  while (BUCKET_IN_CHAIN (h[ito]) &&
		 !BUCKET_HASH_COMPARE (h[ito], thash, tkey))
	    {
	      ito++;
	      if (ito >= header->buckets)
	        ito = 0;
            }

	  //   found an empty slot, put this value there, and zero the
	  //   original one.  Sometimes this is a noop.  We don't care.

	  if (ito != ifrom)
	    {
	      BUCKET_HASH (h[ito]) = thash;
	      BUCKET_KEY (h[ito]) = tkey;
	      // move value and lock together
	      BUCKET_RAW_VALUE (h[ito]) = BUCKET_RAW_VALUE (h[ifrom]);

	      // clean "from" bucket
	      BUCKET_HASH (h[ifrom]) = 0;
	      BUCKET_KEY (h[ifrom]) = 0;
	      BUCKET_RAW_VALUE (h[ifrom]) = 0;
	    }

	  if (internal_trace)
	    {
	      if (ifrom == ito)
		fprintf (stderr, "=");
	      if (ito < ifrom)
		fprintf (stderr, "<");
	      if (ito > ifrom)
		fprintf (stderr, ">");
	    };

	};
    };
}

/* get next bucket index */
unsigned long
crm_osbf_next_bindex (OSBF_FEATURE_HEADER_STRUCT * header,
		      unsigned long hindex)
{
  hindex++;
  if (hindex >= header->buckets)
    hindex = 0;
  return hindex;
}

/* get index of the last bucket in a chain */
unsigned long
crm_osbf_last_in_chain (OSBF_FEATURE_HEADER_STRUCT * header,
			unsigned long hindex)
{
  unsigned long wraparound;
  OSBF_FEATUREBUCKET_STRUCT *hashes;

  hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;

  /* if the bucket is not in a chain, return an index */
  /* out of the buckets space, equal to the number of */
  /* buckets in the file to indicate an empty chain */
  if (!BUCKET_IN_CHAIN (hashes[hindex]))
    return header->buckets;

  wraparound = hindex;
  while (BUCKET_IN_CHAIN (hashes[hindex]))
    {
      hindex++;
      if (hindex >= header->buckets)
	hindex = 0;

      /* if .cfc file is full return an index out of */
      /* the buckets space, equal to number of buckets */
      /* in the file, plus one */
      if (hindex == wraparound)
	return header->buckets + 1;
    }
  hindex = crm_osbf_prev_bindex (header, hindex);
  return hindex;
}


/* get previous bucket index */
unsigned long
crm_osbf_prev_bindex (OSBF_FEATURE_HEADER_STRUCT * header,
		      unsigned long hindex)
{
  if (hindex == 0)
    hindex = header->buckets - 1;
  else
    hindex--;
  return hindex;
}

/* get index of the first bucket in a chain */
unsigned long
crm_osbf_first_in_chain (OSBF_FEATURE_HEADER_STRUCT * header,
			 unsigned long hindex)
{
  unsigned long wraparound;
  OSBF_FEATUREBUCKET_STRUCT *hashes;

  hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;

  /* if the bucket is not in a chain, return an index */
  /* out of the buckets space, equal to the number of */
  /* buckets in the file to indicate an empty chain */
  if (!BUCKET_IN_CHAIN (hashes[hindex]))
    return header->buckets;

  wraparound = hindex;
  while (BUCKET_IN_CHAIN (hashes[hindex]))
    {
      if (hindex == 0)
	hindex = header->buckets - 1;
      else
	hindex--;

      /* if .cfc file is full return an index out of */
      /* the buckets space, equal to number of buckets */
      /* in the file, plus one */
      if (hindex == wraparound)
	return header->buckets + 1;
    }
  return crm_osbf_next_bindex (header, hindex);
}

unsigned long
crm_osbf_find_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
		      unsigned long hash, unsigned long key)
{
  OSBF_FEATUREBUCKET_STRUCT *hashes;
  unsigned long hindex, start;

  hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
  hindex = start = hash % header->buckets;
  while (!BUCKET_HASH_COMPARE (hashes[hindex], hash, key)
	 && !EMPTY_BUCKET (hashes[hindex]))
    {
      hindex = crm_osbf_next_bindex (header, hindex);
      /* if .cfc file is completely full return an index */
      /* out of the buckets space, equal to number of buckets */
      /* in the file, plus one */
      if (hindex == start)
	return header->buckets + 1;
    }

  /* return the index of the found bucket or, if not found,
   * the index of a free bucket where it could be put */
  return hindex;
}

void
crm_osbf_update_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
			unsigned long bindex, int delta)
{
  OSBF_FEATUREBUCKET_STRUCT *hashes;
  hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
  /*
     fprintf (stderr, "Bucket updated at %lu, hash: %lu, key: %lu, value: %d\n",
     bindex, hashes[bindex].hash, hashes[bindex].key, delta);
   */
  if (delta > 0 && GET_BUCKET_VALUE (hashes[bindex]) +
      delta >= OSBF_FEATUREBUCKET_VALUE_MAX - 1)
    {
      SETL_BUCKET_VALUE (hashes[bindex], OSBF_FEATUREBUCKET_VALUE_MAX - 1);
    }
  else if (delta < 0 && GET_BUCKET_VALUE (hashes[bindex]) <= -delta)
    {
      BUCKET_RAW_VALUE (hashes[bindex]) = 0;
      BUCKET_HASH (hashes[bindex]) = 0;
      BUCKET_KEY (hashes[bindex]) = 0;
      /* pack chain */
      {
	long i, j, packlen;
	i = crm_osbf_next_bindex (header, bindex);
	j = crm_osbf_last_in_chain (header, i);

	/* if there's a valid chain tail starting at i, pack it */
	if (j < header->buckets)
	  {
	    if (j >= i)
	      packlen = j - i + 1;
	    else
	      packlen = header->buckets + 1 - (i - j);
	    crm_osbf_packcss (header, i, packlen);
	  }
      }
    }
  else
    {
      SETL_BUCKET_VALUE (hashes[bindex],
			 GET_BUCKET_VALUE (hashes[bindex]) + delta);
    }
}

void
crm_osbf_insert_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
			unsigned long bindex, unsigned long hash,
			unsigned long key, int value)
{
  unsigned long hindex, distance;
  OSBF_FEATUREBUCKET_STRUCT *hashes;

  if (microgroom_chain_length == 0)
    microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;


  hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
  /* "right" bucket position */
  hindex = hash % header->buckets;
  /* distance from right position to free position */
  distance = (bindex >= hindex) ? bindex - hindex :
    header->buckets - (hindex - bindex);
  if ((osbf_microgroom != 0) && (value > 0))
    while (distance > microgroom_chain_length)
      {
	/*
	   fprintf (stderr, "hindex: %lu, bindex: %lu, distance: %lu\n",
	   hindex, bindex, distance);
	 */
	crm_osbf_microgroom (header, crm_osbf_prev_bindex (header, bindex));
	/* get new free bucket index */
	bindex = crm_osbf_find_bucket (header, hash, key);
	distance = (bindex >= hindex) ? bindex - hindex :
	  header->buckets - (hindex - bindex);
      }

  /*
     fprintf (stderr, "new bucket at %lu, hash: %lu, key: %lu, distance: %lu\n",
     bindex, hash, key, distance);
   */

  SETL_BUCKET_VALUE (hashes[bindex], value);
  BUCKET_HASH (hashes[bindex]) = hash;
  BUCKET_KEY (hashes[bindex]) = key;
}

static OSBF_HEADER_UNION hu;
int
crm_osbf_create_cssfile (char *cssfile, unsigned long buckets,
			 unsigned long major, unsigned long minor,
			 unsigned long spectrum_start)
{
  FILE *f;
  long i;
  OSBF_FEATUREBUCKET_STRUCT feature = {
    0, 0, 0
  };
  if (user_trace)
    fprintf (stderr, "Opening file %s for read/write\n", cssfile);
  f = fopen (cssfile, "wb");
  if (!f)
    {
      fatalerror ("Couldn't open the new .cfc file for writing; file = ",
                  cssfile);
    };
  // Set the header.
  *((unsigned long *) hu.header.version) = major;	// quick hack for now...
  hu.header.flags = minor;
  hu.header.learnings = 0;
  hu.header.buckets = buckets;
  hu.header.buckets_start = OSBF_CSS_SPECTRA_START;
  // Write header
  if (fwrite (&hu, sizeof (hu), 1, f) != 1)
    {           
      fatalerror (" Couldn't initialize the .cfc file header; file = ",
                  cssfile);
    }  

  //  Initialize CSS hashes - zero all buckets
  for (i = 0; i < buckets; i++)
    {
      // Write buckets
      if (fwrite (&feature, sizeof (feature), 1, f) != 1)
        {
          fatalerror (" Couldn't initialize the .cfc buckets; file = ",
                      cssfile);
        }
    }
  fclose (f);
  return (EXIT_SUCCESS);
}
syntax highlighted by Code2HTML, v. 0.9.1