// crm_css_maintenance_.c - Controllable Regex Mutilator, version v1.0
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may obtain a copy of the
// GPL by visiting the Free Software Foundations web site at
// www.fsf.org, and a copy is included in this distribution.
//
// Other licenses may be negotiated; contact the
// author for details.
//
// include some standard files
#include "crm114_sysincludes.h"
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
static long crm_zapcss ( FEATUREBUCKET_TYPE *h,
unsigned long hs,
unsigned long start,
unsigned long end );
// How to microgroom a .css file that's getting full
//
// NOTA BENE NOTA BENE NOTA BENE NOTA BENE
//
// This whole section of code is under intense develoment; right now
// it "works" but not any better than nothing at all. Be warned
// that any patches issued on it may well never see the light of
// day, as intense testing and comparison may show that the current
// algorithms are, well, suckful.
//
//
// There are two steps to microgrooming - first, since we know we're
// already too full, we execute a 'zero unity bins'. Then, we see
// how the file looks, and if necessary, we get rid of some data.
// R is the "MICROGROOM_RESCALE_FACTOR"
//
long crm_microgroom (FEATUREBUCKET_TYPE *h, unsigned char *seen_features,
long hs, unsigned long hindex)
{
long i, j, k;
static long microgroom_count = 0;
long steps;
long packstart; // first used bucket in the chain
long packlen; // # of used buckets in the chain
long packend; // last used bucket in the chain
// for stochastic grooming we need a place for the random...
unsigned long randy;
long zeroed_countdown;
long actually_zeroed;
long force_rescale;
j = 0;
k = 0;
zeroed_countdown = MICROGROOM_STOP_AFTER;
//
i = j = k = 0;
microgroom_count++;
if (user_trace)
{
if (microgroom_count == 1)
fprintf (stderr, "CSS file too full: microgrooming this css chain: ");
fprintf (stderr, " %ld ",
microgroom_count);
};
// We have two different algorithms for amnesia - stochastic
// (meaning random) and weight-distance based.
//
steps = 0;
randy = 0;
force_rescale = 0;
#ifdef STOCHASTIC_AMNESIA
// set our stochastic amnesia matcher - note that we add
// our microgroom count so that we _eventually_ can knock out anything
// even if we get a whole string of buckets with hash keys that all alias
// to the same value.
//
// We also keep track of how many buckets we've zeroed and we stop
// zeroing additional buckets after that point. NO! BUG! That
// messes up the tail length, and if we don't repack the tail, then
// features in the tail can become permanently inaccessible! Therefore,
// we really can't stop in the middle of the tail (well, we could
// stop zeroing, but we need to pass the full length of the tail in.
//
// Note that we can't do this "adaptively" in packcss, because zeroes
// there aren't necessarily overflow chain terminators (because -we-
// might have inserted them here.
//
// start at initial chain start, move to back of
// chain that overflowed, then scale just that chain.
//
i = j = hindex % hs;
if (i == 0) i = 1;
while (h[i].hash != 0)
{
i--;
if (i < 1) i = hs - 1;
if (i == j) break; // don't hang if we have a 100% full .css file
// fprintf (stderr, "-");
}
// now, move our index to point to the first bucket in this chain.
i++;
if (i >= hs) i = 1;
packstart = i;
steps = 0;
force_rescale = 0;
while (h[i].value != 0 )
{
// fprintf (stderr, "=");
randy = rand() + microgroom_count;
if (
( h[i].key != 0 ) // hash keys == 0 are SPECIALS like #learns,
// and must never be deleted.
&&
(force_rescale ||
(( h[i].key + randy ) & MICROGROOM_STOCHASTIC_MASK )
== MICROGROOM_STOCHASTIC_KEY ))
{
h[i].value = h[i].value * MICROGROOM_RESCALE_FACTOR;
};
if (h[i].value == 0) zeroed_countdown--;
i++;
if (i >= hs ) i = 1;
steps++;
}
packlen = steps;
#endif
#ifdef WEIGHT_DISTANCE_AMNESIA
//
// Weight-Distance Amnesia is an improvement by Fidelis Assis
// over Stochastic amnesia in that it doesn't delete information
// randomly; instead it uses the heuristic that low-count buckets
// at or near their original insert point are likely to be old and
// stale so expire those first.
//
//
i = j = k = hindex % hs;
if (i == 0) i = j = k = 1;
while (h[i].hash != 0)
{
i--;
if (i < 1) i = hs - 1;
if (i == j) break; // don't hang if we have a 100% full .css file
// fprintf (stderr, "-");
}
// now, move our index to point to the first _used_ bucket in
// this chain.
i++;
if (i >= hs) i = 1;
packstart = i;
// Now find the _end_ of the bucket chain.
//
while (h[j].hash != 0)
{
j++;
if (j >= hs) j = 1;
if (j == k) break; // don't hang on 100% full .css file
}
j--;
if (j == 0) j = hs - 1;
// j is now the _last_ _used_ bucket.
packend = j;
// Now we have the start and end of the bucket chain.
//
// An advanced version of this algorithm would make just two passes;
// one to find the lowest-ranked buckets, and another to zero them.
// However, Fidelis' instrumentation indicates that an in-place,
// multisweep algorithm may be as fast, or even faster, in the most
// common situations. So for now, we'll do a multisweep.
//
//
// Normal Case: hs=10, packstart = 4, packend = 7
// buck# 0 1 2 3 4 5 6 7 8 9
// R 0 0 0 X X X X 0 0
// so packlen = 4 ( == 7 - 4 + 1)
//
// fixup for wraparound - note the 0th bucket is RESERVED:
// example hs = 10, packstart = 8, packend = 2
// buck# 0 1 2 3 4 5 6 7 8 9
// R X X 0 0 0 0 0 X X
// and so packlen = 4 (10 - 8 + 2)
if (packstart < packend )
{
packlen = packend - packstart + 1;
}
else
{
packlen = ( hs - packstart ) + packend;
};
// And now zap some buckets - are we in wraparound?
//
if ( packstart < packend )
{
// fprintf (stderr, "z");
actually_zeroed = crm_zapcss ( h, hs, packstart, packend);
}
else
{
//fprintf (stderr, "Z");
actually_zeroed = crm_zapcss (h, hs, packstart, hs -1 );
actually_zeroed = actually_zeroed
+ crm_zapcss (h, hs, 1, (packlen - (hs - packstart)));
};
#endif
// now we pack the buckets
crm_packcss (h, seen_features, hs, packstart, packlen);
return (actually_zeroed);
}
////////////////////////////////////////////////
//
// crm_zapcss - the distance-heuristic microgroomer core.
static long crm_zapcss ( FEATUREBUCKET_TYPE *h,
unsigned long hs,
unsigned long start,
unsigned long end )
{
// A question- what's the ratio deprecation ratio between
// "distance from original" vs. low point value? The original
// Fidelis code did a 1:1 equivalence (being 1 place off is exactly as
// bad as having a count of just 1).
//
// In reality, because of Zipf's law, most of the buckets
// stay at a value of 1 forever; they provide scant evidence
// no matter what. Therefore, we will allow separate weights
// for V (value) and D (distance). Note that a D of zero
// means "don't use distance, only value", and a V of zero
// means "don't use value, only distance. Mixed values will
// give intermediate tradeoffs between distance( ~~ age) and
// value.
//
// Similarly, VWEIGHT2 and DWEIGHT2 go with the _square_ of
// the value and distance.
#define VWEIGHT 1.0
#define VWEIGHT2 0.0
#define DWEIGHT 1.0
#define DWEIGHT2 0.0
long vcut;
long zcountdown;
unsigned long packlen;
unsigned long k;
long actually_zeroed;
vcut = 1;
packlen = end - start;
// fprintf (stderr, " S: %ld, E: %ld, L: %ld ", start, end, packlen );
zcountdown = packlen / 32.0 ; // get rid of about 3% of the data
actually_zeroed = 0;
while (zcountdown > 0)
{
// fprintf (stderr, " %ld ", vcut);
for (k = start; k <= end; k++)
{
if (h[k].key != 0 ) // key == 0 means "special- don't zero!"
{
// fprintf (stderr, "a");
if (h[k].value > 0) // can't zero it if it's already zeroed
{
// fprintf (stderr, "b");
if ((VWEIGHT * h[k].value) +
(VWEIGHT2 * h[k].value * h[k].value ) +
(DWEIGHT * (k - h[k].hash % hs)) +
(DWEIGHT2 * (k - h[k].hash % hs) * (k - h[k].hash % hs))
<= vcut)
{
// fprintf (stderr, "*");
h[k].value = 0;
zcountdown--;
actually_zeroed++;
};
};
};
};
vcut++;
};
return (actually_zeroed);
}
void crm_packcss (FEATUREBUCKET_TYPE *h, unsigned char *seen_features,
long hs, long packstart, long packlen)
{
// How we pack...
//
// We look at each bucket, and attempt to reinsert it at the "best"
// place. We know at worst it will end up where it already is, and
// at best it will end up lower (at a lower index) in the file, except
// if it's in wraparound mode, in which case we know it will not get
// back up past us (since the file must contain at least one empty)
// and so it's still below us in the file.
//fprintf (stderr, "Packing %ld len %ld total %ld",
// packstart, packlen, packstart+packlen);
// if (packstart+packlen >= hs)
// fprintf (stderr, " BLORTTTTTT ");
if (packstart+packlen <= hs) // no wraparound in this case
{
crm_packseg (h, seen_features, hs, packstart, packlen);
}
else // wraparound mode - do it as two separate repacks
{
crm_packseg (h, seen_features, hs, packstart, (hs - packstart));
crm_packseg (h, seen_features, hs, 1, (packlen - (hs - packstart)));
};
}
void crm_packseg (FEATUREBUCKET_TYPE *h, unsigned char *seen_features,
long hs, long packstart, long packlen)
{
unsigned long ifrom, ito;
unsigned long thash, tkey, tvalue, tseen;
// keep the compiler quiet - tseen is used only if seen_features
// is non-null, but the compiler isn't smart enough to know that.
tseen = 0;
if (internal_trace) fprintf (stderr, " < %ld %ld >", packstart, packlen);
for (ifrom = packstart; ifrom < packstart + packlen; ifrom++)
{
// Is it an empty bucket? (remember, we're compressing out
// all placeholder buckets, so any bucket that's zero-valued
// is a valid target.)
if ( h[ifrom].value == 0)
{
// Empty bucket - turn it from marker to empty
if (internal_trace) fprintf (stderr, "x");
h[ifrom].key = 0;
h[ifrom].hash = 0;
if (seen_features)
seen_features[ifrom] = 0;
}
else
{ if (internal_trace) fprintf (stderr, "-");};
}
// Our slot values are now somewhat in disorder because empty
// buckets may now have been inserted into a chain where there used
// to be placeholder buckets. We need to re-insert slot data in a
// bucket where it will be found.
//
ito = 0;
for (ifrom = packstart; ifrom < packstart+packlen; ifrom++)
{
// Now find the next bucket to place somewhere
//
thash = h[ifrom].hash;
tkey = h[ifrom].key;
tvalue = h[ifrom].value;
if (seen_features)
tseen = seen_features[ifrom];
if (tvalue == 0)
{
if (internal_trace) fprintf (stderr, "X");
}
else
{
ito = thash % hs;
if (ito == 0) ito = 1;
// fprintf (stderr, "a %ld", ito);
while ( ! ( (h[ito].value == 0)
|| ( h[ito].hash == thash
&& h[ito].key == tkey )))
{
ito++;
if (ito >= hs) ito = 1;
// fprintf (stderr, "a %ld", ito);
};
//
// found an empty slot, put this value there, and zero the
// original one. Sometimes this is a noop. We don't care.
if (internal_trace)
{
if ( ifrom == ito ) fprintf (stderr, "=");
if ( ito < ifrom) fprintf (stderr, "<");
if ( ito > ifrom ) fprintf (stderr, ">");
};
h[ifrom].hash = 0;
h[ifrom].key = 0;
h[ifrom].value = 0;
if (seen_features)
seen_features[ifrom] = 0;
h[ito].hash = thash;
h[ito].key = tkey;
h[ito].value = tvalue;
if (seen_features)
seen_features[ito] = tseen;
};
};
}
int crm_create_cssfile(char *cssfile, long buckets,
long major, long minor, long spectrum_start)
{
FILE *f;
long i;
FEATUREBUCKET_STRUCT feature = {0, 0, 0};
if (user_trace)
fprintf (stderr, "Opening file %s for writing.\n", cssfile);
f = fopen (cssfile, "wb");
if (!f)
{
fprintf (stderr, "\n Couldn't open file %s for writing; errno=%d .\n",
cssfile, errno);
return (EXIT_FAILURE);
}
// Initialize CSS file - zero all buckets
feature.hash = major;
feature.key = minor;
feature.value = spectrum_start;
for (i=0; i < buckets; i++)
{
if (fwrite(&feature, sizeof(feature), 1, f) != 1)
{
fprintf (stderr, "\n Couldn't initialize .CSS file %s, "
"errno=%d.\n",
cssfile,
errno);
return (EXIT_FAILURE);
}
//
// HACK ALERT HACK ALERT HACK ALERT
//
// yeah,there's more efficient ways to do this, but this will
// stay in cache; an IF-statement will need at least three ops as
// well. Probably six of one...
feature.hash = 0;
feature.key = 0;
feature.value = 0;
}
fclose (f);
return (EXIT_SUCCESS);
}
syntax highlighted by Code2HTML, v. 0.9.1