// ports//mail/crm114/work/crm114-20070810-BlameTheSegfault.src/crm_expr_clump

//	crm_expr_nn_clump.h

//  by Joe Langeway derived from crm_bit_entropy.c and produced for the crm114 so:
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact Bill for details.  
//
/////////////////////////////////////////////////////////////////////
//
//     crm_expr_nn_clump.h - translate characters of a string.
//    
//     Original spec by Bill Yerazunis, original code by Joe Langeway,
//     recode for CRM114 use by Bill Yerazunis. 
//
//     This code section (crm_expr_nn_clump and subsidiary routines) is
//     dual-licensed to both William S. Yerazunis and Joe Langeway,
//     including the right to reuse this code in any way desired,
//     including the right to relicense it under any other terms as
//     desired.
//
//////////////////////////////////////////////////////////////////////
//if we had 2^16 tokens we'd have 2^31 cooccurrences, which is bigger than a long can hold, so it is safe and prudent to use an index_t for token ids

//Capacity limits for the token tables.
//Original author's sizing note: this uses 206470625 + sizeof(header) bytes of file or ~192.3Megs
//(figure not re-derived here).
//MAX_TOKENS bounds the number of distinct tokens tracked; MAX_COR_TOKENS bounds the
//subset that is clustered (see the more_common comment in TOKEN_STRUCT: only the
//MAX_COR_TOKENS most common tokens are clustered).
#define MAX_TOKENS 100000
#define MAX_COR_TOKENS 10000

//Upper bound on the number of cluster slots.
//While we can't have this many clusters when we're done, we can have them at intermediate stages.
#define MAX_CLUSTERS	10000

//Align segments of files to this many bytes, to make sure we don't point to weird boundaries and gum things up.
#define BYTE_ALIGN 4

//TYPEDEFS
//we use index_t whenever we're talking about indices because we might want to shrink things down later
//NOTE(review): the sentinels below are the 32-bit signed maximum (2^31 - 1) and its
//predecessor, while long is 64 bits wide on LP64 platforms -- confirm the intended width of index_t
typedef long index_t;
//we use NULL_INDEX just like we'd use NULL with pointers
#define NULL_INDEX 2147483647
//HAPAX_INDEX is what the non-learning tokenizer labels unfamiliar tokens
#define HAPAX_INDEX 2147483646

//we need some kind of floating point type to generate correlation scores
typedef float COOCCURRENCE_SCORE_TYPE;
//A very low score value. The literal has type double, not COOCCURRENCE_SCORE_TYPE.
//The replacement list is parenthesized so the unary minus always stays attached to
//the literal wherever the macro is expanded.
//NOTE(review): presumably used as a "worse than any real score" starting value -- confirm against the users of this macro
#define REALLY_SMALL_SCORE (-1000000.0)

//One slot of the clusters table.
typedef struct mythical_cluster
{
	//NOTE(review): presumably links unused cluster slots together (the header keeps
	//first_unused_cluster_slot and last_unused_cluster_slot) -- confirm against the implementation
	long next_free;
	//occurrence total for this cluster, kept in the floating point score type
	//NOTE(review): presumably summed over the cluster's member tokens -- confirm
	COOCCURRENCE_SCORE_TYPE occurrences;
} CLUSTER_STRUCT;

//One edge record of the graph table; records refer to each other by index, not by pointer.
//NOTE(review): presumably a singly linked adjacency list whose head is COR_TOKEN_STRUCT.edges,
//with unused records chained from the header's first_unused_edge -- confirm against the implementation
typedef struct mythical_edge
{
	index_t edge_to; //index of the node this edge leads to
	index_t next; //index of the next edge record in the same list
}	EDGE_STRUCT;

//Per-token bookkeeping record. The index_t links thread each token into a
//frequency-ordered list (more_common/less_common) and a recency-ordered list
//(more_recent/less_recent).
typedef struct mythical_token
{
	index_t cor_index; //NOTE(review): presumably this token's slot in the cor_tokens table -- confirm
	index_t more_common; //we maintain a sorted list of token seen counts so that we only cluster the MAX_COR_TOKENS most common to kill hapaxes
	index_t less_common;
	COOCCURRENCE_SCORE_TYPE count; //NOTE(review): presumably the seen count that the sorted list is ordered by -- confirm
	index_t more_recent; //we need to know the second least recent all the time, so we need to look up
	index_t less_recent; //next unused slot if this slot unused
	long hash_code;
} TOKEN_STRUCT;
	
//Record for a token that takes part in the correlation (clustering) tables.
typedef struct mythical_cor_token
{
	index_t token; //NOTE(review): presumably the index of the matching TOKEN_STRUCT -- confirm
	index_t cluster; //which cluster is this token in?
	index_t nearest_neihbor; //index of this token's nearest neighbor (identifier spelling kept as-is)
	index_t edges; //NOTE(review): presumably the first EDGE_STRUCT of this token's edge list -- confirm
} COR_TOKEN_STRUCT;

//One slot of the token hash table. Slots are chained by index; an unused slot
//reuses the chain link to point at the next free slot.
typedef struct mythical_token_hash_node
{
	long key;
	 //next free slot if not in use, NULL_INDEX if end of chain
	index_t next_in_hash_chain;
	//token = NULL_INDEX if unused
	index_t token;
} HASH_NODE_STRUCT;

//Header record of the clusterer state: current counts, capacities, the
//locations of the individual tables, and the heads of the various index-linked lists.
//NOTE(review): the *_offset fields are presumably byte offsets of each table within
//the backing file, aligned to BYTE_ALIGN -- confirm against the code that fills them in
typedef struct mythical_NNclusteror_header
{
	//current counts and capacities
	index_t n_tokens;
	index_t n_cor_tokens;
	index_t max_tokens;
	index_t max_cor_tokens;
	index_t n_clusters;
	//table locations
	long hash_slots_offset;
	long tokens_offset;
	long cor_tokens_offset;
	//heads of the unused-slot lists
	index_t next_free_cor_token;
	index_t first_unused_token_slot;
	index_t first_unused_hash_slot;
	index_t first_unused_cluster_slot;
	index_t last_unused_cluster_slot;
	//ends of the recency-ordered and frequency-ordered token lists
	index_t most_recent_token;
	index_t least_recent_token;
	index_t least_frequent_token;
	index_t least_frequent_cor_token; //the token number, not cor token number, of the least frequent token in the correlation tables
	index_t first_unused_edge;
	//more table locations
	long cooccurences_offset;
	long graph_offset;
	long clusters_offset;
	COOCCURRENCE_SCORE_TYPE	tot_occ,	//total occurrences recorded, to normalize
							normal_factor;	//when tot_occ is huge, divide everything down;
											//then when we add, we multiply by normal_factor to make everything work out
} NNCLUSTEROR_HEADER_STRUCT;

//Working view of the clusterer: one pointer to the header and one pointer per
//table, plus bit vectors and a nearest-neighbor array.
//NOTE(review): the table pointers presumably point into the mapped backing file at the
//offsets recorded in the header -- confirm against the code that sets this struct up
typedef struct mythical_clusteror_state
{
	NNCLUSTEROR_HEADER_STRUCT *header;
	HASH_NODE_STRUCT *hash_table;
	TOKEN_STRUCT *tokens;
	COR_TOKEN_STRUCT *cor_tokens;
	COOCCURRENCE_SCORE_TYPE *cooccurences;
	
	EDGE_STRUCT *graph;
	//bit vector to flag when tokens have changed nearest neighbor, or when tokens have changed at all for unlearning
	long *token_nn_changed;
	long *closed_list; //bit vector to flag when tokens are visited to propagate cluster membership, or when tokens are seen before we worry about clusters
	CLUSTER_STRUCT *clusters;
	//NOTE(review): presumably the nearest neighbors saved from before an update, for comparison -- confirm
	index_t *old_nearest_neihbors;
} CLUSTEROR_STATE_STRUCT;
// syntax highlighted by Code2HTML, v. 0.9.1