// crm_svm.c - version v1.0
//
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may obtain a copy of the
// GPL by visiting the Free Software Foundations web site at
// www.fsf.org, and a copy is included in this distribution.
//
///////////////////////////////////////////////////////////////////////////
//
// This code is originally copyright and owned by William
// S. Yerazunis. In return for addition of significant derivative
// work, Yimin Wu is hereby granted a full unlimited license to use
// this code, includng license to relicense under other licenses.
//
///////////////////////////////////////////////////////////////////////
// Other licenses may be negotiated; contact the
// author for details.
//
//
// include some standard files
#include "crm114_sysincludes.h"
#include <math.h>
#include <time.h>
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
#include <stdlib.h>
#include <stdio.h>
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
// The following sqrtf mumbojumbo because ppc_osx doesn't define sqrtf
// like it should.
#ifndef sqrtf
#define sqrtf(x) sqrt((x))
#endif
//////////////////////////////////////////////////////////////////////////
//
// Support Vector Machine (SVM) Classification
//
// This is an implementation of a support vector machine classification.
// The current version only implement one type of SVM called C-Support
// Vector Classification (C-SVC, Boser et al., 1992; Cortes and Vapnik,
// 1995).
//
// The dual formulation of C-SVC is to find
//
// min 0.5 ( \alpha^T Q \alpha) - e^T \alpha
//
// subject to y^T \alpha = 0
// y_i = +1 or -1
// 0 <= alpha_i <= C, i=1,...,sizeof(corpus).
//
// Where "e" is the vector of all ones,
// Q is the sizeof(corpus) by sizeof(corpus) matrix containing the
// calculated distances between any two documents (that is,
// Q_ij = y_i * y_j * kernel(x_i, x_j) which may be HUGE and so
// we only calculate part of it at any one time.
// x_i is the feature vector of document i.
//
// The decision function is
//
// sgn (sum(y_i * \alpha_i * kernel(x_i, x)) + b)
//
// In the optimization, we set the kernel parameters at the start and
// then modify only the weighting parameters till it (hopefully) converges.
////////////////////////////////////////////////////////////////////////////
//
// SMO-type Decomposition Method
//
// Here we used SMO-type decomposition method ( Platt, 1998) to solve
// the quadratic optimization problem --dual formulation of C-SVC, using
// the method of Fan, Chen, and Lin ("Working Set Selection using Second
// Order Information for Training Support Vector Machines", 2005)
// to select the working set.
//
// Pick a kernel...
#define LINEAR 0
#define RBF 1
#define POLY 2
// and a mode for the software to run in (only C-SVC works for now)
#define C_SVC 0
#define ONE_CLASS 1
// Tau is a small minimum positive number (divide-by-zero noodge)
#define TAU 1e-12
// We use the same old hyperspace structures for the intermediate storage.
typedef struct mythical_hyperspace_cell {
unsigned long hash;
// unsigned long key;
} HYPERSPACE_FEATUREBUCKET_STRUCT;
// Parameter block to control the SVM solver.
//
typedef struct mythical_svm_param {
int svm_type;
int kernel_type;
double cache_size; // in MB
double eps; // convergence stop criterion
double C; // parameter in C_SVC
double nu; // parameter for One-class SVM
double max_run_time; // time control for microgroom (in seconds).
// If computing time exceeds max_run_time,
// then start microgrooming to delete the
// documents far away from the hyperplane.
int shrinking; // use the shrinking heuristics, isn't available now
} SVM_PARAM;
// And a struct to hold the actual data we're trying to solve.
//
typedef struct mythical_svm_problem {
int l; //number of documents
int *y; //label of documents -1/+1
HYPERSPACE_FEATUREBUCKET_STRUCT **x; // x[i] is the ith document's
// feature vector
} SVM_PROBLEM;
// A structure to hold the cache node - these hold one row worth of
// Q matrix.
//
typedef struct mythical_cache_node {
struct mythical_cache_node *prev, *next;
float *data;
int len;
} CACHE_NODE;
// This is the cache representaton of the whole matrix; this is the
// "first column" and points to the start of each row.
typedef struct mythical_cache {
int l; //the number of documents in the corpus
long size; //the cache size (bytes)
CACHE_NODE *head;
CACHE_NODE lru_headnode; //least-recent-use node
} CACHE;
// This stores the result - alpha is the weighting vector (what we are
// searching for) and
//
typedef struct mythical_solver{
double *alpha;
double *G; // Gradient of objective function in each dimension
double *deci_array; // decision values for all training data
} SOLVER;
// And a few file-wide static globals:
//
static SVM_PARAM param;
static SVM_PROBLEM svm_prob;
static CACHE svmcache;
static float *DiagQ; //diagonal Qmatrix
static SOLVER solver;
////////////////////////////////////////////////////////////////////
//
// the hash coefficient table (hctable) should be full of relatively
// prime numbers, and preferably superincreasing, though both of those
// are not strict requirements.
//
static long hctable[] =
{ 1, 7,
3, 13,
5, 29,
11, 51,
23, 101,
47, 203,
97, 407,
197, 817,
397, 1637,
797, 3277 };
static int hash_compare (void const *a, void const *b)
{
HYPERSPACE_FEATUREBUCKET_STRUCT *pa, *pb;
pa = (HYPERSPACE_FEATUREBUCKET_STRUCT *) a;
pb = (HYPERSPACE_FEATUREBUCKET_STRUCT *) b;
if (pa->hash < pb->hash)
return (-1);
if (pa->hash > pb->hash)
return (1);
return (0);
}
// Compare two integers when all we have are void* pointers to them
static int int_compare (void const *a, void const *b)
{
int *pa, *pb;
pa = (int *) a;
pb = (int *) b;
if (*pa < *pb)
return (-1);
if (*pa > *pb)
return (1);
return (0);
}
// Compare ids of documents according to its decision value (decending
// absolute value), which is used for microgrooming
static int descend_int_decision_compare(void const *a, void const *b)
{
int *pa, *pb;
pa = (int *) a;
pb = (int *) b;
if ( solver.deci_array[*pa] < solver.deci_array[*pb])
return (1);
if ( solver.deci_array[*pa] > solver.deci_array[*pb])
return (-1);
return (0);
}
///////////////////////////////////////////////////////////////////////////
//
// Cache with least-recent-use strategy
// This will be used to store the part of the Q matrix that we know
// about. We recalculate parts as needed... this lets us solve the
// problem without requiring enough memory to build the entire Q
// matrix.
//
static void cache_init ( int len, long size, CACHE *svmcache)
{
svmcache->l = len;
svmcache->size = size;
svmcache->head = (CACHE_NODE *)calloc(len, sizeof(CACHE_NODE));
size /= sizeof(float);
size -= len * (sizeof(CACHE_NODE)/sizeof(float));
if(size < (2 * len))
size = 2 * len; // cache size must at least
// as large as two columns of Qmatrix
(svmcache->lru_headnode).prev
= (svmcache->lru_headnode).next
= &(svmcache->lru_headnode);
}
// Release the whole Q-matrix cache
//
static void cache_free(CACHE *svmcache)
{
CACHE_NODE *temp;
for(temp = (svmcache->lru_headnode).next;
temp != &(svmcache->lru_headnode);
temp = temp->next)
free(temp->data);
free(svmcache->head);
}
// Delete one node (that is, one row of Q-matrix) from the LRU.
// (we then usually move that row to the front of the LRU.
static void lru_delete(CACHE_NODE *h){
h->prev->next = h->next;
h->next->prev = h->prev;
}
// Insert to the last position in the cache node list
//
static void lru_insert(CACHE_NODE *h, CACHE *svmcache)
{
h->next = &(svmcache->lru_headnode);
h->prev = (svmcache->lru_headnode).prev;
h->prev->next = h;
(svmcache->lru_headnode).prev = h;
}
// Get a line of Q matrix data for certain document, and return the
// length of cached data. If it is smaller than the request length,
// then we need to fill in the uncached data.
static int get_data (CACHE *svmcache,
const int doc_index,
float **data,
int length)
{
int result = length;
CACHE_NODE *doc = svmcache->head + doc_index;
if(doc->len) lru_delete(doc); //least-recent-use strategy
//need to allocate more space
if ( length > (doc->len))
{
// GROT GROT GROT check this to see if it doesn't leak memory
// cache hasn't enough free space, we need to release some old space
while((svmcache->size) < (length - doc->len))
{
CACHE_NODE *temp = (svmcache->lru_headnode).next;
lru_delete(temp);
free(temp->data);
svmcache->size += temp->len;
temp->data = 0;
temp->len = 0;
}
//allocate new space
doc->data = (float *)realloc(doc->data, length * sizeof(float));
svmcache->size -= (length - doc->len);
result = doc->len;
doc->len = length;
}
lru_insert(doc, svmcache);
*data = doc->data;
return result;
}
// Dot operation of two feature vectors
//
static double dot(void const *a, void const *b)
{
HYPERSPACE_FEATUREBUCKET_STRUCT *pa, *pb;
int j = 0;
int i = 0;
double sum = 0;
pa = (HYPERSPACE_FEATUREBUCKET_STRUCT *) a;
pb = (HYPERSPACE_FEATUREBUCKET_STRUCT *) b;
while(pa[i].hash != 0 && pb[j].hash != 0)
{
if(pa[i].hash == pb[j].hash && pa[i].hash != 0)
{
sum ++;
i++;
j++;
}
else
{
if(pa[i].hash > pb[j].hash)
j++;
else
i++;
}
}
return sum;
}
// RBF (Radial Basis Function) kernel
//
static double rbf ( void const *a, void const *b )
{
HYPERSPACE_FEATUREBUCKET_STRUCT *pa, *pb;
int j = 0;
int i = 0;
double sum = 0;
pa = (HYPERSPACE_FEATUREBUCKET_STRUCT *) a;
pb = (HYPERSPACE_FEATUREBUCKET_STRUCT *) b;
while(pa[i].hash != 0 && pb[j].hash != 0)
{
if(pa[i].hash > pb[j].hash)
{
sum ++;
j++;
}
else if(pa[i].hash < pb[j].hash)
{
sum ++;
i++;
}
else
{
i++;
j++;
}
}
while(pa[i].hash != 0)
{
sum ++;
i++;
}
while(pb[j].hash != 0)
{
sum ++;
j++;
}
return exp(-0.00001 * sum);
}
// Polynomial kernel of two basis vectors
static double poly(void const *a, void const *b)
{
double gamma = 0.001;
double sum = 0.0;
sum = pow(gamma * dot(a,b) + 3, 2.0);
return sum;
}
// Big switch to pick the kernel operation we want
static double kernel(void const *a, void const *b)
{
switch(param.kernel_type)
{
case LINEAR:
return dot(a,b);
case RBF:
return rbf(a,b);
case POLY:
return poly(a,b);
default:
return 0;
}
}
// Ask the cache for the ith row in the Q matrix for C-Support Vector
// Classification(C-SVC)and one-class SVM
//
static float *get_rowQ(int i, int length)
{
float *rowQ;
int cached;
if ((cached = get_data(&svmcache, i, &rowQ, length)) < length)
{
int temp;
for (temp = cached; temp < length; temp++)
{
if (param.svm_type == C_SVC)
// multiply by the +1/-1 labels (in the .y structures) to
// face the kernel result in the right direction.
rowQ[temp] = (float)svm_prob.y[i]
* svm_prob.y[temp]
* kernel(svm_prob.x[i],svm_prob.x[temp] ) ;
else if(param.svm_type == ONE_CLASS)
rowQ[temp] = (float)kernel(svm_prob.x[i],svm_prob.x[temp]);
}
}
return rowQ;
}
// Request of the diagonal in Qmatrix for C- Support Vector
// Classification(C-SVC) and one-class SVM
static float *get_DiagQ()
{
float *DiagQ = (float *) malloc(svm_prob.l * sizeof(float));
int i;
for(i = 0; i<svm_prob.l; i++)
{
DiagQ[i] = (float) kernel(svm_prob.x[i],svm_prob.x[i]);
}
return DiagQ;
}
// Initialize the cache and diagonal Qmatrix
static void Q_init()
{
// Default initialization is the param cache size in megabytes (that's
// the 1<<20)
cache_init(svm_prob.l, param.cache_size*(1<<20), &svmcache);
DiagQ = get_DiagQ();
}
// Now, select a working set. This is based on the paper:
// "An SMO algorithm in Fan et al., JMLR 6(2005), p. 1889--1918"
// Solves:
//
// min 0.5(\alpha^T Q \alpha) + p^T \alpha
//
// y^T \alpha = \delta
// y_i = +1 or -1
// 0 <= alpha <= C
//
//
// Given:
//
// Q, p, y, C, and an initial feasible point \alpha
// l is the size of vectors and matrices
// eps is the stopping tolerance
//
// solution will be put in \alpha
//
// modification from Fan's paper- select_times keeps us from re-selecting
// the same document over and over (which was causing us to lock up)
static void selectB(int workset[], int *select_times)
{
// select i
int i = -1;
double G_max;
double G_min;
int t, j;
double obj_min;
double a,b;
float *Qi;
// Select a document that is on the wrong side of the hyperplane
// (called a "violating pair" in Fan's paper). Note that the
// margin is not symmetrical - we can select any "positive" class
// element with alpha < param.C, but the 'negative' class must
// only be greater than 0. Yimin says this is OK, I say it's
// wierd.
G_max = - HUGE_VAL;
for (t = 0; t < svm_prob.l; t++)
{
if((((svm_prob.y[t] == 1) && (solver.alpha[t] < param.C))
|| ((svm_prob.y[t] == -1) && (solver.alpha[t] > 0)))
&& select_times[t] < 10)
{
if(-svm_prob.y[t] * solver.G[t] >= G_max)
{
i = t;
G_max = -svm_prob.y[t] * solver.G[t];
}
}
}
// select j as second member of working set;
j = -1;
obj_min = HUGE_VAL;
G_min = HUGE_VAL;
for (t = 0; t< svm_prob.l; t++)
{
if((((svm_prob.y[t] == -1) && (solver.alpha[t] < param.C))
|| ((svm_prob.y[t] == 1) && (solver.alpha[t] > 0)))
&& select_times[t] < 10)
{
b = G_max + svm_prob.y[t] * solver.G[t];
if (-svm_prob.y[t] * solver.G[t] <= G_min)
G_min = -svm_prob.y[t] * solver.G[t];
if (b > 0)
{
if(i != -1)
{
Qi = get_rowQ(i,svm_prob.l);
a = Qi[i] + DiagQ[t]
- 2 * svm_prob.y[i] * svm_prob.y[t] * Qi[t];
if (a <= 0)
a = TAU;
if(-(b * b) / a <= obj_min)
{
j = t;
obj_min = -(b * b) / a;
}
}
}
}
}
// Are we done?
if(G_max - G_min < param.eps)
{
workset[0] = -1;
workset[1] = -1;
}
else
{
workset[0] = i;
workset[1] = j;
}
}
static void solve()
{
int t,workset[2],i,j, n;
double a, b, oldi, oldj, sum;
float *Qi, *Qj;
// Array for storing how many times a particular document has been
// selected in working set.
int select_times[svm_prob.l];
for(i = 0; i < svm_prob.l; i++)
{
select_times[i] = 0;
}
solver.alpha = (double *)malloc(svm_prob.l * sizeof(double));
solver.G = (double *)malloc(svm_prob.l * sizeof(double));
if(param.svm_type == C_SVC)
{
// initialize alpha to all zero;
// initialize G to all -1;
for(t = 0; t < svm_prob.l; t++){
solver.alpha[t] = 0;
solver.G[t] = -1;
}
}
else
if
(param.svm_type == ONE_CLASS)
{
//initialize the first nu*l elements of alpha to have the value one;
n = (int)(param.nu * svm_prob.l);
for(i = 0; i < n; i++)
solver.alpha[i] = 1;
if(n < svm_prob.l)
solver.alpha[n] = param.nu * svm_prob.l - n;
for(i = n + 1;i < svm_prob.l;i++)
solver.alpha[i] = 0;
//initialize G to all 0;
for(i = 0; i < svm_prob.l; i++){
solver.G[i] = 0;
}
};
while(1)
{
selectB(workset, select_times);
i = workset[0];
j = workset[1];
if(i != -1)
select_times[i] ++;
if(j != -1)
select_times[j] ++;
if(j == -1)
break;
//fprintf(stderr, "i=%d\tj=%d\t",i,j);
Qi = get_rowQ(i, svm_prob.l);
Qj = get_rowQ(j, svm_prob.l);
// Calculate the incremental step forward.
a = Qi[i] + DiagQ[j] - 2 * svm_prob.y[i] * svm_prob.y[j] * Qi[j];
if(a <= 0)
a = TAU;
b = -svm_prob.y[i] * solver.G[i] + svm_prob.y[j] * solver.G[j];
// update alpha (weight vector)
oldi = solver.alpha[i];
oldj = solver.alpha[j];
solver.alpha[i] += svm_prob.y[i] * b/a;
solver.alpha[j] -= svm_prob.y[j] * b/a;
// project alpha back to the feasible region (that is, where
// where 0 <= alpha <= C )
sum = svm_prob.y[i] * oldi + svm_prob.y[j] * oldj;
if (solver.alpha[i] > param.C)
solver.alpha[i] = param.C;
if (solver.alpha[i] < 0 )
solver.alpha[i] = 0;
solver.alpha[j] = svm_prob.y[j]
* (sum - svm_prob.y[i] * (solver.alpha[i]));
if (solver.alpha[j] > param.C)
solver.alpha[j] = param.C;
if (solver.alpha[j] < 0 )
solver.alpha[j] = 0;
solver.alpha[i] = svm_prob.y[i]
* (sum - svm_prob.y[j] * (solver.alpha[j]));
// update gradient array
for(t = 0; t < svm_prob.l; t++)
{
solver.G[t] += Qi[t] * (solver.alpha[i] - oldi)
+ Qj[t] * (solver.alpha[j] - oldj);
}
}
}
// Calculate b (hyperplane offset in
// SUM (y[i] alpha[i] kernel (x[i],x)) + b form)
// after calculating error margin alpha
static double calc_b()
{
int count = 0;
double upper = HUGE_VAL;
double lower = -HUGE_VAL;
double sum = 0;
int i;
double b;
for(i = 0; i < svm_prob.l; i++)
{
if(svm_prob.y[i] == 1)
{
if(solver.alpha[i] == param.C)
{
if(solver.G[i] > lower)
{
lower = solver.G[i];
}
}
else if(solver.alpha[i] == 0)
{
if(solver.G[i] < upper)
{
upper = solver.G[i];
}
}
else
{
count++;
sum += solver.G[i];
}
}
else
{
if(solver.alpha[i] == 0)
{
if(-solver.G[i] > lower)
{
lower = -solver.G[i];
}
}
else if(solver.alpha[i] == param.C)
{
if(-solver.G[i] < upper)
{
upper = -solver.G[i];
}
}
else
{
count++;
sum -= solver.G[i];
}
}
}
if(count > 0)
b = -sum/count;
else
b = -(upper + lower)/2;
return b;
}
// Calculate the decision function
static double calc_decision (HYPERSPACE_FEATUREBUCKET_STRUCT *x,
double *alpha,
double b)
{
int i;
double sum = 0;
i=0;
if(param.svm_type == C_SVC)
{
for (i = 0; i < svm_prob.l; i++){
if(alpha[i] != 0)
sum += svm_prob.y[i] * alpha[i] * kernel(x,svm_prob.x[i]);
}
sum += b;
}
else if(param.svm_type == ONE_CLASS)
{
for (i = 0; i < svm_prob.l; i++)
{
if(alpha[i] != 0)
sum += alpha[i] * kernel(x,svm_prob.x[i]);
}
sum -= b;
}
return sum;
}
// Implementation of Lin's 2003 improved algorithm on Platt's
// probabilistic outputs for binary SVM input parameters: deci_array =
// array of svm decision values svm.prob posn = number of positive
// examples negn = number of negative examples
// Outputs: parameters of sigmoid function-- A and B (AB[0] = A, AB[1] = B)
static void calc_AB(double *AB, double *deci_array, int posn, int negn)
{
int maxiter, i, j;
double minstep, sigma, fval, hiTarget, loTarget, *t;
double fApB, h11, h22, h21, g1, g2, p, q, d1, d2, det, dA, dB, gd, stepsize, newA, newB, newf;
maxiter = 100;
minstep = 1e-10;
sigma = 1e-3;
fval = 0.0;
hiTarget = (posn + 1.0) / (posn + 2.0);
loTarget = 1 / (negn + 2.0);
t = (double *)malloc(svm_prob.l * sizeof(double));
for(i = 0; i< svm_prob.l; i++)
{
if(svm_prob.y[i] > 0)
t[i] = hiTarget;
else
t[i] = loTarget;
}
AB[0] = 0.0;
AB[1] = log((negn + 1.0) / (posn + 1.0));
for (i = 0; i < svm_prob.l; i++)
{
fApB = deci_array[i] * AB[0] + AB[1];
if(fApB >= 0)
fval += t[i] * fApB + log(1 + exp(-fApB));
else
fval += (t[i] - 1) * fApB + log(1 + exp(fApB));
}
for(j = 0; j < maxiter; j++)
{
h11 = h22 = sigma;
h21 = g1 = g2 = 0.0;
for(i = 0; i < svm_prob.l; i++)
{
fApB = deci_array[i] * AB[0] + AB[1];
if(fApB >= 0)
{
p = exp(-fApB) / (1.0 + exp(-fApB));
q = 1.0 / (1.0 + exp(-fApB));
}
else
{
p = 1.0 / (1.0 + exp(fApB));
q = exp(fApB) / (1.0 + exp(fApB));
}
d2 = p * q;
h11 += deci_array[i] * deci_array[i] * d2;
h22 += d2;
h21 += deci_array[i] * d2;
d1 = t[i] - p;
g1 += deci_array[i] * d1;
g2 += d1;
}
// Stopping Criterion
if ((fabs(g1) < 1e-5) && (fabs(g2) < 1e-5))
{
break;
}
//compute modified Newton directions
det = h11 * h22 - h21 * h21;
dA = -(h22 * g1 - h21 * g2) / det;
dB = -(-h21 * g1 + h11 * g2) / det;
gd = g1 * dA + g2 * dB;
stepsize = 1;
while (stepsize >= minstep)
{
newA = AB[0] + stepsize * dA;
newB = AB[1] + stepsize * dB;
newf = 0.0;
for (i = 0; i < svm_prob.l; i++)
{
fApB = deci_array[i] * newA + newB;
if (fApB >= 0)
newf += t[i] * fApB + log(1 + exp(-fApB));
else
newf += (t[i] - 1) * fApB + log(1 + exp(fApB));
}
// Check whether sufficient decrease is satisfied
if (newf < fval + 0.0001 * stepsize * gd)
{
AB[0] = newA;
AB[1] = newB;
fval = newf;
break;
}
else
stepsize /= 2.0;
}
if (stepsize < minstep)
{
if(user_trace)
fprintf(stderr, "Line search fails in probability estimates\n");
break;
}
}
if (j >= maxiter)
if(user_trace)
fprintf(stderr,
"Reaching maximal iterations in probability estimates\n");
free(t);
}
static double sigmoid_predict(double decision_value, double A, double B)
{
double fApB = decision_value * A + B;
if (fApB >= 0)
{
return exp(-fApB) / (1.0 + exp(-fApB));
}
else
return 1.0 / (1 + exp(fApB)) ;
}
int crm_expr_svm_learn(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen)
{
long i, j, k, h;
char ftext[MAX_PATTERN];
long flen;
char file1[MAX_PATTERN];
char file2[MAX_PATTERN];
char file3[MAX_PATTERN];
FILE *hashf;
long textoffset;
long textmaxoffset;
HYPERSPACE_FEATUREBUCKET_STRUCT *hashes; // the hashes we'll sort
long hashcounts;
long cflags, eflags;
long sense;
long microgroom;
long unique;
long use_unigram_features;
char ptext[MAX_PATTERN]; //the regrex pattern
long plen;
regex_t regcb;
regmatch_t match[5];
struct stat statbuf1; // for statting the hash file1
struct stat statbuf2; // for statting the hash file2
unsigned long hashpipe[OSB_BAYES_WINDOW_LEN+1];
time_t start_timer;
time_t end_timer;
double run_time;
i = 0;
j = 0;
k = 0;
start_timer = time(NULL);
// set our cflags, if needed. The defaults are
// "case" and "affirm", (both zero valued).
// and "microgroom" disabled.
cflags = REG_EXTENDED;
eflags = 0;
sense = +1;
if (apb->sflags & CRM_NOCASE)
{
cflags = cflags | REG_ICASE;
eflags = 1;
if (user_trace)
fprintf (stderr, "turning oncase-insensitive match\n");
};
if (apb->sflags & CRM_REFUTE)
{
sense = -sense;
if (user_trace)
fprintf (stderr, " refuting learning\n");
};
microgroom = 0;
if (apb->sflags & CRM_MICROGROOM)
{
microgroom = 1;
if (user_trace)
fprintf (stderr, " enabling microgrooming.\n");
};
unique = 0;
if (apb->sflags & CRM_UNIQUE)
{
unique = 1;
if (user_trace)
fprintf (stderr, " enabling uniqueifying features.\n");
};
use_unigram_features = 0;
if (apb->sflags & CRM_UNIGRAM)
{
use_unigram_features = 1;
if (user_trace)
fprintf (stderr, " using only unigram features.\n");
};
// Note that during a LEARN in hyperspace, we do NOT use the mmap of
// pre-existing memory. We just write to the end of the file instead.
// malloc up the unsorted hashbucket space
hashes = calloc (HYPERSPACE_MAX_FEATURE_COUNT,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT));
hashcounts = 0;
// Extract the file names for storing svm solver.( file1.svm |
// file2.svm | 1vs2_solver.svm )
crm_get_pgm_arg (ftext, MAX_PATTERN, apb->p1start, apb->p1len);
flen = apb->p1len;
flen = crm_nexpandvar (ftext, flen, MAX_PATTERN);
strcpy (ptext,
"[[:space:]]*([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]*");
plen = strlen(ptext);
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0){
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
k = crm_regexec (®cb, ftext,
flen, 5, match, 0, NULL);
if( k==0 )
{
// get three input files.
memmove(file1,&ftext[match[1].rm_so],(match[1].rm_eo-match[1].rm_so));
file1[match[1].rm_eo-match[1].rm_so]='\000';
memmove(file2,&ftext[match[2].rm_so],(match[2].rm_eo-match[2].rm_so));
file2[match[2].rm_eo-match[2].rm_so]='\000';
memmove(file3,&ftext[match[3].rm_so],(match[3].rm_eo-match[3].rm_so));
file3[match[3].rm_eo-match[3].rm_so]='\000';
if(internal_trace)
fprintf(stderr, "file1=%s\tfile2=%s\tfile3=%s\n", file1, file2, file3);
}
else
{
if (ptext[0] != '\0') crm_regfree (®cb);
i = 0;
while(ftext[i] < 0x021) i++;
j = i;
while(ftext[j] >= 0x021) j++;
ftext[j] = '\000';
strcpy(file1, &ftext[i]);
file2[0] = '\000';
file3[0] = '\000';
}
// if (|Text|>0) hide the text into the .svm file
// get the "this is a word" regex
crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
plen = apb->s1len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
// compile the word regex
//
if ( internal_trace)
fprintf (stderr, "\nWordmatch pattern is %s", ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
// Now tokenize the input text
// We got txtptr, txtstart, and txtlen from the caller.
//
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
i = 0;
j = 0;
k = 0;
// init the hashpipe with 0xDEADBEEF
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
{
hashpipe[h] = 0xDEADBEEF;
};
// if [Text]>0 hide it and append to the file1
if (txtlen > 0)
{
while (k == 0 && textoffset <= textmaxoffset
&& hashcounts < HYPERSPACE_MAX_FEATURE_COUNT )
{
long wlen;
long slen = textmaxoffset - textoffset;
// if pattern is empty, extract non graph delimited tokens
// directly ([[graph]]+) instead of calling regexec (8% faster)
if (ptext[0] != '\0')
{
k = crm_regexec (®cb, &(txtptr[textoffset]),
slen, 5, match, 0, NULL);
}
else
{
k = 0;
// skip non-graphical characthers
match[0].rm_so = 0;
while (!isgraph (txtptr[textoffset + match[0].rm_so])
&& textoffset + match[0].rm_so < textmaxoffset)
match[0].rm_so ++;
match[0].rm_eo = match[0].rm_so;
while (isgraph (txtptr [textoffset + match[0].rm_eo])
&& textoffset + match[0].rm_eo < textmaxoffset)
match[0].rm_eo ++;
if ( match[0].rm_so == match[0].rm_eo)
k = 1;
};
if (!(k != 0 || textoffset > textmaxoffset))
{
wlen = match[0].rm_eo - match[0].rm_so;
memmove (tempbuf,
&(txtptr[textoffset + match[0].rm_so]),
wlen);
tempbuf[wlen] = '\000';
if (internal_trace)
{
fprintf (stderr,
" Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
i,
textoffset,
(long) match[0].rm_so,
(long) match[0].rm_eo,
wlen,
tempbuf);
};
if (match[0].rm_eo == 0)
{
nonfatalerror ( "The LEARN pattern matched zero length! ",
"\n Forcing an increment to avoid an infinite loop.");
match[0].rm_eo = 1;
};
// Shift the hash pipe down one
//
for (h = OSB_BAYES_WINDOW_LEN-1; h > 0; h--)
{
hashpipe [h] = hashpipe [h-1];
};
// and put new hash into pipeline
hashpipe[0] = strnhash (tempbuf, wlen);
if (internal_trace)
{
fprintf (stderr, " Hashpipe contents: ");
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
fprintf (stderr, " %lud", hashpipe[h]);
fprintf (stderr, "\n");
};
// and account for the text used up.
textoffset = textoffset + match[0].rm_eo;
i++;
// is the pipe full enough to do the hashing?
// we always run the hashpipe now, even if it's
// just full of 0xDEADBEEF. (was i >=5)
if(1)
{
unsigned long h1;
unsigned long h2;
long th = 0; // a counter used for TSS tokenizing
long j;
//
// old Hash polynomial: h0 + 3h1 + 5h2 +11h3 +23h4
// (coefficients chosen by requiring superincreasing,
// as well as prime)
//
th = 0;
//
if (use_unigram_features == 1)
{
h1 = hashpipe[0];
if (h1 == 0) h1 = 0xdeadbeef;
h2 = 0xdeadbeef;
if (internal_trace)
fprintf (stderr, "Singleton feature : %lud\n", h1);
hashes[hashcounts].hash = h1;
hashcounts++;
}
else
{
for (j = 1;
j < OSB_BAYES_WINDOW_LEN; // OSB_BAYES_WINDOW_LEN;
j++)
{
h1 = hashpipe[0]*hctable[0] +
hashpipe[j] * hctable[j<<1];
if (h1 ==0 ) h1 = 0xdeadbeef;
h2 = 0xdeadbeef;
if (internal_trace)
fprintf (stderr,
"Polynomial %ld has h1:%lud h2: %lud\n",
j, h1, h2);
hashes[hashcounts].hash = h1;
hashcounts++;
};
};
};
}
else
{
if (ptext[0] != '\0') crm_regfree (®cb);
k = 1;
}
}; // end the while k==0
// Now sort the hashes array.
//
qsort (hashes, hashcounts,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT),
&hash_compare);
if (user_trace)
{
fprintf(stderr,"sorted hashes:\n");
for(i=0;i<hashcounts;i++)
{
fprintf(stderr, "hashes[%ld]=%lud\n",i,hashes[i].hash);
}
fprintf (stderr, "Total hashes generated: %ld\n", hashcounts);
}
// And uniqueify the hashes array
//
i = 0;
j = 0;
if (unique)
{
while ( i < hashcounts ){
if (hashes[i].hash != hashes[i+1].hash)
{
hashes[j]= hashes[i];
j++;
};
i++;
};
hashcounts = j;
};
//mark the end of a feature vector
hashes[hashcounts].hash = 0;
if (user_trace)
fprintf (stderr, "Unique hashes generated: %ld\n", hashcounts);
if(hashcounts > 0 && sense > 0)
{
//append the hashed text to file1
// Because there are probably retained hashes of the
// file, we need to force an unmap-by-name which will allow a remap
// with the new file length later on.
crm_force_munmap_filename (file1);
if (user_trace)
fprintf (stderr, "Opening a svm file %s for append.\n", file1);
if ((hashf = fopen ( file1 , "ab+")) == NULL)
{
nonfatalerror("Sorry, couldn't open the .svm file", "");
return (0);
}
if (user_trace)
fprintf (stderr, "Writing to a svm file %s\n", file1);
// and write the sorted hashes out.
fwrite (hashes, 1,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT) * (hashcounts + 1),
hashf);
fclose (hashf);
}
/////////////////////////////////////////////////////////////////////
// Start refuting........
// What we have to do here is find the set of hashes that matches
// the input most closely - and then remove it.
//
// For this, we want the single closest set of hashes. That
// implies highest radiance, so we use the same bit of code
// we use down in classification. We also keep start and
// end of the "best match" segment.
////////////////////////////////////////////////////////////////////
if (hashcounts > 0 && sense < 0)
{
long beststart, bestend;
long thisstart, thislen, thisend;
double bestrad;
long wrapup;
double kandu, unotk, knotu, dist, radiance;
long k, u;
long file_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file_hashes;
// Get the file mmapped so we can find the closest match
//
struct stat statbuf; // for statting the hash file
// stat the file to get it's length
k = stat (file1, &statbuf);
// does the file really exist?
if (k != 0)
{
nonfatalerror ("Refuting from nonexistent data cannot be done!"
" More specifically, this data file doesn't exist: ",
file1);
return (0);
}
else
{
file_hashlens = statbuf.st_size;
file_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (file1,
0, file_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file_hashlens = file_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
};
wrapup = 0;
k = u = 0;
beststart = bestend = 0;
bestrad = 0.0;
while (k < file_hashlens)
{
long cmp;
// Except on the first iteration, we're looking one cell
// past the 0x0 start marker.
kandu = 0;
knotu = unotk = 10 ;
u = 0;
thisstart = k;
if (internal_trace)
fprintf (stderr,
"At featstart, looking at %ld (next bucket value is %ld)\n",
file_hashes[thisstart].hash,
file_hashes[thisstart+1].hash);
while (wrapup == 0)
{
// it's an in-class feature.
cmp = hash_compare (&hashes[u], &file_hashes[k]);
if (cmp < 0)
{ // unknown less, step u forward
// increment on u, because maybe k will match next time
unotk++;
u++;
}
if (cmp == 0) // features matched.
// These aren't the features you're looking for.
// Move along, move along....
{
u++;
k++;
kandu++;
};
if (cmp > 0) // unknown is greater, step k forward
{
// increment on k, because maybe u will match next time.
knotu++;
k++;
};
// End of the U's? If so, skip k to the end marker
// and finish.
if ( u >= hashcounts )
{
while ( k < file_hashlens
&& file_hashes[k].hash != 0)
{
k++;
knotu++;
};
};
// End of the K's? If so, skip U to the end marker
if ( k >= file_hashlens - 1
|| file_hashes[k].hash == 0 ) // end of doc features
{
unotk += hashcounts - u;
};
// end of the U's or end of the K's? If so, end document.
if (u >= hashcounts
|| k >= file_hashlens - 1
|| file_hashes[k].hash == 0) // this sets end-of-document
{
wrapup = 1;
k++;
};
};
// Now the per-document wrapup...
wrapup = 0; // reset wrapup for next file
// drop our markers for this particular document. We are now
// looking at the next 0 (or end of file).
thisend = k - 1;
thislen = thisend - thisstart + 1;
if (internal_trace)
fprintf (stderr,
"At featend, looking at %ld (next bucket value is %ld)\n",
file_hashes[thisend].hash,
file_hashes[thisend+1].hash);
// end of a document- process accumulations
// Proper pythagorean (Euclidean) distance - best in
// SpamConf 2006 paper
dist = sqrtf (unotk + knotu) ;
// PREV RELEASE VER --> radiance = 1.0 / ((dist * dist )+ 1.0);
//
// This formula was the best found in the MIT `SC 2006 paper.
radiance = 1.0 / (( dist * dist) + .000001);
radiance = radiance * kandu;
radiance = radiance * kandu;
if (user_trace)
fprintf (stderr, "Feature Radiance %f at %ld to %ld\n",
radiance, thisstart, thisend);
if (radiance >= bestrad)
{
beststart = thisstart;
bestend = thisend;
bestrad = radiance;
}
};
// end of the per-document stuff - now chop out the part of the
// file between beststart and bestend.
if (user_trace)
fprintf (stderr,
"Deleting feature from %ld to %ld (rad %f) of file %s\n",
beststart, bestend, bestrad, file1);
// Deletion time - move the remaining stuff in the file
// up to fill the hole, then msync the file, munmap it, and
// then truncate it to the new, correct length.
{
long newhashlen, newhashlenbytes;
newhashlen = file_hashlens - (bestend + 1 - beststart);
newhashlenbytes=newhashlen
* sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT);
memmove (&file_hashes[beststart],
&file_hashes[bestend+1],
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT)
* (file_hashlens - bestend) );
crm_force_munmap_filename (file1);
if (internal_trace)
fprintf (stderr, "Truncating file to %ld cells ( %ld bytes)\n",
newhashlen,
newhashlenbytes);
k = truncate (file1,
newhashlenbytes);
}
};
};
// let go of the hashes.
free (hashes);
if ( sense < 0 )
{
// finish refuting....
return (0);
}
// extract parameters for svm
crm_get_pgm_arg(ptext, MAX_PATTERN, apb->s2start, apb->s2len);
plen = apb->s2len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
if(plen)
{
sscanf(ptext,
"%d %d %lf %lf %lf %lf %lf %d",
&(param.svm_type),
&(param.kernel_type),
&(param.cache_size),
&(param.eps),
&(param.C),
&(param.nu),
&(param.max_run_time) ,
&(param.shrinking));
}
else
{
//set default parameters for SVM
param.svm_type = C_SVC;
param.kernel_type = LINEAR;
param.cache_size = 100; // MB
param.eps = 1e-3;
param.C = 1;
param.max_run_time = 1;
param.shrinking = 1; // not available right now
}
//if svm_type is ONE_CLASS, then do one class svm
if(param.svm_type == ONE_CLASS)
{
long file1_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file1_hashes;
HYPERSPACE_FEATUREBUCKET_STRUCT **x = NULL;
int *y = NULL;
int k1;
k1 = stat (file1, &statbuf1);
file1_hashlens = statbuf1.st_size;
file1_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (file1,
0, file1_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file1_hashlens = file1_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
//find out how many documents in file1
for(i = 0;i< file1_hashlens;i++){
if(internal_trace)
fprintf (stderr,
"\nThe %ldth hash value in file1 is %lud",
i, file1_hashes[i].hash);
if(file1_hashes[i].hash == 0){
k1 ++;
}
}
if(internal_trace)
fprintf (stderr,
"\nThe total number of documents in file1 is %d\n",
k1);
//initialize the svm_prob.x, svm_prob.y
svm_prob.l = k1;
y = calloc(svm_prob.l , sizeof(y[0]));
x = calloc(svm_prob.l , sizeof(x[0]));
for(i = 0; i < k1; i++)
y[i] = 1;
svm_prob.y = y;
x[0] = &(file1_hashes[0]);
k = 1;
for(i = 1;i< file1_hashlens - 1;i++)
{
if(file1_hashes[i].hash == 0)
{
x[k++] = &(file1_hashes[i+1]);
}
}
svm_prob.x = x;
if(internal_trace)
{
for(i = 0;i< k;i++)
{
fprintf(stderr, "\nx[%ld]=%lud\n",i,x[i][1].hash);
}
}
Q_init();
solve(); //result is in solver
//free cache
cache_free(&svmcache);
}
//if file2 is not empty, open file1 and file2, calculate hyperplane,
//and write the solution to file3
if(file2[0] != '\000')
{
long file1_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file1_hashes;
long file2_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file2_hashes;
int k1, k2;
k1 = stat (file1, &statbuf1);
k2 = stat (file2, &statbuf2);
if (k1 != 0)
{
nonfatalerror ("Sorry, there has no enough data to calculate the hyperplane"
"", file1);
return (0);
}
else if(k2 != 0)
{
nonfatalerror ("Sorry, there has no enough data to calculate the hyperplane"
"", file2);
return (0);
}
else
{
k1 = 0;
k2 = 0;
file1_hashlens = statbuf1.st_size;
file1_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (file1,
0, file1_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file1_hashlens = file1_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
file2_hashlens = statbuf2.st_size;
file2_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (file2,
0, file2_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file2_hashlens = file2_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
for(i = 0;i< file1_hashlens;i++)
{
if( internal_trace)
fprintf(stderr,
"\nThe %ldth hash value in file1 is %lud",
i, file1_hashes[i].hash);
if(file1_hashes[i].hash == 0)
{
k1 ++;
}
}
if(internal_trace)
fprintf (stderr,
"\nThe total number of documents in file1 is %d\n", k1);
for(i = 0;i< file2_hashlens;i++)
{
if(internal_trace)
fprintf (stderr,
"\nThe %ldth hash value in file2 is %lud",
i, file2_hashes[i].hash);
if(file2_hashes[i].hash == 0)
{
k2 ++;
}
}
if(internal_trace)
fprintf (stderr,
"\nThe total number of documents in file2 is %d\n", k2);
if((k1 > 0) && (k2 >0))
{
//initialize the svm_prob.x, svm_prob.y
int *y = NULL;
double b;
double *deci_array = NULL;
double AB[2];
HYPERSPACE_FEATUREBUCKET_STRUCT **x = NULL;
svm_prob.l = k1 + k2;
y = calloc(svm_prob.l , sizeof(y[0]));
x = calloc(svm_prob.l , sizeof(x[0]));
for(i = 0; i < k1; i++)
y[i] = 1;
for(i = k1; i < svm_prob.l; i++)
y[i] = -1;
svm_prob.y = y;
x[0] = &(file1_hashes[0]);
k = 1;
for(i = 1;i< file1_hashlens - 1;i++)
{
if(file1_hashes[i].hash == 0)
{
x[k++] = &(file1_hashes[i+1]);
}
}
x[k++] = &(file2_hashes[0]);
for(i = 1;i< file2_hashlens - 1;i++)
{
if(file2_hashes[i].hash == 0)
{
x[k++] = &(file2_hashes[i+1]);
}
}
svm_prob.x = x;
if(internal_trace)
{
for(i = 0;i< k;i++){
fprintf(stderr, "\nx[%ld]=%lud\n",i,x[i][1].hash);
}
}
Q_init();
solve(); //result is in solver
b = calc_b();
deci_array = (double*) malloc(svm_prob.l*sizeof(double));
//compute decision values for all training documents
for(i = 0; i < svm_prob.l; i++){
deci_array[i] = calc_decision(svm_prob.x[i], solver.alpha, b);
}
if (internal_trace)
fprintf(stderr, "done********\n");
calc_AB(AB,deci_array, k1,k2);
end_timer = time(NULL);
run_time = difftime(end_timer, start_timer);
if(user_trace)
fprintf(stderr, "run_time = %lf seconds\n", run_time);
//IF MICROGROOMING IS ENABLED, WE'LL GET RID OF LESS THAN
//10% DOCUMENTS THAT ARE FAR AWAY FROM THE HYPERPLANE
//(WITH HIGH ABSOLUTE DECISION VALUE). HERE HAVE TWO
//PARAMETERS YOU CAN CONTROL, ONE IS DELETE_FRACTION, THE
//OTHER IS THE HOW FAR AWAY FROM THE HYPERPLANE. DEFAULT
//SET DELETE_FRACTION=5% AND IF |DISTANCE| > 1.2 * AVERAGE
//DISTANCE, THEN IT IS FAR AWAY FROM THE hyperplane.
//
if(microgroom && (run_time > param.max_run_time))
{
float distance_fraction = 1.2;
int *id_desc;
//int *id_asc;
double average1 = 0.0, average2 = 0.0;
float delete_fraction;
int delete_num1 = 0, delete_num2 = 0;
id_desc = (int*) malloc(svm_prob.l * sizeof(int));
//id_asc = (int*) malloc(svm_prob.l * sizeof(int));
if(user_trace)
fprintf(stderr, "\nStart microgrooming......\n");
solver.deci_array = deci_array;
// set an upper bound of delete fraction
delete_fraction = pow(param.max_run_time/run_time,
1.0/3.0);
for (i = 0; i < svm_prob.l; i++)
{
id_desc[i] = i;
if(i < k1)
{
average1 += solver.deci_array[i];
}
else
average2 += solver.deci_array[i];
}
average1 /= k1;
average2 /= k2;
qsort (id_desc, svm_prob.l, sizeof (int),
&descend_int_decision_compare );
if(user_trace)
fprintf(stderr,
"average1=%lf\taverage2=%lf\n", average1, average2);
// if decision[i] > 1.5 * average decision value, then
// get rid of it.
i = 0;
j = svm_prob.l - 1;
while (((delete_num1 + delete_num2)
< floor(delete_fraction * svm_prob.l))
&& (i < svm_prob.l) && (j >= 0))
{
if((k1 - delete_num1) > (k2 - delete_num2))
{
//delete the farest document from the first class
if (svm_prob.y[id_desc[i]] == 1
&& fabs(solver.deci_array[id_desc[i]])
> distance_fraction * fabs(average1))
{
if(user_trace)
fprintf(stderr,
"delete document %d!decision value=%lf\n",
id_desc[i],solver.deci_array[id_desc[i]]);
id_desc[i] = -1;
delete_num1 ++;
}
i++;
}
else if((k1 - delete_num1) < (k2 - delete_num2))
{
//delete the farest document from the second class
if(svm_prob.y[id_desc[j]] == -1
&& fabs(solver.deci_array[id_desc[j]]) > distance_fraction * fabs(average2))
{
if(user_trace)
fprintf(stderr,
"delete document %d!decision value=%lf\n",
id_desc[j],solver.deci_array[id_desc[j]]);
id_desc[j] = -1;
delete_num2 ++;
}
j--;
}
else
{
//delete the farest document from both classes
if(svm_prob.y[id_desc[i]] == 1
&& fabs(solver.deci_array[id_desc[i]]) > distance_fraction * fabs(average1))
{
if(user_trace)
fprintf(stderr,
"delete document %d!decision value=%lf\n",
id_desc[i],solver.deci_array[id_desc[i]]);
id_desc[i] = -1;
delete_num1 ++;
}
i++;
if(svm_prob.y[id_desc[j]] == -1
&& fabs(solver.deci_array[id_desc[j]]) > distance_fraction * fabs(average2))
{
if(user_trace)
fprintf(stderr,
"delete document %d!decision value=%lf\n",
id_desc[j],solver.deci_array[id_desc[j]]);
id_desc[j] = -1;
delete_num2 ++;
}
j--;
}
}
if(user_trace)
fprintf(stderr,
"delete_num1 = %d\t delete_num2 = %d\n",
delete_num1, delete_num2);
if(delete_num1 != 0 || delete_num2 != 0)
{
HYPERSPACE_FEATUREBUCKET_STRUCT *new_x[k1 + k2 - delete_num1 - delete_num2];
qsort (id_desc, svm_prob.l, sizeof (int), &int_compare );
//now start deleting documents and write the
//remain documents to file1, this is unrecoverable.
j = 0;
for(i = 0; i < svm_prob.l; i++)
{
if((id_desc[i] != -1) && (id_desc[i] < k1))
{
int temp_i;
int temp_count = 0;
//newalpha[j] = solver.alpha[id_desc[i]];
svm_prob.y[j] = svm_prob.y[id_desc[i]];
while(svm_prob.x[id_desc[i]][temp_count].hash!=0)
temp_count ++;
new_x[j] = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
malloc((temp_count + 1)
* sizeof(HYPERSPACE_FEATUREBUCKET_STRUCT));
for(temp_i = 0; temp_i<temp_count; temp_i++)
new_x[j][temp_i] = svm_prob.x[id_desc[i]][temp_i];
new_x[j][temp_count].hash = 0 ;
j++;
}
else if((id_desc[i] != -1) && (id_desc[i] >= k1))
{
int temp_count = 0;
int temp_i;
//newalpha[j] = solver.alpha[id_desc[i]];
svm_prob.y[j] = svm_prob.y[id_desc[i]];
while(svm_prob.x[id_desc[i]][temp_count].hash != 0)
temp_count ++;
new_x[j] = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
malloc((temp_count + 1)
* sizeof(HYPERSPACE_FEATUREBUCKET_STRUCT));
for(temp_i = 0; temp_i<temp_count; temp_i++)
new_x[j][temp_i] = svm_prob.x[id_desc[i]][temp_i];
new_x[j][temp_count].hash = 0 ;
j++;
}
}
svm_prob.l = k1 + k2 - delete_num1 - delete_num2;
svm_prob.x = new_x;
if(delete_num1 != 0)
{
crm_force_munmap_filename (file1);
if (user_trace)
fprintf (stderr,
"Opening a svm file %s for rewriting.\n",
file1);
hashf = fopen ( file1 , "wb+");
if (user_trace)
fprintf (stderr,
"Writing to a svm file %s\n",
file1);
for(i = 0; i < k1 - delete_num1; i++)
{
int temp_count = 0;
while(svm_prob.x[i][temp_count].hash != 0)
{
temp_count ++;
}
fwrite (svm_prob.x[i], 1,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT)
* (temp_count+1),
hashf);
}
fclose (hashf);
}
if(delete_num2 != 0)
{
crm_force_munmap_filename (file2);
if (user_trace)
fprintf (stderr,
"Opening a svm file %s for rewriting.\n",
file2);
hashf = fopen ( file2 , "wb+");
if (user_trace)
fprintf (stderr,
"Writing to a svm file %s\n", file2);
for(i = k1 - delete_num1; i < svm_prob.l; i++)
{
int temp_count = 0;
while(svm_prob.x[i][temp_count].hash != 0)
temp_count ++;
fwrite (svm_prob.x[i], 1,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT)
* (temp_count+1),
hashf);
}
fclose (hashf);
}
k1 -= delete_num1;
k2 -= delete_num2;
//recalculate the hyperplane
//
//free cache
cache_free(&svmcache);
if(user_trace)
fprintf(stderr, "recalculate the hyperplane!\n");
//recalculate the hyperplane
Q_init();
solve(); // result is in solver
b = calc_b();
if(internal_trace)
fprintf(stderr, "b=%lf\n",b);
if (user_trace)
fprintf(stderr,
"Finishing calculate the hyperplane\n");
//compute A,B for sigmoid prediction
deci_array = (double*) realloc(deci_array, svm_prob.l
* sizeof(double));
for(i = 0; i < svm_prob.l; i++)
{
deci_array[i] = calc_decision (svm_prob.x[i],
solver. alpha, b);
}
calc_AB(AB, deci_array, k1, k2);
if (internal_trace)
fprintf(stderr,
"Finished calculating probability parameter\n");
for(i = 0; i < svm_prob.l; i++)
{
free(new_x[i]);
}
}
free(id_desc);
}
free(deci_array);
// write solver to file3
if (user_trace)
fprintf (stderr,
"Opening a solution file %s for writing alpha and b.\n",
file3);
hashf = fopen ( file3 , "w+");
if (user_trace)
fprintf (stderr, "Writing to a svm solution file %s\n", file3);
fwrite(&k1, sizeof(int), 1, hashf);
fwrite(&k2, sizeof(int), 1, hashf);
for (i = 0; i < svm_prob.l; i++)
fwrite(&(solver.alpha[i]), sizeof(double), 1, hashf);
fwrite(&b, sizeof(double), 1, hashf);
fwrite(&AB[0], sizeof(double), 1, hashf);
fwrite(&AB[1], sizeof(double), 1, hashf);
fclose (hashf);
//free cache
cache_free(&svmcache);
free(solver.G);
free(DiagQ);
free(solver.alpha);
if(user_trace)
fprintf(stderr,
"Finish calculating SVM hyperplane, store the solution to %s!\n",
file3);
}
else
{
if (user_trace)
fprintf(stderr,
"There hasn't enough documents to calculate a svm hyperplane!\n");
}
crm_force_munmap_filename (file1);
crm_force_munmap_filename (file2);
crm_force_munmap_filename (file3);
}
}//end if(file2[0])
regcomp_failed:
return 0;
}
int crm_expr_svm_classify(CSL_CELL *csl, ARGPARSE_BLOCK *apb,
char *txtptr, long txtstart, long txtlen){
long i,j, k, h;
char ftext[MAX_PATTERN];
long flen;
char ptext[MAX_PATTERN]; //the regrex pattern
long plen;
char file1[MAX_PATTERN];
char file2[MAX_PATTERN];
char file3[MAX_PATTERN];
regex_t regcb;
regmatch_t match[5];
FILE *hashf;
long textoffset;
long textmaxoffset;
unsigned long hashpipe[OSB_BAYES_WINDOW_LEN+1];
HYPERSPACE_FEATUREBUCKET_STRUCT *hashes; // the hashes we'll sort
long hashcounts;
long cflags, eflags;
long sense;
long microgroom;
long unique;
long use_unigram_features;
struct stat statbuf1; // for statting the hash file1
struct stat statbuf2; // for statting the hash file2
struct stat statbuf3; // for statting the hash file3
double *alpha;
double b;
double AB[2];
long slen;
char svrbl[MAX_PATTERN]; // the match statistics text buffer
long svlen;
// the match statistics variable
char stext [MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100)];
long stext_maxlen = MAX_PATTERN+MAX_CLASSIFIERS*(MAX_FILE_NAME_LEN+100);
double decision = 0;
long totalfeatures = 0; // total features
long bestseen;
double ptc[MAX_CLASSIFIERS]; // current running probability of this class
long hashlens[MAX_CLASSIFIERS];
char *hashname[MAX_CLASSIFIERS];
long doc_num[MAX_CLASSIFIERS];
// extract the optional "match statistics" variable
//
crm_get_pgm_arg (svrbl, MAX_PATTERN, apb->p2start, apb->p2len);
svlen = apb->p2len;
svlen = crm_nexpandvar (svrbl, svlen, MAX_PATTERN);
{
long vstart, vlen;
crm_nextword (svrbl, svlen, 0, &vstart, &vlen);
memmove (svrbl, &svrbl[vstart], vlen);
svlen = vlen;
svrbl[vlen] = '\000';
};
// status variable's text (used for output stats)
//
stext[0] = '\000';
slen = 0;
// set our cflags, if needed. The defaults are
// "case" and "affirm", (both zero valued).
// and "microgroom" disabled.
cflags = REG_EXTENDED;
eflags = 0;
sense = +1;
if (apb->sflags & CRM_NOCASE)
{
cflags = cflags | REG_ICASE;
eflags = 1;
if (user_trace)
fprintf (stderr, "turning oncase-insensitive match\n");
};
if (apb->sflags & CRM_REFUTE)
{
sense = -sense;
/////////////////////////////////////
// Take this out when we finally support refutation
////////////////////////////////////
// fprintf (stderr, "Hyperspace Refute is NOT SUPPORTED YET\n");
//return (0);
if (user_trace)
fprintf (stderr, " refuting learning\n");
};
microgroom = 0;
if (apb->sflags & CRM_MICROGROOM)
{
microgroom = 1;
if (user_trace)
fprintf (stderr, " enabling microgrooming.\n");
};
unique = 0;
if (apb->sflags & CRM_UNIQUE)
{
unique = 1;
if (user_trace)
fprintf (stderr, " enabling uniqueifying features.\n");
};
use_unigram_features = 0;
if (apb->sflags & CRM_UNIGRAM)
{
use_unigram_features = 1;
if (user_trace)
fprintf (stderr, " using only unigram features.\n");
};
// Note that during a LEARN in hyperspace, we do NOT use the mmap of
// pre-existing memory. We just write to the end of the file instead.
// malloc up the unsorted hashbucket space
hashes = calloc (HYPERSPACE_MAX_FEATURE_COUNT,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT));
hashcounts = 0;
// get the "this is a word" regex
crm_get_pgm_arg (ptext, MAX_PATTERN, apb->s1start, apb->s1len);
plen = apb->s1len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
// compile the word regex
//
if ( internal_trace)
fprintf (stderr, "\nWordmatch pattern is %s", ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
// Now tokenize the input text
// We got txtptr, txtstart, and txtlen from the caller.
//
textoffset = txtstart;
textmaxoffset = txtstart + txtlen;
// init the hashpipe with 0xDEADBEEF
for (h = 0; h < OSB_BAYES_WINDOW_LEN; h++)
{
hashpipe[h] = 0xDEADBEEF;
};
i = 0;
j = 0;
k = 0;
//generate the sorted hashes of input text
if(txtlen > 0)
{
while (k == 0 && textoffset <= textmaxoffset
&& hashcounts < HYPERSPACE_MAX_FEATURE_COUNT )
{
long wlen;
long slen = textmaxoffset - textoffset;
// if pattern is empty, extract non graph delimited tokens
// directly ([[graph]]+) instead of calling regexec (8% faster)
if (ptext[0] != '\0')
{
k = crm_regexec (®cb, &(txtptr[textoffset]),
slen, 5, match, 0, NULL);
}
else
{
k = 0;
// skip non-graphical characthers
match[0].rm_so = 0;
while (!isgraph (txtptr[textoffset + match[0].rm_so])
&& textoffset + match[0].rm_so < textmaxoffset)
match[0].rm_so ++;
match[0].rm_eo = match[0].rm_so;
while (isgraph (txtptr [textoffset + match[0].rm_eo])
&& textoffset + match[0].rm_eo < textmaxoffset)
match[0].rm_eo ++;
if ( match[0].rm_so == match[0].rm_eo)
k = 1;
};
if (!(k != 0 || textoffset > textmaxoffset))
{
wlen = match[0].rm_eo - match[0].rm_so;
memmove (tempbuf,
&(txtptr[textoffset + match[0].rm_so]),
wlen);
tempbuf[wlen] = '\000';
if (internal_trace)
{
fprintf (stderr,
"Learn #%ld t.o. %ld strt %ld end %ld len %ld is -%s-\n",
i,
textoffset,
(long) match[0].rm_so,
(long) match[0].rm_eo,
wlen,
tempbuf);
};
if (match[0].rm_eo == 0)
{
nonfatalerror ( "The LEARN pattern matched zero length! ",
"\n Forcing an increment to avoid an infinite loop.");
match[0].rm_eo = 1;
};
// slide previous hashes up 1
for (h = OSB_BAYES_WINDOW_LEN-1; h > 0; h--){
hashpipe [h] = hashpipe [h-1];
};
// and put new hash into pipeline
hashpipe[0] = strnhash ( tempbuf, wlen);
// and account for the text used up.
textoffset = textoffset + match[0].rm_eo;
i++;
// is the pipe full enough to do the hashing?
// we init with 0xDEADBEEF, so the pipe is always full (i >=5)
if (1)
{
int j;
unsigned th=0; // a counter used only in TSS hashing
unsigned long hindex;
unsigned long h1; //, h2;
//
th = 0;
//
if (use_unigram_features == 1)
{
h1 = hashpipe[0];
if (h1 == 0) h1 = 0xdeadbeef;
if (internal_trace)
fprintf (stderr, "Singleton feature : %lud\n", h1);
hashes[hashcounts].hash = h1;
hashcounts++;
}
else
{
for (j = 1;
j < OSB_BAYES_WINDOW_LEN; //OSB_BAYES_WINDOW_LEN;
j++)
{
h1 = hashpipe[0] * hctable[0]
+ hashpipe[j] * hctable[j<<1];
if (h1 == 0) h1 = 0xdeadbeef;
hindex = h1;
if (internal_trace)
fprintf (stderr, "Polynomial %d has h1:%lud \n",
j, h1);
hashes[hashcounts].hash = h1;
hashcounts++;
};
};
};
}
else
{
if (ptext[0] != '\0') crm_regfree (®cb);
k = 1;
}
}; // end the while k==0
// Now sort the hashes array.
//
//hashcounts--;
qsort (hashes, hashcounts,
sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT),
&hash_compare);
//mark the end of a feature vector
hashes[hashcounts].hash = 0;
if (user_trace)
fprintf (stderr, "Total hashes generated: %ld\n", hashcounts);
// And uniqueify the hashes array
//
i = 0;
j = 0;
if (unique)
{
while ( i <= hashcounts )
{
if (hashes[i].hash != hashes[i+1].hash)
{
hashes[j]= hashes[i];
j++;
};
i++;
};
hashcounts = j;
};
if (user_trace)
fprintf (stderr, "Unique hashes generated: %ld\n", hashcounts);
totalfeatures = hashcounts;
}
else
{
nonfatalerror ("Sorry, but I can't classify the null string.", "");
return 0;
}
if (user_trace)
{
fprintf(stderr,"sorted hashes:\n");
for (i=0;i<hashcounts;i++)
{
fprintf(stderr, "hashes[%ld]=%lud\n",i,hashes[i].hash);
}
fprintf (stderr, "Total hashes generated: %ld\n", hashcounts);
}
// extract the file names.( file1.svm | file2.svm | 1vs2_solver.svm )
crm_get_pgm_arg (ftext, MAX_PATTERN, apb->p1start, apb->p1len);
flen = apb->p1len;
flen = crm_nexpandvar (ftext, flen, MAX_PATTERN);
strcpy (ptext,
"[[:space:]]*([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]+\\|[[:space:]]+([[:graph:]]+)[[:space:]]*");
plen = strlen(ptext);
i = crm_regcomp (®cb, ptext, plen, cflags);
if ( i > 0)
{
crm_regerror ( i, ®cb, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto regcomp_failed;
};
k = crm_regexec (®cb, ftext,
flen, 5, match, 0, NULL);
if( k==0 )
{
long file1_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file1_hashes;
long file2_hashlens;
HYPERSPACE_FEATUREBUCKET_STRUCT *file2_hashes;
int k1, k2, k3;
//get three input files.
memmove(file1,&ftext[match[1].rm_so],(match[1].rm_eo-match[1].rm_so));
file1[match[1].rm_eo-match[1].rm_so]='\000';
memmove(file2,&ftext[match[2].rm_so],(match[2].rm_eo-match[2].rm_so));
file2[match[2].rm_eo-match[2].rm_so]='\000';
memmove(file3,&ftext[match[3].rm_so],(match[3].rm_eo-match[3].rm_so));
file3[match[3].rm_eo-match[3].rm_so]='\000';
if(internal_trace)
fprintf(stderr, "file1=%s\tfile2=%s\tfile3=%s\n", file1, file2, file3);
// open all files,
// first check whether file3 is the current version solution.
k1 = stat (file1, &statbuf1);
k2 = stat (file2, &statbuf2);
k3 = stat (file3, &statbuf3);
if (k1 != 0)
{
nonfatalerror ("Refuting from nonexistent data cannot be done!"
" More specifically, this data file doesn't exist: ",
file1);
return (0);
}
else if(k2 != 0)
{
nonfatalerror ("Refuting from nonexistent data cannot be done!"
" More specifically, this data file doesn't exist: ",
file2);
return (0);
}
else
{
int temp_k1 = 0, temp_k2 = 0;
int *y = NULL;
HYPERSPACE_FEATUREBUCKET_STRUCT **x = NULL;
k1 = 0;
k2 = 0;
file1_hashlens = statbuf1.st_size;
crm_force_munmap_filename (file1);
crm_force_munmap_filename (file2);
file1_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (file1,
0, file1_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file1_hashlens = file1_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
hashlens[0] = file1_hashlens;
hashname[0] = (char *) malloc (strlen(file1)+10);
if (!hashname[0])
untrappableerror( "Couldn't malloc hashname[0]\n",
"We need that part later, so we're stuck. Sorry.");
strcpy(hashname[0],file1);
file2_hashlens = statbuf2.st_size;
file2_hashes = (HYPERSPACE_FEATUREBUCKET_STRUCT *)
crm_mmap_file (file2,
0, file2_hashlens,
PROT_READ | PROT_WRITE,
MAP_SHARED,
NULL);
file2_hashlens = file2_hashlens
/ sizeof (HYPERSPACE_FEATUREBUCKET_STRUCT );
hashlens[1] = file2_hashlens;
hashname[1] = (char *) malloc (strlen(file2)+10);
if (!hashname[1])
untrappableerror("Couldn't malloc hashname[1]\n",
"We need that part later, so we're stuck. Sorry.");
strcpy(hashname[1],file2);
//find out how many documents in file1 and file2 separately
for(i = 0;i< file1_hashlens;i++)
{
if(internal_trace)
fprintf (stderr,
"\nThe %ldth hash value in file1 is %lud",
i, file1_hashes[i].hash);
if(file1_hashes[i].hash == 0)
{
k1 ++;
}
}
if(internal_trace)
fprintf (stderr,
"\nThe total number of documents in file1 is %d\n", k1);
for(i = 0;i< file2_hashlens;i++)
{
if(internal_trace)
fprintf (stderr,
"\nThe %ldth hash value in file2 is %lud",
i, file2_hashes[i].hash);
if(file2_hashes[i].hash == 0)
{
k2 ++;
}
}
if(internal_trace)
fprintf (stderr,
"\nThe total number of documents in file2 is %d\n", k2);
hashf = fopen ( file3 , "r+");
if(k3 == 0)
{
//hashf = fopen ( file3 , "r+");
fread(&temp_k1, sizeof(int), 1, hashf);
fread(&temp_k2, sizeof(int), 1, hashf);
//fscanf(hashf,"%d\t%d", &temp_k1, &temp_k2);
if (internal_trace)
fprintf(stderr, "temp_k1=%d\ttemp_k2=%d\n",temp_k1,temp_k2);
}
doc_num[0] = k1;
doc_num[1] = k2;
//assign svm_prob.x, svm_prob.y
svm_prob.l = k1 + k2;
y = calloc(svm_prob.l , sizeof(y[0]));
x = calloc(svm_prob.l , sizeof(x[0]));
for(i = 0; i < k1; i++)
y[i] = 1;
for(i = k1; i < svm_prob.l; i++)
y[i] = -1;
svm_prob.y = y;
x[0] = &(file1_hashes[0]);
k = 1;
for(i = 1;i< file1_hashlens - 1;i++)
{
if(file1_hashes[i].hash == 0)
{
x[k++] = &(file1_hashes[i+1]);
}
}
x[k++] = &(file2_hashes[0]);
for(i = 1;i< file2_hashlens - 1;i++)
{
if(file2_hashes[i].hash == 0)
{
x[k++] = &(file2_hashes[i+1]);
}
}
svm_prob.x = x;
alpha = (double *)malloc(svm_prob.l * sizeof(double));
if((k3 != 0) || (temp_k1 != k1) || (temp_k2 != k2))
{
if(user_trace)
fprintf(stderr,
"temp_k1=%d\ttemp_k2=%d\tSVM solution file is not up-to-date! we'll recalculate it!\n",
temp_k1, temp_k2);
//recalculate the svm solution
if((k1 > 0) && (k2 >0))
{
double *deci_array = NULL;
// extract parameters for svm
crm_get_pgm_arg(ptext, MAX_PATTERN,apb->s2start, apb->s2len);
plen = apb->s2len;
plen = crm_nexpandvar (ptext, plen, MAX_PATTERN);
if(plen)
{
sscanf(ptext,
"%d %d %lf %lf %lf %lf %lf %d",&(param.svm_type),
&(param.kernel_type),
&(param.cache_size),
&(param.eps),
&(param.C),
&(param.nu),
&(param.max_run_time) ,
&(param.shrinking));
}
else
{
//set default parameters for SVM
param.svm_type = C_SVC;
param.kernel_type = LINEAR;
param.cache_size = 1;//MB
param.eps = 1e-3;
param.C = 1;
param.max_run_time = 1; //second
param.shrinking = 1;//not available right now
}
if(internal_trace)
{
for(i = 0;i< k;i++)
{
fprintf(stderr,
"\nx[%ld]=%lud\n",i,svm_prob.x[i][1].hash);
};
};
Q_init();
solve(); //result is in solver
b = calc_b();
if(internal_trace)
{
fprintf(stderr, "b=%lf\n",b);
}
for(i = 0; i < svm_prob.l; i++)
alpha[i] = solver.alpha[i];
//compute A,B for sigmoid prediction
deci_array = (double*) malloc(svm_prob.l * sizeof(double));
for(i = 0; i < svm_prob.l; i++)
{
deci_array[i] = calc_decision(svm_prob.x[i], alpha, b);
}
calc_AB(AB, deci_array, k1, k2);
//free cache
cache_free(&svmcache);
free(deci_array);
free(solver.G);
free(solver.alpha);
free(DiagQ);
if(user_trace)
fprintf(stderr,
"Recalculation of svm hyperplane is finished!\n");
}
else
{
if(user_trace)
fprintf(stderr,
"There hasn't enough documents to recalculate a svm hyperplane!\n");
return (0);
}
}
else
{
if(internal_trace)
{
for(i=0;i<svm_prob.l;i++)
{
j = 0;
do
{
fprintf (stderr,
"x[%ld][%ld]=%lud\n",
i,j,svm_prob.x[i][j].hash);
}
while(svm_prob.x[i][j++].hash!=0);
}
}
for(i = 0; i<svm_prob.l; i++)
{
fread(&alpha[i], sizeof(double), 1, hashf);
}
fread(&b, sizeof(double), 1, hashf);
fread(&AB[0], sizeof(double), 1, hashf);
fread(&AB[1], sizeof(double), 1, hashf);
fclose(hashf);
}
decision = calc_decision(hashes,alpha,b);
decision = sigmoid_predict(decision, AB[0], AB[1]);
free(alpha);
crm_force_munmap_filename (file1);
crm_force_munmap_filename (file2);
crm_force_munmap_filename (file3);
} //end (k1==0 && k2 ==0)
} //end (k==0)
else
{
nonfatalerror (
"You need to input (file1.svm | file2.svm | f1vsf2.svmhyp)\n", "");
return (0);
};
free(hashes);
if(svlen > 0)
{
char buf [4096];
double pr;
char fname[MAX_FILE_NAME_LEN];
buf [0] = '\000';
// put in standard CRM114 result standard header:
ptc[0] = decision;
ptc[1] = 1 - decision;
if(decision >= 0.5)
{
pr = 10 *(log10 (decision + 1e-300) - log10 (1.0 - decision +1e-300 ));
sprintf(buf,
"CLASSIFY succeeds; success probability: %6.4f pR: %6.4f\n",
decision, pr);
bestseen = 0;
}
else
{
pr = 10 *(log10 (decision + 1e-300) - log10 (1.0 - decision +1e-300 ));
sprintf(buf,
"CLASSIFY fails; success probability: %6.4f pR: %6.4f\n",
decision, pr);
bestseen = 1;
}
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
// Second line of the status report is the "best match" line:
//
if(bestseen)
strcpy(fname, file2);
else
strcpy(fname, file1);
sprintf (buf, "Best match to file #%ld (%s) " \
"prob: %6.4f pR: %6.4f \n",
bestseen,
fname,
ptc[bestseen],
10 * (log10 (ptc[bestseen] + 1e-300) - log10 (1.0 - ptc[bestseen] +1e-300 )));
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
sprintf (buf, "Total features in input file: %ld\n", totalfeatures);
if (strlen (stext) + strlen(buf) <= stext_maxlen)
strcat (stext, buf);
for(k = 0; k < 2; k++)
{
sprintf (buf,
"#%ld (%s):" \
" documents: %ld, features: %ld, prob: %3.2e, pR: %6.2f \n",
k,
hashname[k],
doc_num[k],
hashlens[k],
ptc[k],
10*(log10 (ptc[k] + 1e-300) - log10 (1.0 - ptc[k] + 1e-300)));
if (strlen(stext)+strlen(buf) <= stext_maxlen)
strcat (stext, buf);
}
for(k = 0; k < 2; k++)
{
free(hashname[k]);
}
// finally, save the status output
//
crm_destructive_alter_nvariable (svrbl, svlen,
stext, strlen (stext));
}
// Return with the correct status, so an actual FAIL or not can occur.
if (decision >= 0.5 )
{
// all done... if we got here, we should just continue execution
if (user_trace)
fprintf (stderr, "CLASSIFY was a SUCCESS, continuing execution.\n");
}
else
{
// Classify was a fail. Do the skip.
if (user_trace)
fprintf (stderr, "CLASSIFY was a FAIL, skipping forward.\n");
// and do what we do for a FAIL here
csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
return (0);
}
regcomp_failed:
return (0);
}
syntax highlighted by Code2HTML, v. 0.9.1