/*
File name: estimate_error.c
Created by: Ljubomir Buturovic
Created: 07/23/2001
Purpose: Bayes error estimation using Fukunaga's k-NN method.
*/
/*
Copyright 2004 Ljubomir J. Buturovic
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
The principal functions in this file are l_estimate() and
r_estimate(). They provide upper and lower bounds, respectively, of
the Bayes error estimate, using the k-NN method described in
Keinosuke Fukunaga, Introduction to Statistical Pattern Recognition,
Second Edition, Chapter 7, Morgan Kaufmann, San Francisco, 1990.
The method implemented in l_estimate() is fully leave-one-out. For
the classification of each input vector, the covariance matrix of
its' class is reestimated by removing the vector, and all nearest
neighbors to the class are recomputed. As a result, the
computational complexity of the method is high, as are its' memory
requirements (roughly proportional to N^2, where N is the number of
input vectors).
The fully leave-one-out method implemented in l_estimate() is known
as 'x'. The approximate method, achieved for 'approximate_l_mode' ==
1 (see below), is known as 'l'. It should produce results identical
to FORTRAN implementation of the same method, known as 'a'.
*/
/* TBD: seed unused? */
static char rcsid[] = "$Id: estimate_error.c,v 1.138 2005/05/06 14:40:03 ljubomir Exp $";
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <unistd.h>
#include <float.h>
#include <errno.h>
#include "pcp.h"
#include "hash_util.h"
#include "lmat.h"
#include "lau.h"
#include "knn.h"
#include "pau.h"
#include "estimate_error.h"
static int r_mode = 0;
/*
The code uses two variables, 'r_mode' and 'approximate_l_mode',
which control two different development/testing modes for
l_estimate() function. 'r_mode' == 1 turns on the R estimation
compatibility (i.e., l_estimate() will return the same results as
r_estimate(). 'approximate_l_mode' == 1 turns on compatibility mode
with approximate L estimate (identical to PCP FORTRAN function
ee1()). This mode uses all available samples to estimate covariance
matrices (i.e., the covariance matrix estimates are not
leave-one-out, but resubstitution estimates).
The following matrix determines the method executed by l_estimate():
approximate_l_mode r_mode method
0 0 'x'
0 1 'r'
1 0 'l'
1 1 'l'
*/
/*
LLR comparison function.
*/
static int compare_likelihoods(const void *llr1, const void *llr2)
{
int retval = -1;
struct LLR *l1;
struct LLR *l2;
l1 = (struct LLR *) llr1;
l2 = (struct LLR *) llr2;
if (l1->llr > l2->llr)
retval = 1;
else if (l1->llr == l2->llr)
retval = 0;
return retval;
}
/*
Memory allocation for r_estimate(), l_estimate().
*/
static void ee_init(int c, int kmin, int kmax, float ***fpn, float **estimate)
{
int i;
int k;
float **xpn;
float *ex;
xpn = malloc(c*sizeof(float *));
for (i = 0; i < c; i++)
{
xpn[i] = malloc((kmax-kmin+1)*sizeof(float));
for (k = kmin; k <= kmax; k++)
xpn[i][k-kmin] = 0.0;
}
*fpn = xpn;
ex = malloc((kmax-kmin+1)*sizeof(float));
for (k = kmin; k <= kmax; k++)
ex[k-kmin] = 0.0;
*estimate = ex;
}
/*
Free memory allocated by ee_init().
*/
static void ee_free(int c, float **fpn, float *estimate)
{
int i;
for (i = 0; i < c; i++)
free(fpn[i]);
free(fpn);
free(estimate);
}
/*
Calculate all pairwise distances for 'nv' vectors in 'x' using
Mahalanobis distance measure. 'sigma' is array of 'c' inverse
covariance matrices in Symmetric Storage Mode, where 'c' is number
of classes. 'd' is dimension of input vectors. 'nd[i]' are class
cardinalities.
Returns an array of 'nv' memory pointers. 'distances[i][j]' is
Mahalanobis distance between X[i] and X[j].
In case of malloc() error, return (float **) 0 and set errno.
*/
static float **calculate_distances(float **x, int nv, int d, int c, int *nd,
float **sigma, FILE *outdev)
{
int i;
int j;
int category;
int class_count;
float *matrix;
float *mxv;
float *mxe;
float **distances;
distances = malloc(nv*sizeof(float *));
if (distances != (float **) 0)
{
mxv = malloc(d*sizeof(float));
if (mxv != (float *) 0)
{
mxe = malloc(d*sizeof(float));
if (mxe != (float *) 0)
{
for (i = 0; i < nv; i++)
{
category = 0;
class_count = nd[0];
matrix = sigma[category];
if (outdev != (FILE *) 0)
fprintf(outdev, "Calculating distances for vector %7d.\n", i+1);
distances[i] = malloc(nv*sizeof(float));
if (distances[i] != (float *) 0)
{
for (j = 0; j < nv; j++)
{
if (j == class_count)
{
class_count += nd[++category];
matrix = sigma[category];
}
distances[i][j] = mahalanobis(x[i], x[j], d, matrix, mxv, mxe);
}
}
else
distances = (float **) mx_free((void **) distances, i);
}
vx_free(mxe);
}
vx_free(mxv);
}
}
return distances;
}
/*
Calculate distances from all 'nv' vectors in 'x' to median vectors
of all classes, using Mahalanobis distance measure. sigma[i] is
inverse covariance matrices of class 'i' in SSM. 'd' is dimension of
input vectors, 'c' is number of categories with 'nd[i]' vectors in
category i.
Returns an array of 'c' memory pointers. 'distances[i]' points to
array of 'nv' distances to median vector of class i.
In case of error, return (float **) 0 and set errno.
*/
static float **calculate_mean_distances(float **x, int nv, int d, int c, int *nd,
float **sigma)
{
int i;
int j;
int m;
int cntr;
int status = 0;
float *mean;
float *mxv;
float *mxe;
float **distances = (float **) 0;
cntr = 0;
mean = malloc(d*sizeof(float));
if (mean != (float *) 0)
{
mxv = malloc(d*sizeof(float));
if (mxv != (float *) 0)
{
mxe = malloc(d*sizeof(float));
if (mxe != (float *) 0)
{
distances = malloc(c*sizeof(float *));
if (distances != (float **) 0)
{
for (i = 0; (i < c) && (status == 0); i++)
{
/*
Estimate mean vector for class i.
*/
for (j = 0; j < d; j++)
{
mean[j] = 0.0;
for (m = 0; m < nd[i]; m++)
mean[j] += x[cntr+m][j];
mean[j] = mean[j]/nd[i];
}
/*
Done - now calculate Mahalanobis distances from all vectors in
'x' to 'mean'.
*/
distances[i] = malloc(nv*sizeof(float));
if (distances[i] != (float *) 0)
{
for (m = 0; m < nv; m++)
distances[i][m] = mahalanobis(x[m], mean, d, sigma[i], mxv, mxe);
cntr += nd[i];
}
else
{
distances = (float **) mx_free((void **) distances, i);
status = -1;
}
}
vx_free(mxe);
}
vx_free(mxv);
}
}
vx_free(mean);
}
return distances;
}
/*
Calculate k-nearest neighbor distances for all 'nv' vectors in 'x',
for all 'c' classes, with 'nd[i]' vectors per class. Returns an
array of pointers to class neighbors, one per class. Each pointer
points to the following memory structure:
+--------------+
for nv vectors: | 1 |
k-th neighbor | 2 | k = kmin
from class c | ... |
| nv |
+--------------+
for nv vectors: | 1 |
k-th neighbor | 2 | k = kmin+1
from class c | ... |
| nv |
+--------------+
| |
| ... |
| |
+--------------+
for nv vectors: | 1 |
k-th neighbor | 2 | k = kmax
from class c | ... |
| nv |
+--------------+
*/
static float **calculate_neighbors(float **distances, int nv, int kmin, int kmax,
int c, int *nd, FILE *outdev, FILE *fdbg)
{
int i;
int j;
int k;
int m;
int idx;
int cidx;
int cmax;
int cnv; /* total number of vectors in classes 0..i */
float *dist;
float *nx;
float **neighbors;
neighbors = malloc(c*sizeof(float *));
if (neighbors != (float **) 0)
{
cmax = ivec_max(nd, c);
/* +1 in order to use finsort() */
dist = malloc((cmax+1)*sizeof(float));
cnv = 0;
if (dist != (float *) 0)
{
for (i = 0; i < c; i++)
{
if (outdev != (FILE *) 0)
fprintf(outdev, "Calculating neighbors from class %7d.\n", i+1);
cidx = 0;
nx = malloc((kmax-kmin+2)*nv*sizeof(float));
if (nx != (float *) 0)
{
for (m = 0; m < nv; m++)
{
idx = cidx;
for (j = 0; j < nd[i]; j++)
dist[j] = distances[m][cnv+j];
finsort(dist, nd[i]);
if (fdbg != (FILE *) 0)
for (j = 0; j < nd[i]; j++)
fprintf(fdbg, "calculate_neighbors(); target class: %3d; vector: %5d; dist[%5d]: %12.5g\n",
i+1, m+1, j, dist[j]);
for (k = kmin-1; k <= kmax; k++)
{
nx[idx] = dist[k];
idx += nv;
}
cidx++;
}
if (fdbg != (FILE *) 0)
{
for (j = 0; j < nv*(kmax-kmin+1); j++)
fprintf(fdbg, "calculate_neighbors(); target class: %3d; nx[%5d]: %12.5g\n",
i+1, j, nx[j]);
}
neighbors[i] = nx;
cnv += nd[i];
}
else
{
neighbors = (float **) 0;
dist = vx_free(dist);
neighbors = (float **) mx_free((void **) neighbors, i);
}
}
vx_free(dist);
}
}
return neighbors;
}
/*
Get cardinality of classes 0..clx-1, with 'nd[i]' vectors per
class. TBD: replace with ivec_sum().
*/
static int get_cardinality(int clx, int *nd)
{
int i;
int cdx = 0;
for (i = 0; i < clx; i++)
cdx += nd[i];
return cdx;
}
/*
Determine distances from all 'd'-dimensional vectors in 'x' to
nearest neighbors in 'category', excluding vector 'icx' from
'category'. The k-NN distances are calculated for k between 'kmin'
and 'kmax' and are obtained by adjusting 'distances', using formulae
(1), (2) and (3) of 05/01/2002, and (1) of 04/30/2002, from R&D,
vol. III. 'xmult' is the product (X-M)*sigma^(-1) from formula
(2). 'dxk' is distance from vector 'icx' to the centroid of
'category'. 'c' is number of classes, 'nd' has class cardinalities.
The index 'icx' is 0-based, and the range is 0 to nv-1.
Memory layout of the neighbors is as shown for one class in
calculate_neighbors().
In case of malloc() error, return (float *) 0.
*/
static float *recalculate_class_neighbors(int kmin, int kmax, float **x, int d, int nv,
float **distances, float *xmult, float dkx,
int category, int c, int *nd, int icx,
int approximate_l_mode, FILE *fdbg)
{
int i;
int j;
int mx;
int nx;
int idx;
int jcntr;
char *err_msg;
float *neighbors = (float *) 0;
float *dist = (float *) 0;
float *xdiff; /* xmult*(Xa-Xb) from formula (2) */
float distance;
float dot;
float fnx;
float factor;
dist = malloc((nd[category])*sizeof(float));
if (dist != (float *) 0)
{
xdiff = malloc(d*sizeof(float));
if (xdiff != (float *) 0)
{
mx = 0;
mx = get_cardinality(category, nd);
nx = nd[category];
fnx = (nx-2.0)/(nx-1.0);
factor = nx;
/* 'factor' is f from formula (1), R&D, vol. III, 04/30/2002 */
factor = factor/((factor-1.0)*(factor-1.0)-factor*dkx*dkx);
neighbors = malloc((kmax-kmin+1)*nv*sizeof(float));
if (neighbors != (float *) 0)
{
for (i = 0; i < nv; i++)
{
jcntr = 0;
/*
Calculate distances between vector 'i' and all vectors in
'category' except 'icx'.
*/
for (j = 0; j < nd[category]; j++)
{
if (j != icx-mx)
{
distance = distances[i][mx+j]; /* original distance between i and j */
if ((r_mode == 1) || (approximate_l_mode == 1))
dist[jcntr] = distance;
else
{
distance = distance*distance;
fvec_diff(x[i], x[j+mx], xdiff, d, (int *) 0);
/*
'dot' is 'x' in formula (2) of
05/01/2002, R&D, vol. III.
*/
dot = fvec_dot(xdiff, xmult, d, (int *) 0);
dist[jcntr] = fnx*(distance+factor*dot*dot);
dist[jcntr] = sqrt(dist[jcntr]);
}
jcntr++;
}
}
finsort(dist, nd[category]-1);
idx = i;
for (j = kmin-1; j < kmax; j++)
{
neighbors[idx] = dist[j];
idx += nv;
}
}
}
vx_free(xdiff);
}
vx_free(dist);
}
if ((neighbors == (float *) 0) && (fdbg != (FILE *) 0))
{
err_msg = strerror(errno);
fprintf(fdbg, "recalculate_class_neighbors(); malloc() failure; %s\n", err_msg);
}
return neighbors;
}
/*
Compute determinant of leave-one-out covariance matrix estimate, by
excluding vector 'icx', for matrix characterized by 'mdet', 'edet'
(determinant = mdet*2^edet). 'd' is number of features, 'nc' number
of vectors in the class whose determinant is being reestimated, 'dm'
distance from 'icx' to mean vector of its' class. Returns log2 of
the updated determinant, so the actual determinant is
2**return_value (mantisa is set to 1.0). Based on formula (7.67)
from Fukunaga.
*/
static float recalculate_determinant(float mdet, float edet, int d, int nc,
int icx, float dm, int approximate_l_mode)
{
float efactor;
float fx;
float det;
efactor = log(2.0);
fx = 1.0-nc*dm*dm/((nc-1.0)*(nc-1.0));
det = log(mdet)+edet*efactor+d*log((nc-1.0)/(nc-2.0))+log(fx);
det = det/efactor;
if ((r_mode == 1) || (approximate_l_mode == 1))
det = edet+log(mdet)/efactor;
return det;
}
static float calculate_llr(int d, int nv1, int nv2, float d1, float d2,
float mdet1, float edet1, float mdet2, float edet2)
{
float llr;
float constant_term;
constant_term = nv1;
constant_term = log(constant_term/nv2);
constant_term += 0.5*(log(mdet1/mdet2)+(edet1-edet2)*log(2.0));
if ((d1 != 0) && (d2 != 0))
llr = d*log(d1/d2)+constant_term;
else if (d1 != 0)
llr = FLT_MAX;
else
llr = -FLT_MAX;
return llr;
}
/*
Determine more likely class between 'c1' and 'c2' of cardinalities
'nv1', 'nv2', given decision threshold 't', distances 'd1' and 'd2'
and determinants 'mdet1', 'edet1', 'mdet2', 'edet2', for a
'd'-dimensional vector characterized by distances 'd1' and
'd2'. 'd1' is distance between the vector being classified and k-th
nearest neighbor from class 1. 'd2' is distance beween the vector
and k-th nearest neighbor from class 2. Based on formula (7.5) from
Fukunaga. Returns 'c1' or 'c2'.
*/
static int assign_class(int d, int c1, int c2, int nv1, int nv2, float d1, float d2,
float mdet1, float edet1, float mdet2, float edet2, float t)
{
int aclass;
float llr;
llr = calculate_llr(d, nv1, nv2, d1, d2, mdet1, edet1, mdet2, edet2);
aclass = c1;
if (llr > t)
aclass = c2;
return aclass;
}
/*
Return 0 if 'index' and 'lix' index the same vector, otherwise
return 1. 'index' range is from 0..nc1+nc2-1, 'lix' from 0..nc1-1 if
'category' equals CLASS_C1, 0..nc2-1 if 'category' equals CLASS_C2.
*/
static int cmp_index(int index, int lix, int category, int nc1, int nc2)
{
int check = 1;
if (r_mode != 1)
if (lix != IDX_NONE)
{
if ((category == CLASS_C1) && (index < nc1))
{
if (index == lix)
check = 0;
}
else if ((category == CLASS_C2) && (index >= nc1))
{
if (index-nc1 == lix)
check = 0;
}
}
return check;
}
/*
Find the optimal threshold for deciding between classes c1 and c2,
characterized by covariance matrix determinants 'mdet1', 'edet1' and
'mdet2', 'edet2'. 'c1_c1_neighbors', 'c2_c1_neighbors' and
'c1_c2_neighbors', 'c2_c2_neighbors' are distances to k-th nearest
neighbor from class c1 and c2, respectively, for all nc1+nc2
'd'-dimensional vectors. The length of 'c1_*' arrays is 'nc1' and
the length of 'c2_*' arrays is 'nc2'. The optimal threshold is
defined as the log-likelihood value which gives least classification
error between the two classes.
Leave-one-out functionality added 09/03/2001: if 'lix' != IDX_NONE,
omit vector 'lix' from 'category' from optimization. 'lix' range is
0..nc1-1 for 'category' == CLASS_C1, 0..nc2-1 for 'category' ==
CLASS_C2.
In case of success, errno is set to 0; otherwise, it is set to any
of the values set by malloc().
*/
static float find_optimal_threshold(int d, int nc1, int nc2, float mdet1,
float edet1, float mdet2, float edet2,
float *c1_c1_neighbors, float *c2_c1_neighbors,
float *c1_c2_neighbors, float *c2_c2_neighbors,
int lix, int category, FILE *fdbg)
{
int i;
int j;
int mc;
int cmc;
int done;
int nix1;
int nix2;
int icntr;
float constant_term;
float threshold = 0.0;
float ct;
float nxct;
float d1;
float d2;
float det;
float rcc;
char *err_msg;
struct LLR *llr;
errno = 0;
constant_term = nc1;
constant_term = log(constant_term/nc2);
det = 0.5*(log(mdet1/mdet2)+(edet1-edet2)*log(2.0));
constant_term += det;
nix1 = nc1;
nix2 = nc2;
rcc = 0.0;
if (r_mode != 1)
{
if ((lix != IDX_NONE) && (category == CLASS_C1))
{
nix1--;
rcc = nc1;
rcc = log(nix1/rcc);
}
if ((lix != IDX_NONE) && (category == CLASS_C2))
{
nix2--;
rcc = nc2;
rcc = log(rcc/nix2);
}
}
constant_term += rcc;
icntr = 0;
llr = malloc((nix1+nix2)*sizeof(struct LLR));
if (llr != (struct LLR *) 0)
{
for (i = 0; i < nc1+nc2; i++)
{
if (cmp_index(i, lix, category, nc1, nc2) != 0)
{
if (i < nc1)
{
d1 = c1_c1_neighbors[i];
d2 = c1_c2_neighbors[i];
llr[icntr].category = CLASS_C1;
}
else
{
d1 = c2_c1_neighbors[i-nc1];
d2 = c2_c2_neighbors[i-nc1];
llr[icntr].category = CLASS_C2;
}
if (d2 != 0.0)
{
if (d1 != 0.0)
llr[icntr].llr = d*log(d1/d2)+constant_term;
else
llr[icntr].llr = -FLT_MAX;
}
else
{
if (d1 == 0.0)
llr[icntr].llr = constant_term;
else
llr[icntr].llr = FLT_MAX;
}
#if 0
if ((fdbg != (FILE *) 0) && (lix == 8) && (category == CLASS_C2))
fprintf(fdbg, "find_optimal_threshold(); lix: %5d; i: %5d; "
"d1: %12.5g; d2: %12.5g; det: %12.5g; constant_term: %12.5g\n",
lix+1, i, d1, d2, det, constant_term);
#endif
icntr++;
}
}
qsort(llr, nix1+nix2, sizeof(struct LLR), compare_likelihoods);
/*
Set threshold to large negative value. Then, everything is
classified as c2, hence misclassification count equals nc1.
*/
threshold = -FLT_MAX;
cmc = nix1;
mc = nix1;
/*
Set threshold between each two neighboring log-likelihood ratios
in turn, and adjust misclassification count.
*/
ct = threshold;
for (i = 0; i < nix1+nix2-1; i++)
{
nxct = 0.5*(llr[i].llr+llr[i+1].llr);
#if 0
if ((fdbg != (FILE *) 0) && (lix == 8) && (category == CLASS_C2))
fprintf(fdbg, "find_optimal_threshold(); lix: %5d; threshold: %12.5g; nxct: %12.5g; llr[%5d]: "
"%12.5g; llr[%5d]: %12.5g\n", lix+1, threshold, nxct,
i, llr[i].llr, i+1, llr[i+1].llr);
#endif
/*
Vector i, and all vectors having the same llr as i, may have
changed classification from c2 to c1. If that is the case,
update error count.
*/
if (nxct != ct)
{
done = 0;
for (j = i; (j >= 0) && (done == 0); j--)
{
if (llr[j].llr == llr[i].llr)
{
if (llr[j].category == CLASS_C1)
mc -= 1;
else
mc += 1;
}
else
done = 1;
}
}
/*
If this is the best result so far, update the optimal
threshold.
*/
if (mc < cmc)
{
cmc = mc;
threshold = nxct;
}
ct = nxct;
}
free(llr);
}
else if (fdbg != (FILE *) 0)
{
err_msg = strerror(errno);
fprintf(fdbg, "find_optimal_threshold(); malloc() failure; %s\n", err_msg);
}
return threshold;
}
/*
Calculate optimal thresholds for all pairs of classes.
In case of malloc() error, return (float **) 0 and set errno.
*/
static float **get_optimal_thresholds(int nv, int d, int c, int *nd,
float *det, int kmin, int kmax,
float **neighbors, FILE *fdbg)
{
int status = 0;
int length;
int k;
int c1;
int c2;
int cix;
int cix1;
int cix2;
int idx;
float threshold;
float **optimal_thresholds = (float **) 0;
length = (c-1)*c;
length = length/2;
idx = 0;
optimal_thresholds = malloc((kmax-kmin+1)*sizeof(float *));
if (optimal_thresholds != (float **) 0)
{
for (k = 0; (k <= kmax-kmin) && (status == 0); k++)
{
cix = 0;
optimal_thresholds[k] = malloc(length*sizeof(float));
if (optimal_thresholds[k] != (float *) 0)
{
cix1 = 0; /* cumulative cardinality of classes 0..c1-1 */
for (c1 = 0; c1 < c-1; c1++)
{
cix2 = cix1+nd[c1]; /* cumulative cardinality of classes 0..c2-1 */
for (c2 = c1+1; c2 < c; c2++)
{
threshold =
find_optimal_threshold(d, nd[c1], nd[c2], det[2*c1], det[2*c1+1],
det[2*c2], det[2*c2+1], &neighbors[c1][idx+cix1],
&neighbors[c1][idx+cix2], &neighbors[c2][idx+cix1],
&neighbors[c2][idx+cix2],
IDX_NONE, CLASS_NONE, fdbg);
if (fdbg != (FILE *) 0)
fprintf(fdbg, "get_optimal_thresholds(); c1: %5d; c2: %5d; "
"k: %5d; threshold: %12.5g\n", c1+1, c2+1, k+1, threshold);
optimal_thresholds[k][cix++] = threshold;
cix2 += nd[c2];
}
cix1 += nd[c1];
}
}
else
{
status = -1;
optimal_thresholds = (float **) mx_free((void **) optimal_thresholds, k);
}
idx += nv;
}
}
return optimal_thresholds;
}
static void log_thresholds(FILE *fdbg, char *method, float **optimal_thresholds,
int kmin, int kmax, int c)
{
int cix;
int k;
int c1;
int c2;
if (fdbg != (FILE *) 0)
{
for (k = kmin; k <= kmax; k++)
{
cix = 0;
for (c1 = 0; c1 < c-1; c1++)
for (c2 = c1+1; c2 < c; c2++)
{
fprintf(fdbg, "%s; optimal_thresholds[%d][%d]: %12.5g\n",
method, k, cix, optimal_thresholds[k-kmin][cix]);
cix++;
}
}
}
}
/*
Calculate k-NN Bayes error leave-one-out estimate for set of 'nv'
vectors in 'x'. The vectors are 'd'-dimensional and the number of
classes is 'c', with 'nd[i]' vectors in class i and
covariance/determinant information in 'sigma' and 'det',
respectively. The estimates are calculated for k between 'kmin' and
'kmax', inclusive. 'distances' are pairwise distances between all
vectors in 'x'. 'neighbors' are resubstitution neighbors. 'fpn' has
FN (false negatives) per class. Send messages to 'outdev', debug
information to 'fdbg'.
In case of error, return (float *) 0.
*/
static float *l_estimate(float **x, int nv, int d, int c, int *nd,
float **sigma, float *det, int kmin, int kmax,
float **mean_distances, float **distances,
float **neighbors, int approximate_l_mode,
float ***fpn, FILE *outdev, FILE *fdbg)
{
int ivx;
int k;
int c1;
int c2;
int assigned_class = 0; /* assigned class for vector 'i' */
int true_class; /* true class for vector 'i' */
int class_count; /* total number of vectors in classes 0..true_class */
int offx; /* offset of first vector in class true_class */
int idx;
int icx1;
int icx2;
int cix;
int rx;
float llr;
float threshold;
float d1;
float d2;
float edet;
float *xmean;
float *class_mean = (float *) 0;
float *xmult;
float *c1_c1_neighbors;
float *c1_c2_neighbors;
float *c2_c1_neighbors;
float *c2_c2_neighbors;
float *class_neighbors;
float *estimate = (float *) 0;
float **xpn;
float **optimal_thresholds;
hash_t *eliminated_classes;
true_class = 0;
class_neighbors = neighbors[0]; /* initialize */
class_count = nd[0];
offx = 0;
optimal_thresholds = get_optimal_thresholds(nv, d, c, nd, det, kmin, kmax,
neighbors, fdbg);
if (optimal_thresholds != (float **) 0)
{
log_thresholds(fdbg, "l_estimate()", optimal_thresholds, kmin, kmax, c);
ee_init(c, kmin, kmax, fpn, &estimate);
xpn = *fpn;
xmean = malloc(d*sizeof(float)); /* store difference X-M; see R&D vol. III, 05/01/2002 */
if (xmean != (float *) 0)
{
xmult = malloc(d*sizeof(float)); /* store product (X-M)*sigma; see R&D vol. III, 05/01/2002 */
if (xmult != (float *) 0)
{
class_mean = mest(&x[offx], d, nd[true_class]);
for (ivx = 0; (ivx < nv) && (class_neighbors != (float *) 0); ivx++)
{
if (outdev != (FILE *) 0)
fprintf(outdev, "Classifying vector %10d, for all k.\n", ivx+1);
if (ivx == class_count)
{
vx_free(class_mean);
offx += nd[true_class];
true_class++;
class_mean = mest(&x[offx], d, nd[true_class]);
class_count += nd[true_class];
}
fvec_diff(x[ivx], class_mean, xmean, d, (int *) 0);
fvec_smx(xmean, sigma[true_class], d, xmult);
if (r_mode == 1)
class_neighbors = neighbors[true_class];
else
class_neighbors =
recalculate_class_neighbors(kmin, kmax, x, d, nv, distances,
xmult, mean_distances[true_class][ivx],
true_class, c, nd, ivx, approximate_l_mode, fdbg);
if (class_neighbors != (float *) 0)
{
edet = recalculate_determinant(det[2*true_class], det[2*true_class+1],
d, nd[true_class], ivx,
mean_distances[true_class][ivx],
approximate_l_mode);
if (fdbg != (FILE *) 0)
fprintf(fdbg, "l_estimate(); true_class: %5d; det: %12.5g; edet: %12.5g; L edet: %12.5g\n",
true_class+1, det[2*true_class], det[2*true_class+1], edet);
for (k = 0; k <= kmax-kmin; k++)
{
idx = k*nv;
eliminated_classes = hashCreate();
cix = 0;
for (c1 = 0; c1 < c-1; c1++)
{
icx1 = get_cardinality(c1, nd);
for (c2 = c1+1; c2 < c; c2++)
{
if ((hashGetInt(eliminated_classes, c1) == INT_MAX) &&
(hashGetInt(eliminated_classes, c2) == INT_MAX))
{
icx2 = get_cardinality(c2, nd);
/*
'threshold' is optimal threshold for classes
'c1', 'c2', excluding vector 'ivx'.
*/
if (true_class == c1)
{
c1_c1_neighbors = &(class_neighbors[idx+icx1]);
c2_c1_neighbors = &(class_neighbors[idx+icx2]);
/*
TBD: perhaps precompute and store
cardinalities, for minor
speedup. 08/29/2001
*/
rx = k*nv+get_cardinality(c1, nd);
c1_c2_neighbors = &neighbors[c2][rx];
rx = k*nv+get_cardinality(c2, nd);
c2_c2_neighbors = &neighbors[c2][rx];
threshold =
find_optimal_threshold(d, nd[c1], nd[c2], 1.0, edet,
det[2*c2], det[2*c2+1],
c1_c1_neighbors, c2_c1_neighbors,
c1_c2_neighbors, c2_c2_neighbors,
ivx-get_cardinality(true_class, nd),
CLASS_C1, fdbg);
d1 = c1_c1_neighbors[ivx-icx1];
d2 = c1_c2_neighbors[ivx-icx1];
if ((r_mode == 1) || (approximate_l_mode == 1))
assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2,
1.0, edet, det[2*c2], det[2*c2+1],
threshold);
else
assigned_class = assign_class(d, c1, c2, nd[c1]-1, nd[c2], d1, d2,
1.0, edet, det[2*c2], det[2*c2+1],
threshold);
}
else if (true_class == c2)
{
c1_c2_neighbors = &(class_neighbors[idx+icx1]);
c2_c2_neighbors = &(class_neighbors[idx+icx2]);
rx = k*nv+get_cardinality(c1, nd);
c1_c1_neighbors = &neighbors[c1][rx];
rx = k*nv+get_cardinality(c2, nd);
c2_c1_neighbors = &neighbors[c1][rx];
threshold =
find_optimal_threshold(d, nd[c1], nd[c2], det[2*c1],
det[2*c1+1], 1.0, edet,
c1_c1_neighbors, c2_c1_neighbors,
c1_c2_neighbors, c2_c2_neighbors,
ivx-get_cardinality(true_class, nd),
CLASS_C2, fdbg);
d1 = c2_c1_neighbors[ivx-icx2];
d2 = c2_c2_neighbors[ivx-icx2];
if ((r_mode == 1) || (approximate_l_mode == 1))
assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2,
det[2*c1], det[2*c1+1], 1.0, edet,
threshold);
else
assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2]-1, d1, d2,
det[2*c1], det[2*c1+1], 1.0, edet,
threshold);
}
else
{
/*
If 'ivx' does not belong to either of the
two classes being classified against, then
use the optimized threshold value.
*/
d1 = neighbors[c1][idx+ivx];
d2 = neighbors[c2][idx+ivx];
threshold = optimal_thresholds[k][cix];
assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2,
det[2*c1], det[2*c1+1], det[2*c2],
det[2*c2+1], threshold);
}
if (fdbg != (FILE *) 0)
{
llr = calculate_llr(d, nd[c1], nd[c2], d1, d2,
det[2*c1], det[2*c1+1], det[2*c2],
det[2*c2+1]);
fprintf(fdbg, "l_estimate(); ivx: %4d; class: %3d; "
"c1: %3d; c2: %3d; k: %5d; d1: %10.5g; d2: %10.5g; "
"llr: %11.5g; thd: %11.5g; ac: %3d\n",
ivx+1, true_class+1, c1+1, c2+1, k+1, d1, d2,
llr, threshold, assigned_class+1);
}
if (assigned_class == c1)
hashPutInt(eliminated_classes, c2, c2);
else
hashPutInt(eliminated_classes, c1, c1);
}
cix++;
}
}
if (assigned_class != true_class)
{
estimate[k] += 1.0;
xpn[true_class][k] += 1.0;
if (fdbg != (FILE *) 0)
fprintf(fdbg, "l_estimate(); misclassified ivx: %5d; k: %5d; "
"true_class: %3d; assigned_class: %3d\n",
ivx+1, k+1, true_class+1, assigned_class+1);
}
hashDelete(eliminated_classes);
}
if (r_mode != 1)
free(class_neighbors);
}
else
estimate = (float *) 0;
}
mx_free((void **) optimal_thresholds, kmax-kmin+1);
vx_free(xmult);
vx_free(class_mean);
}
}
vx_free(xmean);
}
return estimate;
}
/*
Resubstitution Bayes error estimate. The method uses threshold
optimization as described in Fukunaga, section 7.4, paragraph 'The
threshold for non-normal distributions'.
*/
static float *r_estimate(float **x, int nv, int d, int c, int *nd,
float *det, int kmin, int kmax,
float **distances, float **neighbors,
float ***fpn, FILE *outdev, FILE *fdbg)
{
int i;
int k;
int c1;
int c2;
int assigned_class = 0; /* assigned class for vector 'i' */
int true_class; /* true class for vector 'i' */
int class_count; /* total number of vectors in classes 0..true_class */
int cix;
int idx;
float threshold;
float d1;
float d2;
float llr;
float *estimate;
float **xpn;
float **optimal_thresholds;
hash_t *eliminated_classes;
if (outdev != (FILE *) 0)
fprintf(outdev, "Calculating resubstitution error...");
true_class = 0;
class_count = nd[0];
/*
Calculate optimal threshold for each pair of classes.
*/
optimal_thresholds = get_optimal_thresholds(nv, d, c, nd, det, kmin, kmax,
neighbors, fdbg);
log_thresholds(fdbg, "r_estimate()", optimal_thresholds, kmin, kmax, c);
ee_init(c, kmin, kmax, fpn, &estimate);
xpn = *fpn;
for (i = 0; i < nv; i++)
{
if (i == class_count)
class_count += nd[++true_class];
for (k = 0; k <= kmax-kmin; k++)
{
eliminated_classes = hashCreate();
idx = k*nv+i;
cix = 0;
for (c1 = 0; c1 < c-1; c1++)
for (c2 = c1+1; c2 < c; c2++)
{
if ((hashGetInt(eliminated_classes, c1) == INT_MAX) &&
(hashGetInt(eliminated_classes, c2) == INT_MAX))
{
d1 = neighbors[c1][idx];
d2 = neighbors[c2][idx];
threshold = optimal_thresholds[k][cix];
assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2,
det[2*c1], det[2*c1+1], det[2*c2],
det[2*c2+1], threshold);
if (fdbg != (FILE *) 0)
{
llr = calculate_llr(d, nd[c1], nd[c2], d1, d2,
det[2*c1], det[2*c1+1], det[2*c2],
det[2*c2+1]);
fprintf(fdbg, "r_estimate(); ivx: %4d; class: %3d; c1: %3d; c2: %3d; "
"k: %4d; d1: %10.5g; d2: %10.5g; det: %10.5g; edet: %10.5g; "
"llr: %11.5g; thd: %10.5g; ac: %3d\n", i+1, true_class+1,
c1+1, c2+1, k+1, d1, d2, det[2*true_class],
det[2*true_class+1], llr, threshold, assigned_class+1);
}
if (assigned_class == c1)
hashPutInt(eliminated_classes, c2, c2);
else
hashPutInt(eliminated_classes, c1, c1);
}
cix++;
}
if (assigned_class != true_class)
{
estimate[k] += 1.0;
xpn[true_class][k] += 1.0;
if (fdbg != (FILE *) 0)
fprintf(fdbg, "r_estimate(); misclassified ivx: %5d; k: %5d; "
"true_class: %3d; assigned_class: %3d\n",
i+1, k+1, true_class+1, assigned_class+1);
}
hashDelete(eliminated_classes);
}
}
for (k = 0; k <= kmax-kmin; k++)
free(optimal_thresholds[k]);
free(optimal_thresholds);
if (outdev != (FILE *) 0)
fprintf(outdev, " done.\n");
return estimate;
}
/*
Save Bayes error estimate 'bayes_error' and misclassification rate
per class 'fpn', produced by 'method', for k between 'kmin' and
'kmax' in 'fname'. Return 0 for success, -1 in case of failure and
set errno.
*/
static int save_estimate(char *method, float *bayes_error, float **fpn,
int c, int kmin, int kmax, char *fname)
{
int retval = 0;
int j;
int k;
FILE *fptr;
/*
If file exists, append to it, otherwise create it.
*/
fptr = fopen(fname, "a");
if (fptr == (FILE *) 0)
retval = -1;
else
{
for (k = 0; k < kmax-kmin+1; k++)
{
fprintf(fptr, "%s\t%5d\t%10.2f", method, k+kmin, bayes_error[k]);
for (j = 0; j < c; j++)
fprintf(fptr, "\t%10.2f", fpn[j][k]);
fprintf(fptr, "\n");
}
fclose(fptr);
}
return retval;
}
/*
Launch Bayes error estimation for TDS.
'errc' is output error code. If 'debug' is not 0, send debugging
information to 'pcp.dbg' file.
For 'approximate_l_mode' == 1, use approximate L mode.
*/
void p_bayes(int *errc, int approximate_l_mode, int debug)
{
int i;
int j;
int kmin;
int kmax;
int status;
int idx;
int knn;
int dflt;
int mink;
int maxk;
float factor;
char *fname;
char *msg;
float *det;
float *bayes_l;
float *bayes_r;
float **distances;
float **mean_distances;
float **neighbors;
float **fpn;
FILE *outdev;
FILE *fdbg;
unsigned int seed;
clear_screen();
fname = (char *) 0;
if ((r_mode == 1) && (approximate_l_mode == 1))
r_mode = 0;
*errc = 0;
fdbg = (FILE *) 0;
if (debug)
fdbg = fopen(PCP_DBG, "a");
outdev = stdout;
dflt = 1;
mink = 1;
maxk = ivec_min(tds->nd, tds->c)-1;
cursor_on();
kmin = input_integer(stdin, outdev, " First nearest neighbor [1]:", PCP_QLEN, &dflt, &mink, &maxk);
mink = kmin;
msg = malloc((PCP_QLEN+1)*sizeof(char));
snprintf(msg, PCP_QLEN+1, " Last nearest neighbor [%d]:", maxk);
dflt = maxk;
kmax = input_integer(stdin, outdev, msg, PCP_QLEN, &dflt, &mink, &maxk);
free(msg);
seed = input_seed(stdin, outdev);
cursor_off();
inverse_video();
knn = kmax-kmin+1;
status = dataset_sigma(tds, errc);
if (!status)
{
det = malloc(2*tds->c*sizeof(float));
if (det)
{
factor = log(2.0);
idx = 0;
for (i = 0; i < tds->c; i++)
{
det[idx] = tds->det[idx];
idx++;
det[idx] = tds->det[idx]/factor;
idx++;
}
distances = calculate_distances(tds->x, tds->nv, tds->d, tds->c, tds->nd,
tds->sigma, outdev);
neighbors = calculate_neighbors(distances, tds->nv, kmin, kmax, tds->c,
tds->nd, outdev, fdbg);
mean_distances = calculate_mean_distances(tds->x, tds->nv, tds->d, tds->c,
tds->nd, tds->sigma);
bayes_r = r_estimate(tds->x, tds->nv, tds->d, tds->c, tds->nd, det, kmin,
kmax, distances, neighbors, &fpn, outdev, fdbg);
/*
'fpn' is class-conditional error rate, calculated as FN/CC (false
negatives/class cardinality), according to equation (2.1) from
B. D. Ripley, 'Pattern Recognition and Neural Networks', Cambridge
University Press, Cambridge, 1996, section 2.1.
*/
if (bayes_r)
{
for (i = 0; i < knn; i++)
{
bayes_r[i] = 100.0*(bayes_r[i]/tds->nv);
for (j = 0; j < tds->c; j++)
fpn[j][i] = 100.0*(fpn[j][i]/tds->nd[j]);
}
}
fname = strdup(PCP_BEE);
unlink(fname);
status = save_estimate(RESUBSTITUTION, bayes_r, fpn, tds->c, kmin, kmax, fname);
ee_free(tds->c, fpn, bayes_r);
bayes_l = l_estimate(tds->x, tds->nv, tds->d, tds->c, tds->nd, tds->sigma,
det, kmin, kmax, mean_distances, distances, neighbors,
approximate_l_mode, &fpn, outdev, fdbg);
if (bayes_l)
{
for (i = 0; i < knn; i++)
{
bayes_l[i] = 100.0*(bayes_l[i]/tds->nv);
for (j = 0; j < tds->c; j++)
fpn[j][i] = 100.0*(fpn[j][i]/tds->nd[j]);
}
status = save_estimate(LEAVE_ONE_OUT, bayes_l, fpn, tds->c, kmin, kmax, fname);
ee_free(tds->c, fpn, bayes_l);
}
else
*errc = 101; /* TBD: what is this error code? */
vx_free(det);
}
else
*errc = errno;
}
free(fname);
}
static int p_disp_alloc(int nl, float **rval, float **lval)
{
int status = 0;
*rval = malloc(nl*sizeof(float));
if (rval)
{
*lval = malloc(nl*sizeof(float));
if (!lval)
{
status = -1;
vx_free(*rval);
}
}
else
status = -1;
return status;
}
/*
Display Bayes error estimation results.
*/
void p_disp_bayes(int *errc, char **xname)
{
int i;
int nl;
int llen;
int cntr;
int rcntr;
int lcntr;
int status;
int kmin;
char *fname;
char *line;
float *rval;
float *lval;
char **tokens;
FILE *fptr;
status = 0;
clear_screen();
cursor_on();
fname = input_filename(PCP_UMSG_FILENAME, PCP_BEE, stdout);
inverse_video();
kmin = 0;
nl = file_info(fname, &llen, (int *) 0, '\0');
rcntr = 0;
lcntr = 0;
cntr = 0;
if (nl >= 0)
{
line = malloc((llen+2)*sizeof(char));
status = p_disp_alloc(nl, &rval, &lval);
if (!status)
{
fptr = fopen(fname, "r");
if (fptr)
{
while (fgets(line, llen+2, fptr))
{
tokens = str_tokenize(line, WHITESPACE);
if (cntr == 0)
kmin = atoi(tokens[1]);
if (!strcmp(tokens[0], RESUBSTITUTION))
rval[rcntr++] = atof(tokens[2]);
else if (!strcmp(tokens[0], LEAVE_ONE_OUT))
lval[lcntr++] = atof(tokens[2]);
str_free(tokens);
cntr++;
}
free(line);
status = fclose(fptr);
if (status == -1)
{
*errc = errno;
*xname = strdup(fname);
}
else
{
if (lcntr == rcntr)
{
printf("%s", PCP_XLINE);
printf("%s", PCP_EMPTY_LINE);
printf("| B A Y E S E R R O R E S T I M A T I O N |\n");
printf("%s", PCP_EMPTY_LINE);
printf("%s", PCP_XLINE);
printf("| k | Leave-one-out error estimate | Resubstitution error estimate |\n");
printf("%s", PCP_XLINE);
for (i = 0; i < rcntr; i++)
{
printf("| %10d | %5.2f | %5.2f |\n",
kmin+i, lval[i], rval[i]);
}
printf("%s", PCP_XLINE);
pwait();
}
else
*errc = PERR_BAD_INPUT_FILE;
}
}
else
{
*errc = errno;
*xname = strdup(fname);
}
vx_free(rval);
vx_free(lval);
}
else
*errc = errno;
}
else
{
*errc = errno;
*xname = strdup(fname);
}
free(fname);
}
/*
Calculate leave-one-out k-NN estimate of Bayes error for set of 'nv'
vectors in 'x'. The vectors are 'd'-dimensional and the number of
classes is 'c', with 'nd[i]' vectors in class i. 'sigma[i]' is
covariance matrix of class i in SSM. 'det[i]' is natural log of the
determinant of the covariance matrix. The estimates are calculated
for k between 'kmin' and 'kmax', inclusive. Send messages to
'outdev', debug information to 'fdbg'.
In case of error, return (float *) 0 and set 'errc'.
*/
float *L_error(float **x, int nv, int d, int c, int *nd,
float **sigma, float *det, int kmin, int kmax,
FILE *outdev, FILE *fdbg, int *errc)
{
int i;
int status;
int idx;
int approximate_l_mode = 1;
float efactor;
float *det2;
float *bayes_l = (float *) 0;
float **distances;
float **mean_distances;
float **neighbors;
float **fpn;
status = 0;
/*
Convert 'det' to the format required by l_estimate().
*/
if (status == 0)
{
det2 = malloc(c*2*sizeof(float));
if (det2 != (float *) 0)
{
idx = 0;
efactor = 1.0/log(2.0);
for (i = 0; i < c; i++)
{
det2[idx++] = 1.0;
det2[idx++] = det[i]*efactor;
}
}
else
status = -1;
}
if (status == 0)
{
distances = calculate_distances(x, nv, d, c, nd, sigma, outdev);
if (distances != (float **) 0)
{
neighbors = calculate_neighbors(distances, nv, kmin, kmax, c, nd, outdev, fdbg);
if (neighbors != (float **) 0)
{
mean_distances = calculate_mean_distances(x, nv, d, c, nd, sigma);
if (mean_distances != (float **) 0)
{
bayes_l = l_estimate(x, nv, d, c, nd, sigma, det2, kmin, kmax,
mean_distances, distances, neighbors,
approximate_l_mode, &fpn, outdev, fdbg);
if (bayes_l != (float *) 0)
{
for (i = 0; i < kmax-kmin+1; i++)
bayes_l[i] = 100.0*(bayes_l[i]/nv);
}
mx_free((void **) mean_distances, c);
}
else
*errc = errno;
mx_free((void **) neighbors, c);
}
else
*errc = errno;
mx_free((void **) distances, nv);
}
else
*errc = errno;
}
else
*errc = errno;
free(det2);
return bayes_l;
}
/*
Calculate inverted covariance matrices of `c' classes in `x' in SSM
format.
*/
float **calc_sig(float **x, int c, int d, int *nd, float **det, int *errc)
{
int status;
int i;
int offset;
float dsign;
float dexp;
float **sig;
float **sigma;
float **inverse; /* inverted covariance matrix for one class */
float *x_det; /* array of natural log of determinants */
status = 0;
sig = (float **) 0;
x_det = malloc(c*sizeof(float));
if (x_det)
{
sig = calloc(c, sizeof(float *));
if (sig)
{
/*
Estimate and invert covariance matrices in the transformed space.
*/
offset = 0;
for (i = 0; (i < c) && !status; i++)
{
sigma = cest(&x[offset], d, nd[i], COVARIANCE_MATRIX);
/*
Invert.
*/
inverse = fmx_inv(sigma, d, &dsign, &dexp, errc);
mx_free((void **) sigma, d);
if (!inverse)
{
status = -1;
sig = (float **) mx_free((void **) sig, d);
}
else
{
/*
Convert 'sigma' to SSM, as required by L_error().
*/
sig[i] = fmx_ssm(inverse, d);
mx_free((void **) inverse, d);
if (!sig[i])
{
sig = (float **) mx_free((void **) sig, i);
status = -1;
*errc = errno;
}
else
x_det[i] = dexp;
}
offset += nd[i];
}
if (!status && det)
*det = x_det;
}
}
return sig;
}
syntax highlighted by Code2HTML, v. 0.9.1