/* File name: estimate_error.c Created by: Ljubomir Buturovic Created: 07/23/2001 Purpose: Bayes error estimation using Fukunaga's k-NN method. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* The principal functions in this file are l_estimate() and r_estimate(). They provide upper and lower bounds, respectively, of the Bayes error estimate, using the k-NN method described in Keinosuke Fukunaga, Introduction to Statistical Pattern Recognition, Second Edition, Chapter 7, Morgan Kaufmann, San Francisco, 1990. The method implemented in l_estimate() is fully leave-one-out. For the classification of each input vector, the covariance matrix of its' class is reestimated by removing the vector, and all nearest neighbors to the class are recomputed. As a result, the computational complexity of the method is high, as are its' memory requirements (roughly proportional to N^2, where N is the number of input vectors). The fully leave-one-out method implemented in l_estimate() is known as 'x'. The approximate method, achieved for 'approximate_l_mode' == 1 (see below), is known as 'l'. It should produce results identical to FORTRAN implementation of the same method, known as 'a'. */ /* TBD: seed unused? */ static char rcsid[] = "$Id: estimate_error.c,v 1.138 2005/05/06 14:40:03 ljubomir Exp $"; #include #include #include #include #include #include #include "pcp.h" #include "hash_util.h" #include "lmat.h" #include "lau.h" #include "knn.h" #include "pau.h" #include "estimate_error.h" static int r_mode = 0; /* The code uses two variables, 'r_mode' and 'approximate_l_mode', which control two different development/testing modes for l_estimate() function. 'r_mode' == 1 turns on the R estimation compatibility (i.e., l_estimate() will return the same results as r_estimate(). 'approximate_l_mode' == 1 turns on compatibility mode with approximate L estimate (identical to PCP FORTRAN function ee1()). This mode uses all available samples to estimate covariance matrices (i.e., the covariance matrix estimates are not leave-one-out, but resubstitution estimates). The following matrix determines the method executed by l_estimate(): approximate_l_mode r_mode method 0 0 'x' 0 1 'r' 1 0 'l' 1 1 'l' */ /* LLR comparison function. */ static int compare_likelihoods(const void *llr1, const void *llr2) { int retval = -1; struct LLR *l1; struct LLR *l2; l1 = (struct LLR *) llr1; l2 = (struct LLR *) llr2; if (l1->llr > l2->llr) retval = 1; else if (l1->llr == l2->llr) retval = 0; return retval; } /* Memory allocation for r_estimate(), l_estimate(). */ static void ee_init(int c, int kmin, int kmax, float ***fpn, float **estimate) { int i; int k; float **xpn; float *ex; xpn = malloc(c*sizeof(float *)); for (i = 0; i < c; i++) { xpn[i] = malloc((kmax-kmin+1)*sizeof(float)); for (k = kmin; k <= kmax; k++) xpn[i][k-kmin] = 0.0; } *fpn = xpn; ex = malloc((kmax-kmin+1)*sizeof(float)); for (k = kmin; k <= kmax; k++) ex[k-kmin] = 0.0; *estimate = ex; } /* Free memory allocated by ee_init(). */ static void ee_free(int c, float **fpn, float *estimate) { int i; for (i = 0; i < c; i++) free(fpn[i]); free(fpn); free(estimate); } /* Calculate all pairwise distances for 'nv' vectors in 'x' using Mahalanobis distance measure. 'sigma' is array of 'c' inverse covariance matrices in Symmetric Storage Mode, where 'c' is number of classes. 'd' is dimension of input vectors. 'nd[i]' are class cardinalities. Returns an array of 'nv' memory pointers. 'distances[i][j]' is Mahalanobis distance between X[i] and X[j]. In case of malloc() error, return (float **) 0 and set errno. */ static float **calculate_distances(float **x, int nv, int d, int c, int *nd, float **sigma, FILE *outdev) { int i; int j; int category; int class_count; float *matrix; float *mxv; float *mxe; float **distances; distances = malloc(nv*sizeof(float *)); if (distances != (float **) 0) { mxv = malloc(d*sizeof(float)); if (mxv != (float *) 0) { mxe = malloc(d*sizeof(float)); if (mxe != (float *) 0) { for (i = 0; i < nv; i++) { category = 0; class_count = nd[0]; matrix = sigma[category]; if (outdev != (FILE *) 0) fprintf(outdev, "Calculating distances for vector %7d.\n", i+1); distances[i] = malloc(nv*sizeof(float)); if (distances[i] != (float *) 0) { for (j = 0; j < nv; j++) { if (j == class_count) { class_count += nd[++category]; matrix = sigma[category]; } distances[i][j] = mahalanobis(x[i], x[j], d, matrix, mxv, mxe); } } else distances = (float **) mx_free((void **) distances, i); } vx_free(mxe); } vx_free(mxv); } } return distances; } /* Calculate distances from all 'nv' vectors in 'x' to median vectors of all classes, using Mahalanobis distance measure. sigma[i] is inverse covariance matrices of class 'i' in SSM. 'd' is dimension of input vectors, 'c' is number of categories with 'nd[i]' vectors in category i. Returns an array of 'c' memory pointers. 'distances[i]' points to array of 'nv' distances to median vector of class i. In case of error, return (float **) 0 and set errno. */ static float **calculate_mean_distances(float **x, int nv, int d, int c, int *nd, float **sigma) { int i; int j; int m; int cntr; int status = 0; float *mean; float *mxv; float *mxe; float **distances = (float **) 0; cntr = 0; mean = malloc(d*sizeof(float)); if (mean != (float *) 0) { mxv = malloc(d*sizeof(float)); if (mxv != (float *) 0) { mxe = malloc(d*sizeof(float)); if (mxe != (float *) 0) { distances = malloc(c*sizeof(float *)); if (distances != (float **) 0) { for (i = 0; (i < c) && (status == 0); i++) { /* Estimate mean vector for class i. */ for (j = 0; j < d; j++) { mean[j] = 0.0; for (m = 0; m < nd[i]; m++) mean[j] += x[cntr+m][j]; mean[j] = mean[j]/nd[i]; } /* Done - now calculate Mahalanobis distances from all vectors in 'x' to 'mean'. */ distances[i] = malloc(nv*sizeof(float)); if (distances[i] != (float *) 0) { for (m = 0; m < nv; m++) distances[i][m] = mahalanobis(x[m], mean, d, sigma[i], mxv, mxe); cntr += nd[i]; } else { distances = (float **) mx_free((void **) distances, i); status = -1; } } vx_free(mxe); } vx_free(mxv); } } vx_free(mean); } return distances; } /* Calculate k-nearest neighbor distances for all 'nv' vectors in 'x', for all 'c' classes, with 'nd[i]' vectors per class. Returns an array of pointers to class neighbors, one per class. Each pointer points to the following memory structure: +--------------+ for nv vectors: | 1 | k-th neighbor | 2 | k = kmin from class c | ... | | nv | +--------------+ for nv vectors: | 1 | k-th neighbor | 2 | k = kmin+1 from class c | ... | | nv | +--------------+ | | | ... | | | +--------------+ for nv vectors: | 1 | k-th neighbor | 2 | k = kmax from class c | ... | | nv | +--------------+ */ static float **calculate_neighbors(float **distances, int nv, int kmin, int kmax, int c, int *nd, FILE *outdev, FILE *fdbg) { int i; int j; int k; int m; int idx; int cidx; int cmax; int cnv; /* total number of vectors in classes 0..i */ float *dist; float *nx; float **neighbors; neighbors = malloc(c*sizeof(float *)); if (neighbors != (float **) 0) { cmax = ivec_max(nd, c); /* +1 in order to use finsort() */ dist = malloc((cmax+1)*sizeof(float)); cnv = 0; if (dist != (float *) 0) { for (i = 0; i < c; i++) { if (outdev != (FILE *) 0) fprintf(outdev, "Calculating neighbors from class %7d.\n", i+1); cidx = 0; nx = malloc((kmax-kmin+2)*nv*sizeof(float)); if (nx != (float *) 0) { for (m = 0; m < nv; m++) { idx = cidx; for (j = 0; j < nd[i]; j++) dist[j] = distances[m][cnv+j]; finsort(dist, nd[i]); if (fdbg != (FILE *) 0) for (j = 0; j < nd[i]; j++) fprintf(fdbg, "calculate_neighbors(); target class: %3d; vector: %5d; dist[%5d]: %12.5g\n", i+1, m+1, j, dist[j]); for (k = kmin-1; k <= kmax; k++) { nx[idx] = dist[k]; idx += nv; } cidx++; } if (fdbg != (FILE *) 0) { for (j = 0; j < nv*(kmax-kmin+1); j++) fprintf(fdbg, "calculate_neighbors(); target class: %3d; nx[%5d]: %12.5g\n", i+1, j, nx[j]); } neighbors[i] = nx; cnv += nd[i]; } else { neighbors = (float **) 0; dist = vx_free(dist); neighbors = (float **) mx_free((void **) neighbors, i); } } vx_free(dist); } } return neighbors; } /* Get cardinality of classes 0..clx-1, with 'nd[i]' vectors per class. TBD: replace with ivec_sum(). */ static int get_cardinality(int clx, int *nd) { int i; int cdx = 0; for (i = 0; i < clx; i++) cdx += nd[i]; return cdx; } /* Determine distances from all 'd'-dimensional vectors in 'x' to nearest neighbors in 'category', excluding vector 'icx' from 'category'. The k-NN distances are calculated for k between 'kmin' and 'kmax' and are obtained by adjusting 'distances', using formulae (1), (2) and (3) of 05/01/2002, and (1) of 04/30/2002, from R&D, vol. III. 'xmult' is the product (X-M)*sigma^(-1) from formula (2). 'dxk' is distance from vector 'icx' to the centroid of 'category'. 'c' is number of classes, 'nd' has class cardinalities. The index 'icx' is 0-based, and the range is 0 to nv-1. Memory layout of the neighbors is as shown for one class in calculate_neighbors(). In case of malloc() error, return (float *) 0. */ static float *recalculate_class_neighbors(int kmin, int kmax, float **x, int d, int nv, float **distances, float *xmult, float dkx, int category, int c, int *nd, int icx, int approximate_l_mode, FILE *fdbg) { int i; int j; int mx; int nx; int idx; int jcntr; char *err_msg; float *neighbors = (float *) 0; float *dist = (float *) 0; float *xdiff; /* xmult*(Xa-Xb) from formula (2) */ float distance; float dot; float fnx; float factor; dist = malloc((nd[category])*sizeof(float)); if (dist != (float *) 0) { xdiff = malloc(d*sizeof(float)); if (xdiff != (float *) 0) { mx = 0; mx = get_cardinality(category, nd); nx = nd[category]; fnx = (nx-2.0)/(nx-1.0); factor = nx; /* 'factor' is f from formula (1), R&D, vol. III, 04/30/2002 */ factor = factor/((factor-1.0)*(factor-1.0)-factor*dkx*dkx); neighbors = malloc((kmax-kmin+1)*nv*sizeof(float)); if (neighbors != (float *) 0) { for (i = 0; i < nv; i++) { jcntr = 0; /* Calculate distances between vector 'i' and all vectors in 'category' except 'icx'. */ for (j = 0; j < nd[category]; j++) { if (j != icx-mx) { distance = distances[i][mx+j]; /* original distance between i and j */ if ((r_mode == 1) || (approximate_l_mode == 1)) dist[jcntr] = distance; else { distance = distance*distance; fvec_diff(x[i], x[j+mx], xdiff, d, (int *) 0); /* 'dot' is 'x' in formula (2) of 05/01/2002, R&D, vol. III. */ dot = fvec_dot(xdiff, xmult, d, (int *) 0); dist[jcntr] = fnx*(distance+factor*dot*dot); dist[jcntr] = sqrt(dist[jcntr]); } jcntr++; } } finsort(dist, nd[category]-1); idx = i; for (j = kmin-1; j < kmax; j++) { neighbors[idx] = dist[j]; idx += nv; } } } vx_free(xdiff); } vx_free(dist); } if ((neighbors == (float *) 0) && (fdbg != (FILE *) 0)) { err_msg = strerror(errno); fprintf(fdbg, "recalculate_class_neighbors(); malloc() failure; %s\n", err_msg); } return neighbors; } /* Compute determinant of leave-one-out covariance matrix estimate, by excluding vector 'icx', for matrix characterized by 'mdet', 'edet' (determinant = mdet*2^edet). 'd' is number of features, 'nc' number of vectors in the class whose determinant is being reestimated, 'dm' distance from 'icx' to mean vector of its' class. Returns log2 of the updated determinant, so the actual determinant is 2**return_value (mantisa is set to 1.0). Based on formula (7.67) from Fukunaga. */ static float recalculate_determinant(float mdet, float edet, int d, int nc, int icx, float dm, int approximate_l_mode) { float efactor; float fx; float det; efactor = log(2.0); fx = 1.0-nc*dm*dm/((nc-1.0)*(nc-1.0)); det = log(mdet)+edet*efactor+d*log((nc-1.0)/(nc-2.0))+log(fx); det = det/efactor; if ((r_mode == 1) || (approximate_l_mode == 1)) det = edet+log(mdet)/efactor; return det; } static float calculate_llr(int d, int nv1, int nv2, float d1, float d2, float mdet1, float edet1, float mdet2, float edet2) { float llr; float constant_term; constant_term = nv1; constant_term = log(constant_term/nv2); constant_term += 0.5*(log(mdet1/mdet2)+(edet1-edet2)*log(2.0)); if ((d1 != 0) && (d2 != 0)) llr = d*log(d1/d2)+constant_term; else if (d1 != 0) llr = FLT_MAX; else llr = -FLT_MAX; return llr; } /* Determine more likely class between 'c1' and 'c2' of cardinalities 'nv1', 'nv2', given decision threshold 't', distances 'd1' and 'd2' and determinants 'mdet1', 'edet1', 'mdet2', 'edet2', for a 'd'-dimensional vector characterized by distances 'd1' and 'd2'. 'd1' is distance between the vector being classified and k-th nearest neighbor from class 1. 'd2' is distance beween the vector and k-th nearest neighbor from class 2. Based on formula (7.5) from Fukunaga. Returns 'c1' or 'c2'. */ static int assign_class(int d, int c1, int c2, int nv1, int nv2, float d1, float d2, float mdet1, float edet1, float mdet2, float edet2, float t) { int aclass; float llr; llr = calculate_llr(d, nv1, nv2, d1, d2, mdet1, edet1, mdet2, edet2); aclass = c1; if (llr > t) aclass = c2; return aclass; } /* Return 0 if 'index' and 'lix' index the same vector, otherwise return 1. 'index' range is from 0..nc1+nc2-1, 'lix' from 0..nc1-1 if 'category' equals CLASS_C1, 0..nc2-1 if 'category' equals CLASS_C2. */ static int cmp_index(int index, int lix, int category, int nc1, int nc2) { int check = 1; if (r_mode != 1) if (lix != IDX_NONE) { if ((category == CLASS_C1) && (index < nc1)) { if (index == lix) check = 0; } else if ((category == CLASS_C2) && (index >= nc1)) { if (index-nc1 == lix) check = 0; } } return check; } /* Find the optimal threshold for deciding between classes c1 and c2, characterized by covariance matrix determinants 'mdet1', 'edet1' and 'mdet2', 'edet2'. 'c1_c1_neighbors', 'c2_c1_neighbors' and 'c1_c2_neighbors', 'c2_c2_neighbors' are distances to k-th nearest neighbor from class c1 and c2, respectively, for all nc1+nc2 'd'-dimensional vectors. The length of 'c1_*' arrays is 'nc1' and the length of 'c2_*' arrays is 'nc2'. The optimal threshold is defined as the log-likelihood value which gives least classification error between the two classes. Leave-one-out functionality added 09/03/2001: if 'lix' != IDX_NONE, omit vector 'lix' from 'category' from optimization. 'lix' range is 0..nc1-1 for 'category' == CLASS_C1, 0..nc2-1 for 'category' == CLASS_C2. In case of success, errno is set to 0; otherwise, it is set to any of the values set by malloc(). */ static float find_optimal_threshold(int d, int nc1, int nc2, float mdet1, float edet1, float mdet2, float edet2, float *c1_c1_neighbors, float *c2_c1_neighbors, float *c1_c2_neighbors, float *c2_c2_neighbors, int lix, int category, FILE *fdbg) { int i; int j; int mc; int cmc; int done; int nix1; int nix2; int icntr; float constant_term; float threshold = 0.0; float ct; float nxct; float d1; float d2; float det; float rcc; char *err_msg; struct LLR *llr; errno = 0; constant_term = nc1; constant_term = log(constant_term/nc2); det = 0.5*(log(mdet1/mdet2)+(edet1-edet2)*log(2.0)); constant_term += det; nix1 = nc1; nix2 = nc2; rcc = 0.0; if (r_mode != 1) { if ((lix != IDX_NONE) && (category == CLASS_C1)) { nix1--; rcc = nc1; rcc = log(nix1/rcc); } if ((lix != IDX_NONE) && (category == CLASS_C2)) { nix2--; rcc = nc2; rcc = log(rcc/nix2); } } constant_term += rcc; icntr = 0; llr = malloc((nix1+nix2)*sizeof(struct LLR)); if (llr != (struct LLR *) 0) { for (i = 0; i < nc1+nc2; i++) { if (cmp_index(i, lix, category, nc1, nc2) != 0) { if (i < nc1) { d1 = c1_c1_neighbors[i]; d2 = c1_c2_neighbors[i]; llr[icntr].category = CLASS_C1; } else { d1 = c2_c1_neighbors[i-nc1]; d2 = c2_c2_neighbors[i-nc1]; llr[icntr].category = CLASS_C2; } if (d2 != 0.0) { if (d1 != 0.0) llr[icntr].llr = d*log(d1/d2)+constant_term; else llr[icntr].llr = -FLT_MAX; } else { if (d1 == 0.0) llr[icntr].llr = constant_term; else llr[icntr].llr = FLT_MAX; } #if 0 if ((fdbg != (FILE *) 0) && (lix == 8) && (category == CLASS_C2)) fprintf(fdbg, "find_optimal_threshold(); lix: %5d; i: %5d; " "d1: %12.5g; d2: %12.5g; det: %12.5g; constant_term: %12.5g\n", lix+1, i, d1, d2, det, constant_term); #endif icntr++; } } qsort(llr, nix1+nix2, sizeof(struct LLR), compare_likelihoods); /* Set threshold to large negative value. Then, everything is classified as c2, hence misclassification count equals nc1. */ threshold = -FLT_MAX; cmc = nix1; mc = nix1; /* Set threshold between each two neighboring log-likelihood ratios in turn, and adjust misclassification count. */ ct = threshold; for (i = 0; i < nix1+nix2-1; i++) { nxct = 0.5*(llr[i].llr+llr[i+1].llr); #if 0 if ((fdbg != (FILE *) 0) && (lix == 8) && (category == CLASS_C2)) fprintf(fdbg, "find_optimal_threshold(); lix: %5d; threshold: %12.5g; nxct: %12.5g; llr[%5d]: " "%12.5g; llr[%5d]: %12.5g\n", lix+1, threshold, nxct, i, llr[i].llr, i+1, llr[i+1].llr); #endif /* Vector i, and all vectors having the same llr as i, may have changed classification from c2 to c1. If that is the case, update error count. */ if (nxct != ct) { done = 0; for (j = i; (j >= 0) && (done == 0); j--) { if (llr[j].llr == llr[i].llr) { if (llr[j].category == CLASS_C1) mc -= 1; else mc += 1; } else done = 1; } } /* If this is the best result so far, update the optimal threshold. */ if (mc < cmc) { cmc = mc; threshold = nxct; } ct = nxct; } free(llr); } else if (fdbg != (FILE *) 0) { err_msg = strerror(errno); fprintf(fdbg, "find_optimal_threshold(); malloc() failure; %s\n", err_msg); } return threshold; } /* Calculate optimal thresholds for all pairs of classes. In case of malloc() error, return (float **) 0 and set errno. */ static float **get_optimal_thresholds(int nv, int d, int c, int *nd, float *det, int kmin, int kmax, float **neighbors, FILE *fdbg) { int status = 0; int length; int k; int c1; int c2; int cix; int cix1; int cix2; int idx; float threshold; float **optimal_thresholds = (float **) 0; length = (c-1)*c; length = length/2; idx = 0; optimal_thresholds = malloc((kmax-kmin+1)*sizeof(float *)); if (optimal_thresholds != (float **) 0) { for (k = 0; (k <= kmax-kmin) && (status == 0); k++) { cix = 0; optimal_thresholds[k] = malloc(length*sizeof(float)); if (optimal_thresholds[k] != (float *) 0) { cix1 = 0; /* cumulative cardinality of classes 0..c1-1 */ for (c1 = 0; c1 < c-1; c1++) { cix2 = cix1+nd[c1]; /* cumulative cardinality of classes 0..c2-1 */ for (c2 = c1+1; c2 < c; c2++) { threshold = find_optimal_threshold(d, nd[c1], nd[c2], det[2*c1], det[2*c1+1], det[2*c2], det[2*c2+1], &neighbors[c1][idx+cix1], &neighbors[c1][idx+cix2], &neighbors[c2][idx+cix1], &neighbors[c2][idx+cix2], IDX_NONE, CLASS_NONE, fdbg); if (fdbg != (FILE *) 0) fprintf(fdbg, "get_optimal_thresholds(); c1: %5d; c2: %5d; " "k: %5d; threshold: %12.5g\n", c1+1, c2+1, k+1, threshold); optimal_thresholds[k][cix++] = threshold; cix2 += nd[c2]; } cix1 += nd[c1]; } } else { status = -1; optimal_thresholds = (float **) mx_free((void **) optimal_thresholds, k); } idx += nv; } } return optimal_thresholds; } static void log_thresholds(FILE *fdbg, char *method, float **optimal_thresholds, int kmin, int kmax, int c) { int cix; int k; int c1; int c2; if (fdbg != (FILE *) 0) { for (k = kmin; k <= kmax; k++) { cix = 0; for (c1 = 0; c1 < c-1; c1++) for (c2 = c1+1; c2 < c; c2++) { fprintf(fdbg, "%s; optimal_thresholds[%d][%d]: %12.5g\n", method, k, cix, optimal_thresholds[k-kmin][cix]); cix++; } } } } /* Calculate k-NN Bayes error leave-one-out estimate for set of 'nv' vectors in 'x'. The vectors are 'd'-dimensional and the number of classes is 'c', with 'nd[i]' vectors in class i and covariance/determinant information in 'sigma' and 'det', respectively. The estimates are calculated for k between 'kmin' and 'kmax', inclusive. 'distances' are pairwise distances between all vectors in 'x'. 'neighbors' are resubstitution neighbors. 'fpn' has FN (false negatives) per class. Send messages to 'outdev', debug information to 'fdbg'. In case of error, return (float *) 0. */ static float *l_estimate(float **x, int nv, int d, int c, int *nd, float **sigma, float *det, int kmin, int kmax, float **mean_distances, float **distances, float **neighbors, int approximate_l_mode, float ***fpn, FILE *outdev, FILE *fdbg) { int ivx; int k; int c1; int c2; int assigned_class = 0; /* assigned class for vector 'i' */ int true_class; /* true class for vector 'i' */ int class_count; /* total number of vectors in classes 0..true_class */ int offx; /* offset of first vector in class true_class */ int idx; int icx1; int icx2; int cix; int rx; float llr; float threshold; float d1; float d2; float edet; float *xmean; float *class_mean = (float *) 0; float *xmult; float *c1_c1_neighbors; float *c1_c2_neighbors; float *c2_c1_neighbors; float *c2_c2_neighbors; float *class_neighbors; float *estimate = (float *) 0; float **xpn; float **optimal_thresholds; hash_t *eliminated_classes; true_class = 0; class_neighbors = neighbors[0]; /* initialize */ class_count = nd[0]; offx = 0; optimal_thresholds = get_optimal_thresholds(nv, d, c, nd, det, kmin, kmax, neighbors, fdbg); if (optimal_thresholds != (float **) 0) { log_thresholds(fdbg, "l_estimate()", optimal_thresholds, kmin, kmax, c); ee_init(c, kmin, kmax, fpn, &estimate); xpn = *fpn; xmean = malloc(d*sizeof(float)); /* store difference X-M; see R&D vol. III, 05/01/2002 */ if (xmean != (float *) 0) { xmult = malloc(d*sizeof(float)); /* store product (X-M)*sigma; see R&D vol. III, 05/01/2002 */ if (xmult != (float *) 0) { class_mean = mest(&x[offx], d, nd[true_class]); for (ivx = 0; (ivx < nv) && (class_neighbors != (float *) 0); ivx++) { if (outdev != (FILE *) 0) fprintf(outdev, "Classifying vector %10d, for all k.\n", ivx+1); if (ivx == class_count) { vx_free(class_mean); offx += nd[true_class]; true_class++; class_mean = mest(&x[offx], d, nd[true_class]); class_count += nd[true_class]; } fvec_diff(x[ivx], class_mean, xmean, d, (int *) 0); fvec_smx(xmean, sigma[true_class], d, xmult); if (r_mode == 1) class_neighbors = neighbors[true_class]; else class_neighbors = recalculate_class_neighbors(kmin, kmax, x, d, nv, distances, xmult, mean_distances[true_class][ivx], true_class, c, nd, ivx, approximate_l_mode, fdbg); if (class_neighbors != (float *) 0) { edet = recalculate_determinant(det[2*true_class], det[2*true_class+1], d, nd[true_class], ivx, mean_distances[true_class][ivx], approximate_l_mode); if (fdbg != (FILE *) 0) fprintf(fdbg, "l_estimate(); true_class: %5d; det: %12.5g; edet: %12.5g; L edet: %12.5g\n", true_class+1, det[2*true_class], det[2*true_class+1], edet); for (k = 0; k <= kmax-kmin; k++) { idx = k*nv; eliminated_classes = hashCreate(); cix = 0; for (c1 = 0; c1 < c-1; c1++) { icx1 = get_cardinality(c1, nd); for (c2 = c1+1; c2 < c; c2++) { if ((hashGetInt(eliminated_classes, c1) == INT_MAX) && (hashGetInt(eliminated_classes, c2) == INT_MAX)) { icx2 = get_cardinality(c2, nd); /* 'threshold' is optimal threshold for classes 'c1', 'c2', excluding vector 'ivx'. */ if (true_class == c1) { c1_c1_neighbors = &(class_neighbors[idx+icx1]); c2_c1_neighbors = &(class_neighbors[idx+icx2]); /* TBD: perhaps precompute and store cardinalities, for minor speedup. 08/29/2001 */ rx = k*nv+get_cardinality(c1, nd); c1_c2_neighbors = &neighbors[c2][rx]; rx = k*nv+get_cardinality(c2, nd); c2_c2_neighbors = &neighbors[c2][rx]; threshold = find_optimal_threshold(d, nd[c1], nd[c2], 1.0, edet, det[2*c2], det[2*c2+1], c1_c1_neighbors, c2_c1_neighbors, c1_c2_neighbors, c2_c2_neighbors, ivx-get_cardinality(true_class, nd), CLASS_C1, fdbg); d1 = c1_c1_neighbors[ivx-icx1]; d2 = c1_c2_neighbors[ivx-icx1]; if ((r_mode == 1) || (approximate_l_mode == 1)) assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2, 1.0, edet, det[2*c2], det[2*c2+1], threshold); else assigned_class = assign_class(d, c1, c2, nd[c1]-1, nd[c2], d1, d2, 1.0, edet, det[2*c2], det[2*c2+1], threshold); } else if (true_class == c2) { c1_c2_neighbors = &(class_neighbors[idx+icx1]); c2_c2_neighbors = &(class_neighbors[idx+icx2]); rx = k*nv+get_cardinality(c1, nd); c1_c1_neighbors = &neighbors[c1][rx]; rx = k*nv+get_cardinality(c2, nd); c2_c1_neighbors = &neighbors[c1][rx]; threshold = find_optimal_threshold(d, nd[c1], nd[c2], det[2*c1], det[2*c1+1], 1.0, edet, c1_c1_neighbors, c2_c1_neighbors, c1_c2_neighbors, c2_c2_neighbors, ivx-get_cardinality(true_class, nd), CLASS_C2, fdbg); d1 = c2_c1_neighbors[ivx-icx2]; d2 = c2_c2_neighbors[ivx-icx2]; if ((r_mode == 1) || (approximate_l_mode == 1)) assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2, det[2*c1], det[2*c1+1], 1.0, edet, threshold); else assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2]-1, d1, d2, det[2*c1], det[2*c1+1], 1.0, edet, threshold); } else { /* If 'ivx' does not belong to either of the two classes being classified against, then use the optimized threshold value. */ d1 = neighbors[c1][idx+ivx]; d2 = neighbors[c2][idx+ivx]; threshold = optimal_thresholds[k][cix]; assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2, det[2*c1], det[2*c1+1], det[2*c2], det[2*c2+1], threshold); } if (fdbg != (FILE *) 0) { llr = calculate_llr(d, nd[c1], nd[c2], d1, d2, det[2*c1], det[2*c1+1], det[2*c2], det[2*c2+1]); fprintf(fdbg, "l_estimate(); ivx: %4d; class: %3d; " "c1: %3d; c2: %3d; k: %5d; d1: %10.5g; d2: %10.5g; " "llr: %11.5g; thd: %11.5g; ac: %3d\n", ivx+1, true_class+1, c1+1, c2+1, k+1, d1, d2, llr, threshold, assigned_class+1); } if (assigned_class == c1) hashPutInt(eliminated_classes, c2, c2); else hashPutInt(eliminated_classes, c1, c1); } cix++; } } if (assigned_class != true_class) { estimate[k] += 1.0; xpn[true_class][k] += 1.0; if (fdbg != (FILE *) 0) fprintf(fdbg, "l_estimate(); misclassified ivx: %5d; k: %5d; " "true_class: %3d; assigned_class: %3d\n", ivx+1, k+1, true_class+1, assigned_class+1); } hashDelete(eliminated_classes); } if (r_mode != 1) free(class_neighbors); } else estimate = (float *) 0; } mx_free((void **) optimal_thresholds, kmax-kmin+1); vx_free(xmult); vx_free(class_mean); } } vx_free(xmean); } return estimate; } /* Resubstitution Bayes error estimate. The method uses threshold optimization as described in Fukunaga, section 7.4, paragraph 'The threshold for non-normal distributions'. */ static float *r_estimate(float **x, int nv, int d, int c, int *nd, float *det, int kmin, int kmax, float **distances, float **neighbors, float ***fpn, FILE *outdev, FILE *fdbg) { int i; int k; int c1; int c2; int assigned_class = 0; /* assigned class for vector 'i' */ int true_class; /* true class for vector 'i' */ int class_count; /* total number of vectors in classes 0..true_class */ int cix; int idx; float threshold; float d1; float d2; float llr; float *estimate; float **xpn; float **optimal_thresholds; hash_t *eliminated_classes; if (outdev != (FILE *) 0) fprintf(outdev, "Calculating resubstitution error..."); true_class = 0; class_count = nd[0]; /* Calculate optimal threshold for each pair of classes. */ optimal_thresholds = get_optimal_thresholds(nv, d, c, nd, det, kmin, kmax, neighbors, fdbg); log_thresholds(fdbg, "r_estimate()", optimal_thresholds, kmin, kmax, c); ee_init(c, kmin, kmax, fpn, &estimate); xpn = *fpn; for (i = 0; i < nv; i++) { if (i == class_count) class_count += nd[++true_class]; for (k = 0; k <= kmax-kmin; k++) { eliminated_classes = hashCreate(); idx = k*nv+i; cix = 0; for (c1 = 0; c1 < c-1; c1++) for (c2 = c1+1; c2 < c; c2++) { if ((hashGetInt(eliminated_classes, c1) == INT_MAX) && (hashGetInt(eliminated_classes, c2) == INT_MAX)) { d1 = neighbors[c1][idx]; d2 = neighbors[c2][idx]; threshold = optimal_thresholds[k][cix]; assigned_class = assign_class(d, c1, c2, nd[c1], nd[c2], d1, d2, det[2*c1], det[2*c1+1], det[2*c2], det[2*c2+1], threshold); if (fdbg != (FILE *) 0) { llr = calculate_llr(d, nd[c1], nd[c2], d1, d2, det[2*c1], det[2*c1+1], det[2*c2], det[2*c2+1]); fprintf(fdbg, "r_estimate(); ivx: %4d; class: %3d; c1: %3d; c2: %3d; " "k: %4d; d1: %10.5g; d2: %10.5g; det: %10.5g; edet: %10.5g; " "llr: %11.5g; thd: %10.5g; ac: %3d\n", i+1, true_class+1, c1+1, c2+1, k+1, d1, d2, det[2*true_class], det[2*true_class+1], llr, threshold, assigned_class+1); } if (assigned_class == c1) hashPutInt(eliminated_classes, c2, c2); else hashPutInt(eliminated_classes, c1, c1); } cix++; } if (assigned_class != true_class) { estimate[k] += 1.0; xpn[true_class][k] += 1.0; if (fdbg != (FILE *) 0) fprintf(fdbg, "r_estimate(); misclassified ivx: %5d; k: %5d; " "true_class: %3d; assigned_class: %3d\n", i+1, k+1, true_class+1, assigned_class+1); } hashDelete(eliminated_classes); } } for (k = 0; k <= kmax-kmin; k++) free(optimal_thresholds[k]); free(optimal_thresholds); if (outdev != (FILE *) 0) fprintf(outdev, " done.\n"); return estimate; } /* Save Bayes error estimate 'bayes_error' and misclassification rate per class 'fpn', produced by 'method', for k between 'kmin' and 'kmax' in 'fname'. Return 0 for success, -1 in case of failure and set errno. */ static int save_estimate(char *method, float *bayes_error, float **fpn, int c, int kmin, int kmax, char *fname) { int retval = 0; int j; int k; FILE *fptr; /* If file exists, append to it, otherwise create it. */ fptr = fopen(fname, "a"); if (fptr == (FILE *) 0) retval = -1; else { for (k = 0; k < kmax-kmin+1; k++) { fprintf(fptr, "%s\t%5d\t%10.2f", method, k+kmin, bayes_error[k]); for (j = 0; j < c; j++) fprintf(fptr, "\t%10.2f", fpn[j][k]); fprintf(fptr, "\n"); } fclose(fptr); } return retval; } /* Launch Bayes error estimation for TDS. 'errc' is output error code. If 'debug' is not 0, send debugging information to 'pcp.dbg' file. For 'approximate_l_mode' == 1, use approximate L mode. */ void p_bayes(int *errc, int approximate_l_mode, int debug) { int i; int j; int kmin; int kmax; int status; int idx; int knn; int dflt; int mink; int maxk; float factor; char *fname; char *msg; float *det; float *bayes_l; float *bayes_r; float **distances; float **mean_distances; float **neighbors; float **fpn; FILE *outdev; FILE *fdbg; unsigned int seed; clear_screen(); fname = (char *) 0; if ((r_mode == 1) && (approximate_l_mode == 1)) r_mode = 0; *errc = 0; fdbg = (FILE *) 0; if (debug) fdbg = fopen(PCP_DBG, "a"); outdev = stdout; dflt = 1; mink = 1; maxk = ivec_min(tds->nd, tds->c)-1; cursor_on(); kmin = input_integer(stdin, outdev, " First nearest neighbor [1]:", PCP_QLEN, &dflt, &mink, &maxk); mink = kmin; msg = malloc((PCP_QLEN+1)*sizeof(char)); snprintf(msg, PCP_QLEN+1, " Last nearest neighbor [%d]:", maxk); dflt = maxk; kmax = input_integer(stdin, outdev, msg, PCP_QLEN, &dflt, &mink, &maxk); free(msg); seed = input_seed(stdin, outdev); cursor_off(); inverse_video(); knn = kmax-kmin+1; status = dataset_sigma(tds, errc); if (!status) { det = malloc(2*tds->c*sizeof(float)); if (det) { factor = log(2.0); idx = 0; for (i = 0; i < tds->c; i++) { det[idx] = tds->det[idx]; idx++; det[idx] = tds->det[idx]/factor; idx++; } distances = calculate_distances(tds->x, tds->nv, tds->d, tds->c, tds->nd, tds->sigma, outdev); neighbors = calculate_neighbors(distances, tds->nv, kmin, kmax, tds->c, tds->nd, outdev, fdbg); mean_distances = calculate_mean_distances(tds->x, tds->nv, tds->d, tds->c, tds->nd, tds->sigma); bayes_r = r_estimate(tds->x, tds->nv, tds->d, tds->c, tds->nd, det, kmin, kmax, distances, neighbors, &fpn, outdev, fdbg); /* 'fpn' is class-conditional error rate, calculated as FN/CC (false negatives/class cardinality), according to equation (2.1) from B. D. Ripley, 'Pattern Recognition and Neural Networks', Cambridge University Press, Cambridge, 1996, section 2.1. */ if (bayes_r) { for (i = 0; i < knn; i++) { bayes_r[i] = 100.0*(bayes_r[i]/tds->nv); for (j = 0; j < tds->c; j++) fpn[j][i] = 100.0*(fpn[j][i]/tds->nd[j]); } } fname = strdup(PCP_BEE); unlink(fname); status = save_estimate(RESUBSTITUTION, bayes_r, fpn, tds->c, kmin, kmax, fname); ee_free(tds->c, fpn, bayes_r); bayes_l = l_estimate(tds->x, tds->nv, tds->d, tds->c, tds->nd, tds->sigma, det, kmin, kmax, mean_distances, distances, neighbors, approximate_l_mode, &fpn, outdev, fdbg); if (bayes_l) { for (i = 0; i < knn; i++) { bayes_l[i] = 100.0*(bayes_l[i]/tds->nv); for (j = 0; j < tds->c; j++) fpn[j][i] = 100.0*(fpn[j][i]/tds->nd[j]); } status = save_estimate(LEAVE_ONE_OUT, bayes_l, fpn, tds->c, kmin, kmax, fname); ee_free(tds->c, fpn, bayes_l); } else *errc = 101; /* TBD: what is this error code? */ vx_free(det); } else *errc = errno; } free(fname); } static int p_disp_alloc(int nl, float **rval, float **lval) { int status = 0; *rval = malloc(nl*sizeof(float)); if (rval) { *lval = malloc(nl*sizeof(float)); if (!lval) { status = -1; vx_free(*rval); } } else status = -1; return status; } /* Display Bayes error estimation results. */ void p_disp_bayes(int *errc, char **xname) { int i; int nl; int llen; int cntr; int rcntr; int lcntr; int status; int kmin; char *fname; char *line; float *rval; float *lval; char **tokens; FILE *fptr; status = 0; clear_screen(); cursor_on(); fname = input_filename(PCP_UMSG_FILENAME, PCP_BEE, stdout); inverse_video(); kmin = 0; nl = file_info(fname, &llen, (int *) 0, '\0'); rcntr = 0; lcntr = 0; cntr = 0; if (nl >= 0) { line = malloc((llen+2)*sizeof(char)); status = p_disp_alloc(nl, &rval, &lval); if (!status) { fptr = fopen(fname, "r"); if (fptr) { while (fgets(line, llen+2, fptr)) { tokens = str_tokenize(line, WHITESPACE); if (cntr == 0) kmin = atoi(tokens[1]); if (!strcmp(tokens[0], RESUBSTITUTION)) rval[rcntr++] = atof(tokens[2]); else if (!strcmp(tokens[0], LEAVE_ONE_OUT)) lval[lcntr++] = atof(tokens[2]); str_free(tokens); cntr++; } free(line); status = fclose(fptr); if (status == -1) { *errc = errno; *xname = strdup(fname); } else { if (lcntr == rcntr) { printf("%s", PCP_XLINE); printf("%s", PCP_EMPTY_LINE); printf("| B A Y E S E R R O R E S T I M A T I O N |\n"); printf("%s", PCP_EMPTY_LINE); printf("%s", PCP_XLINE); printf("| k | Leave-one-out error estimate | Resubstitution error estimate |\n"); printf("%s", PCP_XLINE); for (i = 0; i < rcntr; i++) { printf("| %10d | %5.2f | %5.2f |\n", kmin+i, lval[i], rval[i]); } printf("%s", PCP_XLINE); pwait(); } else *errc = PERR_BAD_INPUT_FILE; } } else { *errc = errno; *xname = strdup(fname); } vx_free(rval); vx_free(lval); } else *errc = errno; } else { *errc = errno; *xname = strdup(fname); } free(fname); } /* Calculate leave-one-out k-NN estimate of Bayes error for set of 'nv' vectors in 'x'. The vectors are 'd'-dimensional and the number of classes is 'c', with 'nd[i]' vectors in class i. 'sigma[i]' is covariance matrix of class i in SSM. 'det[i]' is natural log of the determinant of the covariance matrix. The estimates are calculated for k between 'kmin' and 'kmax', inclusive. Send messages to 'outdev', debug information to 'fdbg'. In case of error, return (float *) 0 and set 'errc'. */ float *L_error(float **x, int nv, int d, int c, int *nd, float **sigma, float *det, int kmin, int kmax, FILE *outdev, FILE *fdbg, int *errc) { int i; int status; int idx; int approximate_l_mode = 1; float efactor; float *det2; float *bayes_l = (float *) 0; float **distances; float **mean_distances; float **neighbors; float **fpn; status = 0; /* Convert 'det' to the format required by l_estimate(). */ if (status == 0) { det2 = malloc(c*2*sizeof(float)); if (det2 != (float *) 0) { idx = 0; efactor = 1.0/log(2.0); for (i = 0; i < c; i++) { det2[idx++] = 1.0; det2[idx++] = det[i]*efactor; } } else status = -1; } if (status == 0) { distances = calculate_distances(x, nv, d, c, nd, sigma, outdev); if (distances != (float **) 0) { neighbors = calculate_neighbors(distances, nv, kmin, kmax, c, nd, outdev, fdbg); if (neighbors != (float **) 0) { mean_distances = calculate_mean_distances(x, nv, d, c, nd, sigma); if (mean_distances != (float **) 0) { bayes_l = l_estimate(x, nv, d, c, nd, sigma, det2, kmin, kmax, mean_distances, distances, neighbors, approximate_l_mode, &fpn, outdev, fdbg); if (bayes_l != (float *) 0) { for (i = 0; i < kmax-kmin+1; i++) bayes_l[i] = 100.0*(bayes_l[i]/nv); } mx_free((void **) mean_distances, c); } else *errc = errno; mx_free((void **) neighbors, c); } else *errc = errno; mx_free((void **) distances, nv); } else *errc = errno; } else *errc = errno; free(det2); return bayes_l; } /* Calculate inverted covariance matrices of `c' classes in `x' in SSM format. */ float **calc_sig(float **x, int c, int d, int *nd, float **det, int *errc) { int status; int i; int offset; float dsign; float dexp; float **sig; float **sigma; float **inverse; /* inverted covariance matrix for one class */ float *x_det; /* array of natural log of determinants */ status = 0; sig = (float **) 0; x_det = malloc(c*sizeof(float)); if (x_det) { sig = calloc(c, sizeof(float *)); if (sig) { /* Estimate and invert covariance matrices in the transformed space. */ offset = 0; for (i = 0; (i < c) && !status; i++) { sigma = cest(&x[offset], d, nd[i], COVARIANCE_MATRIX); /* Invert. */ inverse = fmx_inv(sigma, d, &dsign, &dexp, errc); mx_free((void **) sigma, d); if (!inverse) { status = -1; sig = (float **) mx_free((void **) sig, d); } else { /* Convert 'sigma' to SSM, as required by L_error(). */ sig[i] = fmx_ssm(inverse, d); mx_free((void **) inverse, d); if (!sig[i]) { sig = (float **) mx_free((void **) sig, i); status = -1; *errc = errno; } else x_det[i] = dexp; } offset += nd[i]; } if (!status && det) *det = x_det; } } return sig; }