/* File name: knn.c Created by: Ljubomir Buturovic Created: 12/31/2002 Purpose: implementation of k-NN classification methods. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ static char rcsid[] = "$Id: knn.c,v 1.47 2006/01/23 02:20:51 ljubomir Exp $"; #include #include #include #include "pcp.h" #include "knn.h" #include "lmat.h" #include "heap.h" #include "pau.h" #include "lau.h" #include "bagging.h" #include "xpar.h" /* Calculate Mahalanobis distance between 'd'-dimensional vectors 'x' and 'y' using matrix 'ax' in IMSL Symmetric Storage Mode. 'mxv' and 'mxe' are work vectors of length 'd'. Must be preallocated. */ float mahalanobis(float *x, float *y, int d, float *ax, float *mxv, float *mxe) { int i; float distance; double dsum; for (i = 0; i < d; i++) mxv[i] = x[i]-y[i]; fvec_smx(mxv, ax, d, mxe); dsum = 0.0; for (i = 0; i < d; i++) dsum += mxv[i]*mxe[i]; distance = sqrt(dsum); return distance; } /* Calculate distance between 'vector1', 'vector2', using distance measure 'dist'. */ static float distance(float *vector1, float *vector2, int d, int dist, float *matrix, float *mxv, float *mxe) { int i; double sum; double dot; float dst; dst = 0.0; sum = 0.0; if (dist == EUCLIDEAN_DIST) { for (i = 0; i < d; i++) { dot = vector1[i]-vector2[i]; sum += dot*dot; } dst = sum; } else if (dist == MAHALANOBIS_DIST) dst = mahalanobis(vector1, vector2, d, matrix, mxv, mxe); if (dist == EUCLIDEAN_DIST) dst = sqrt(dst); return dst; } /* Return indexes of 'k' nearest neighbors to `vector' using distance measure 'dist'. In case of failure, return NULL and set 'errc'. The possible errors are EINVAL for bad arguments, memory allocation errors and LERR_SINGULAR for attempted inversion of singular matrix. The last error can occur only if 'dist' is MAHALANOBIS_DIST. */ static int *nearest_neighbors(float *vector, int category, struct dataset *dset, int dist, int k, int *errc, FILE *fdbg) { int status; int class_count; int j; int offset; int icl; int itx; int idx; float dsign; float dexp; float dmin; int *isx; int *index; float *mxv = (float *) 0; float *mxe = (float *) 0; float *matrix; float *distances; float **cov; float **inverse; float **sigma; isx = (int *) 0; matrix = (float *) 0; status = 0; if (errc) { if (dist == MAHALANOBIS_DIST) { /* Calculate inverse covariance matrices, if needed. */ if (!dset->sigma) { sigma = malloc(dset->c*sizeof(float *)); offset = 0; for (j = 0; j < dset->c && !status; j++) { cov = cest(&dset->x[offset], dset->d, dset->nd[j], COVARIANCE_MATRIX); if (cov) { inverse = fmx_inv(cov, dset->d, &dsign, &dexp, errc); mx_free((void **) cov, dset->d); if (inverse) { sigma[j] = fmx_ssm(inverse, dset->d); offset += dset->nd[j]; if (!sigma[j]) { status = -1; *errc = errno; } } else status = -1; } else { status = -1; *errc = errno; } } if (!status) dset->sigma = sigma; else dset->sigma = vx_free(dset->sigma); } if (!status) { mxv = malloc(dset->d*sizeof(float)); if (mxv) { mxe = malloc(dset->d*sizeof(float)); if (!mxe) { status = -1; *errc = errno; } } else { status = -1; *errc = errno; } } } /* Calculate distances. */ if (!status) { distances = malloc(dset->nv*sizeof(float)); if (distances) { icl = 0; class_count = dset->nd[0]; if (dist == MAHALANOBIS_DIST) matrix = dset->sigma[icl]; for (j = 0; j < dset->nv; j++) { distances[j] = distance(vector, dset->x[j], dset->d, dist, matrix, mxv, mxe); if (fdbg) { fprintf(fdbg, "knn(); class %d test vector:\t", category); for (itx = 0; itx < dset->d; itx++) fprintf(fdbg, "%f ", vector[itx]); fprintf(fdbg, "; training vector %d:\t", j); for (itx = 0; itx < dset->d; itx++) fprintf(fdbg, "%f ", dset->x[j][itx]); fprintf(fdbg, "; distance: %f\n", distances[j]); } if ((dist == MAHALANOBIS_DIST) && (j == class_count-1)) { icl++; class_count += dset->nd[icl]; matrix = dset->sigma[icl]; } } if (k > 1) { /* Sort distances. */ status = fheap(distances, dset->nv, &index); if (!status) { isx = malloc(dset->nv*sizeof(int)); if (isx) { for (j = 0; j < dset->nv; j++) { idx = index[j]; isx[idx] = j; } free(index); } else { status = -1; *errc = errno; } } } else { /* k == 1, no need to sort. */ isx = malloc(sizeof(int)); dmin = distances[0]; isx[0] = 0; for (j = 1; j < dset->nv; j++) { if (distances[j] < dmin) { dmin = distances[j]; isx[0] = j; } } } vx_free(distances); } else { status = -1; *errc = errno; } } } return isx; } /* Classify 'vector' from 'category' using k-NN method, 'dset' and distance measure 'dist'. Return the assigned class. Break ties pseudo-randomly. NOTE: `category' is only used for logging, not computation. In case of failure, return -1 and set 'errc'. */ int knn(float *vector, int category, struct dataset *dset, int k, int dist, int *errc, FILE *fdbg) { int j; int idx; int assigned_class; int status; int icl; int *isx; int *cc; status = 0; /* Get nearest neighbors. */ cc = (int *) 0; isx = nearest_neighbors(vector, category, dset, dist, k, errc, fdbg); if (isx) { /* Classify input vector. Count number of votes (neighbors) from each class. */ cc = calloc(dset->c, sizeof(int)); if (cc) { for (j = 0; j < k; j++) { idx = isx[j]; icl = dataset_class(idx, dset->c, dset->nd); cc[icl]++; } } else status = -1; vx_free(isx); } else status = -1; if (status == -1) assigned_class = -1; else assigned_class = ivec_rand_argmax(cc, dset->c); vx_free(cc); return assigned_class; } /* Classify 'vector' using bagging k-NN method with 'nmodels', 'dset' and distance measure 'dist'. Return the assigned class. Ties are resolved pseudo-randomly. In case of success, set errc to 0, otherwise to errno. */ int knn_bagging(float *vector, int tcl, struct dataset *dset, int nmodels, int bag_size, int k, int dist, int *errc, FILE *fdbg) { int i; int status; int nvec; int icl; int assigned_class; int *bnd; int *cc = (int *) 0; float **bag = (float **) 0; struct dataset *bag_set; *errc = 0; status = 0; assigned_class = PCP_UNASSIGNED; cc = calloc(tds->c, sizeof(int)); bag = malloc(bag_size*sizeof(float *)); bnd = calloc(dset->c, sizeof(int)); if (cc && bag && bnd) { for (i = 0; (i < nmodels) && !status; i++) { nvec = resample(i, dset, bag_size, bag, bnd, fdbg); bag_set = dataset_lt(dset->d, dset->c, bnd, nvec, (char **) 0, bag); icl = knn(vector, tcl, bag_set, k, dist, errc, fdbg); if (icl == -1) status = -1; else cc[icl]++; free(bag_set); } } else { status = -1; *errc = errno; } if (!status) assigned_class = ivec_rand_argmax(cc, dset->c); vx_free(cc); vx_free(bag); vx_free(bnd); return assigned_class; } /* Classify test set using k-NN method, display and save results. */ void p_knn(int *errc, char **xname, int *dbg) { int i; int j; int k; int idx; int icl; int tcum; int tcl; int dist; int min_range; int max_range; int verbose; int *isx; int *cc; char *output_fname; FILE *outdev; FILE *fdbg = (FILE *) 0; FILE *fptr; if (tds) { if (*dbg > 0) fdbg = fopen(PCP_DBG, "a"); outdev = stdout; max_range = ivec_min(tds->nd, tds->c); input_knn(outdev, (int *) 0, 0, (int *) 0, &k, max_range, (unsigned int *) 0, (char **) 0, &dist); min_range = 0; max_range = 1; verbose = input_integer(stdin, outdev, OUTPUT_MSG, PCP_QLEN, &min_range, &min_range, &max_range); tcum = teds->nd[0]; /* cumulative cardinality of classes 0..tcl */ tcl = 0; output_fname = strdup(PCP_RCL); fptr = fopen(output_fname, "w"); if (fptr) { cc = calloc(tds->c, sizeof(int)); if (cc) { teds->prediction = malloc(teds->nv*sizeof(int)); if (teds->prediction) { for (i = 0; i < teds->nv; i++) { if (i == tcum) tcum += teds->nd[++tcl]; isx = nearest_neighbors(teds->x[i], tcl, tds, dist, k, errc, fdbg); if (isx) { ivec_set(cc, tds->c, 0); for (j = 0; j < k; j++) { idx = isx[j]; icl = dataset_class(idx, tds->c, tds->nd); cc[icl]++; } teds->prediction[i] = ivec_rand_argmax(cc, tds->c); } vx_free(isx); } } /* Store predictions in PCP_RCL. */ for (i = 0; i < teds->nv; i++) write_rcl(fptr, i, teds, tds); if (!fclose(fptr)) { predict_disp(teds, verbose, PALG_KNN); pwait(); vx_free(teds->prediction); vx_free(cc); } else { *errc = errno; *xname = strdup(output_fname); } } } else { *errc = errno; *xname = strdup(output_fname); } } else *errc = PERR_UNDEFINED_TDS; } /* Accept input parameters and pass them to the k-NN bagging function. In case of error, set 'errc'. If error is file access error, set 'xname'. */ void p_knn_bagging(int *errc, char **xname, int *dbg) { int i; int tcl; int k; int status; int dist; int nmodels; int bag_size; int min_range; int max_range; int verbose; int icl; char *output_fname; unsigned int seed; FILE *outdev; FILE *fdbg = (FILE *) 0; FILE *fptr; status = 0; tcl = 0; seed = 1; /* TBD: fixed seed. */ /* TBD: decided to fix bag_size to 'nvec' in November 2002. Perhaps revisit. */ bag_size = tds->nv; outdev = stdout; fflush(outdev); if (*dbg > 0) fdbg = fopen(PCP_DBG, "a"); teds->prediction = malloc(teds->nv*sizeof(int)); if (teds->prediction) { input_knn(outdev, (int *) 0, 0, (int *) 0, &k, -1, (unsigned int *) 0, (char **) 0, &dist); nmodels = input_nmodels(stdin, outdev); min_range = 0; max_range = 1; verbose = input_integer(stdin, outdev, OUTPUT_MSG, PCP_QLEN, &min_range, &min_range, &max_range); output_fname = strdup(PCP_RCL); fptr = fopen(output_fname, "w"); if (fptr) { inverse_video(); srand(seed); for (i = 0; i < teds->nv; i++) { icl = knn_bagging(teds->x[i], tcl, tds, nmodels, bag_size, k, dist, errc, fdbg); teds->prediction[i] = icl; } /* Store predictions in PCP_RCL. */ for (i = 0; i < teds->nv; i++) write_rcl(fptr, i, teds, tds); if (!fclose(fptr)) { predict_disp(teds, verbose, PALG_KNN); pwait(); } else { *errc = errno; *xname = strdup(output_fname); } } else { *errc = errno; *xname = strdup(output_fname); } } else *errc = errno; } /* k-NN model selection. The function chooses optimal number of nearest neighbors. */ void pcp_knn_xpar(int *errc, int dbg, char **xname) { int status; int i; int kmin; int kmax; int kstep; int ndim; int iter; float fval; float grid_point[2]; struct xpar_crit_parameters *xpar_parameters; struct knn_options *knn_options; FILE *fdbg; ndim = 1; if (dbg > 0) fdbg = fopen(PCP_DBG, "a"); else fdbg = (FILE *) 0; status = 0; clear_screen(); cursor_on(); xpar_parameters = init_spar(PALG_KNN, tds, fdbg); knn_options = calloc(1, sizeof(struct knn_options)); knn_options->dist = input_knn_dist(stdout); xpar_parameters->options = knn_options; input_xpar(tds, xpar_parameters); input_xpar_knn(tds, xpar_parameters->nxval, &kmin, &kmax, &kstep); /* Compute optimal feature transformations/subsets once, reuse them later. */ if (xpar_parameters->dr_method != PDR_NONE) status = compute_dr(xpar_parameters, xpar_parameters->dr_method, xpar_parameters->idr, xpar_parameters->fscrit, fdbg, errc); iter = 1; unlink(PCP_MSL); for (i = kmin; i <= kmax && !*errc && !status; i += kstep) { grid_point[0] = i; fval = xpar_func(grid_point, ndim, iter, xpar_parameters, errc); if (!*errc) { viprint_line(5, 1, "Current k: %10d; error rate: %7.2f%%.", (int) grid_point[0], fval); viprint_line(5, 1, "Optimal k: %10d; error rate: %7.2f%%.", (int) xpar_parameters->x1, xpar_parameters->eval); printf("\n"); iter++; } } /* We have the optimal k - now apply to the test set, if defined. */ if (!*errc && !status) { if (teds) { viprint_line(2, 1, "Applying the optimal classifier to the test dataset..."); xtest_optimal(xpar_parameters, errc, xname); } else pwait(); } xpar_free(xpar_parameters); }