/*
  File name: knn.c
  Created by: Ljubomir Buturovic
  Created: 12/31/2002
  Purpose: implementation of k-NN classification methods.
*/

/*
  Copyright 2004 Ljubomir J. Buturovic

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation files
  (the "Software"), to deal in the Software without restriction,
  including without limitation the rights to use, copy, modify, merge,
  publish, distribute, sublicense, and/or sell copies of the Software,
  and to permit persons to whom the Software is furnished to do so,
  subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
*/

static char rcsid[] = "$Id: knn.c,v 1.47 2006/01/23 02:20:51 ljubomir Exp $";

#include <stdlib.h>
#include <math.h>
#include <errno.h>
#include "pcp.h"
#include "knn.h"
#include "lmat.h"
#include "heap.h"
#include "pau.h"
#include "lau.h"
#include "bagging.h"
#include "xpar.h"

/*
  Calculate Mahalanobis distance between 'd'-dimensional vectors 'x'
  and 'y' using matrix 'ax' in IMSL Symmetric Storage Mode. 'mxv' and
  'mxe' are work vectors of length 'd'. Must be preallocated.
*/
float mahalanobis(float *x, float *y, int d, float *ax, float *mxv, float *mxe)
{
  int   i;
  float distance;
  double dsum;
  
  for (i = 0; i < d; i++)
    mxv[i] = x[i]-y[i];
  fvec_smx(mxv, ax, d, mxe);
  dsum = 0.0;
  for (i = 0; i < d; i++)
    dsum += mxv[i]*mxe[i];
  distance = sqrt(dsum);
  return distance;
}

/*
  Calculate distance between 'vector1', 'vector2', using distance
  measure 'dist'.
*/
static float distance(float *vector1, float *vector2, int d, int dist, float *matrix,
		      float *mxv, float *mxe)
{
  int    i;
  double sum;
  double dot;
  float  dst;

  dst = 0.0;
  sum = 0.0;
  if (dist == EUCLIDEAN_DIST)
    {
      for (i = 0; i < d; i++)
	{
	  dot = vector1[i]-vector2[i];
	  sum += dot*dot;
	}
      dst = sum;
    }
  else if (dist == MAHALANOBIS_DIST)
    dst = mahalanobis(vector1, vector2, d, matrix, mxv, mxe);
  if (dist == EUCLIDEAN_DIST)
    dst = sqrt(dst);
  return dst;
}

/*
  Return indexes of 'k' nearest neighbors to `vector' using distance
  measure 'dist'.

  In case of failure, return NULL and set 'errc'. The possible errors
  are EINVAL for bad arguments, memory allocation errors and
  LERR_SINGULAR for attempted inversion of singular matrix. The last
  error can occur only if 'dist' is MAHALANOBIS_DIST.
*/
static int *nearest_neighbors(float *vector, int category, struct dataset *dset,
			      int dist, int k, int *errc, FILE *fdbg)
{
  int   status;
  int   class_count;
  int   j;
  int   offset;
  int   icl;
  int   itx;
  int   idx;
  float dsign;
  float dexp;
  float dmin;
  int   *isx;
  int   *index;
  float *mxv = (float *) 0;
  float *mxe = (float *) 0;
  float *matrix;
  float *distances;
  float **cov;
  float **inverse;
  float **sigma;

  isx = (int *) 0;
  matrix = (float *) 0;
  status = 0;
  if (errc)
    {
      if (dist == MAHALANOBIS_DIST)
	{
	  /*
	    Calculate inverse covariance matrices, if needed.
	  */
	  if (!dset->sigma)
	    {
	      sigma = malloc(dset->c*sizeof(float *));
	      offset = 0;
	      for (j = 0; j < dset->c && !status; j++)
		{
		  cov = cest(&dset->x[offset], dset->d, dset->nd[j], COVARIANCE_MATRIX);
		  if (cov)
		    {
		      inverse = fmx_inv(cov, dset->d, &dsign, &dexp, errc);
		      mx_free((void **) cov, dset->d);
		      if (inverse)
			{
			  sigma[j] = fmx_ssm(inverse, dset->d);
			  offset += dset->nd[j];
			  if (!sigma[j])
			    {
			      status = -1;
			      *errc = errno;
			    }
			}
		      else
			status = -1;
		    }
		  else
		    {
		      status = -1;
		      *errc = errno;
		    }
		}
	      if (!status)
		dset->sigma = sigma;
	      else
		dset->sigma = vx_free(dset->sigma);
	    }
	  if (!status)
	    {
	      mxv = malloc(dset->d*sizeof(float));
	      if (mxv)
		{
		  mxe = malloc(dset->d*sizeof(float));
		  if (!mxe)
		    {
		      status = -1;
		      *errc = errno;
		    }
		}
	      else
		{
		  status = -1;
		  *errc = errno;
		}
	    }
	}
      /*
	Calculate distances.
      */
      if (!status)
	{
	  distances = malloc(dset->nv*sizeof(float));
	  if (distances)
	    {
	      icl = 0;
	      class_count = dset->nd[0];
	      if (dist == MAHALANOBIS_DIST)
		matrix = dset->sigma[icl];
	      for (j = 0; j < dset->nv; j++)
		{
		  distances[j] = distance(vector, dset->x[j], dset->d, dist, matrix, mxv, mxe);
		  if (fdbg)
		    {
		      fprintf(fdbg, "knn(); class %d test vector:\t", category);
		      for (itx = 0; itx < dset->d; itx++)
			fprintf(fdbg, "%f ", vector[itx]);
		      fprintf(fdbg, "; training vector %d:\t", j);
		      for (itx = 0; itx < dset->d; itx++)
			fprintf(fdbg, "%f ", dset->x[j][itx]);
		      fprintf(fdbg, "; distance: %f\n", distances[j]);
		    }
		  if ((dist == MAHALANOBIS_DIST) && (j == class_count-1))
		    {
		      icl++;
		      class_count += dset->nd[icl];
		      matrix = dset->sigma[icl];
		    }
		}
	      if (k > 1)
		{
		  /*
		    Sort distances.
		  */
		  status = fheap(distances, dset->nv, &index);
		  if (!status)
		    {
		      isx = malloc(dset->nv*sizeof(int));
		      if (isx)
			{
			  for (j = 0; j < dset->nv; j++)
			    {
			      idx = index[j];
			      isx[idx] = j;
			    }
			  free(index);
			}
		      else
			{
			  status = -1;
			  *errc = errno;
			}
		    }
		}
	      else
		{
		  /*
		    k == 1, no need to sort.
		  */
		  isx = malloc(sizeof(int));
		  dmin = distances[0];
		  isx[0] = 0;
		  for (j = 1; j < dset->nv; j++)
		    {
		      if (distances[j] < dmin)
			{
			  dmin = distances[j];
			  isx[0] = j;
			}
		    }
		}
	      vx_free(distances);
	    }
	  else
	    {
	      status = -1;
	      *errc = errno;
	    }
	}
    }
  return isx;
}

/*
  Classify 'vector' from 'category' using k-NN method, 'dset' and
  distance measure 'dist'. Return the assigned class. Break ties
  pseudo-randomly.

  NOTE: `category' is only used for logging, not computation.

  In case of failure, return -1 and set 'errc'.
*/
int knn(float *vector, int category, struct dataset *dset, int k, int dist, 
	int *errc, FILE *fdbg)
{
  int   j;
  int   idx;
  int   assigned_class;
  int   status;
  int   icl;
  int   *isx;
  int   *cc;

  status = 0;
  /*
    Get nearest neighbors.
  */
  cc = (int *) 0;
  isx = nearest_neighbors(vector, category, dset, dist, k, errc, fdbg);
  if (isx)
    {
      /*
	Classify input vector.
	
	Count number of votes (neighbors) from each class.
      */
      cc = calloc(dset->c, sizeof(int));
      if (cc)
	{
	  for (j = 0; j < k; j++)
	    {
	      idx = isx[j];
	      icl = dataset_class(idx, dset->c, dset->nd);
	      cc[icl]++;
	    }
	}
      else
	status = -1;
      vx_free(isx);
    }
  else
    status = -1;
  if (status == -1)
    assigned_class = -1;
  else
    assigned_class = ivec_rand_argmax(cc, dset->c);
  vx_free(cc);
  return assigned_class;
}

/*
  Classify 'vector' using bagging k-NN method with 'nmodels', 'dset'
  and distance measure 'dist'. Return the assigned class. Ties are
  resolved pseudo-randomly.
  
  In case of success, set errc to 0, otherwise to errno.
*/
int knn_bagging(float *vector, int tcl, struct dataset *dset, int nmodels, 
		int bag_size, int k, int dist, int *errc, FILE *fdbg)
{
  int    i;
  int    status;
  int    nvec;
  int    icl;
  int    assigned_class;
  int    *bnd;
  int    *cc = (int *) 0;
  float  **bag = (float **) 0;
  struct dataset *bag_set;

  *errc = 0;
  status = 0;
  assigned_class = PCP_UNASSIGNED;
  cc = calloc(tds->c, sizeof(int));
  bag = malloc(bag_size*sizeof(float *));
  bnd = calloc(dset->c, sizeof(int));
  if (cc && bag && bnd)
    {
      for (i = 0; (i < nmodels) && !status; i++)
	{
	  nvec = resample(i, dset, bag_size, bag, bnd, fdbg);
	  bag_set = dataset_lt(dset->d, dset->c, bnd, nvec, (char **) 0, bag);
	  icl = knn(vector, tcl, bag_set, k, dist, errc, fdbg);
	  if (icl == -1)
	    status = -1;
	  else
	    cc[icl]++;
	  free(bag_set);
	}
    }
  else
    {
      status = -1;
      *errc = errno;
    }
  if (!status)
    assigned_class = ivec_rand_argmax(cc, dset->c);
  vx_free(cc);
  vx_free(bag);
  vx_free(bnd);
  return assigned_class;
}

/*
  Classify test set using k-NN method, display and save results.
*/
void p_knn(int *errc, char **xname, int *dbg)
{
  int  i;
  int  j;
  int  k;
  int  idx;
  int  icl;
  int  tcum;
  int  tcl;
  int  dist;
  int  min_range;
  int  max_range;
  int  verbose;
  int  *isx;
  int  *cc;
  char *output_fname;
  FILE *outdev;
  FILE *fdbg = (FILE *) 0;
  FILE *fptr;

  if (tds)
    {
      if (*dbg > 0)
	fdbg = fopen(PCP_DBG, "a");
      outdev = stdout;
      max_range = ivec_min(tds->nd, tds->c);
      input_knn(outdev, (int *) 0, 0, (int *) 0, &k, max_range, (unsigned int *) 0, 
		(char **) 0, &dist);
      min_range = 0;
      max_range = 1;
      verbose = input_integer(stdin, outdev, OUTPUT_MSG, PCP_QLEN, 
			      &min_range, &min_range, &max_range);
      tcum = teds->nd[0]; /* cumulative cardinality of classes 0..tcl */
      tcl = 0;
      output_fname = strdup(PCP_RCL);
      fptr = fopen(output_fname, "w");
      if (fptr)
	{
	  cc = calloc(tds->c, sizeof(int));
	  if (cc)
	    {
	      teds->prediction = malloc(teds->nv*sizeof(int));
	      if (teds->prediction)
		{
		  for (i = 0; i < teds->nv; i++)
		    {
		      if (i == tcum)
			tcum += teds->nd[++tcl];
		      isx = nearest_neighbors(teds->x[i], tcl, tds, dist, k, errc, fdbg);
		      if (isx)
			{
			  ivec_set(cc, tds->c, 0);
			  for (j = 0; j < k; j++)
			    {
			      idx = isx[j];
			      icl = dataset_class(idx, tds->c, tds->nd);
			      cc[icl]++;
			    }
			  teds->prediction[i] = ivec_rand_argmax(cc, tds->c);
			}
		      vx_free(isx);
		    }
		}
	      /*
		Store predictions in PCP_RCL.
	      */
	      for (i = 0; i < teds->nv; i++)
		write_rcl(fptr, i, teds, tds);
	      if (!fclose(fptr))
		{
		  predict_disp(teds, verbose, PALG_KNN);
		  pwait();
		  vx_free(teds->prediction);
		  vx_free(cc);
		}
	      else
		{
		  *errc = errno;
		  *xname = strdup(output_fname);
		}
	    }
	}
      else
	{
	  *errc = errno;
	  *xname = strdup(output_fname);
	}
    }
  else
    *errc = PERR_UNDEFINED_TDS;
}

/*
  Accept input parameters and pass them to the k-NN bagging function.
  In case of error, set 'errc'. If error is file access error, set
  'xname'.
*/
void p_knn_bagging(int *errc, char **xname, int *dbg)
{
  int   i;
  int   tcl;
  int   k;
  int   status;
  int   dist;
  int   nmodels;
  int   bag_size;
  int   min_range;
  int   max_range;
  int   verbose;
  int   icl;
  char  *output_fname;
  unsigned int seed;
  FILE  *outdev;
  FILE  *fdbg = (FILE *) 0;
  FILE  *fptr;

  status = 0;
  tcl = 0;
  seed = 1; /* TBD: fixed seed. */
  /*
    TBD: decided to fix bag_size to 'nvec' in November 2002. Perhaps
    revisit.
  */
  bag_size = tds->nv;
  outdev = stdout;
  fflush(outdev);
  if (*dbg > 0)
    fdbg = fopen(PCP_DBG, "a");
  teds->prediction = malloc(teds->nv*sizeof(int));
  if (teds->prediction)
    {
      input_knn(outdev, (int *) 0, 0, (int *) 0, &k, -1, (unsigned int *) 0, (char **) 0, &dist);
      nmodels = input_nmodels(stdin, outdev);
      min_range = 0;
      max_range = 1;
      verbose = input_integer(stdin, outdev, OUTPUT_MSG, PCP_QLEN, 
			      &min_range, &min_range, &max_range);
      output_fname = strdup(PCP_RCL);
      fptr = fopen(output_fname, "w");
      if (fptr)
	{
	  inverse_video();
	  srand(seed);
	  for (i = 0; i < teds->nv; i++)
	    {
	      icl = knn_bagging(teds->x[i], tcl, tds, nmodels, bag_size, k, dist,
				errc, fdbg);
	      teds->prediction[i] = icl;
	    }
	  /*
	    Store predictions in PCP_RCL.
	  */
	  for (i = 0; i < teds->nv; i++)
	    write_rcl(fptr, i, teds, tds);
	  if (!fclose(fptr))
	    {
	      predict_disp(teds, verbose, PALG_KNN);
	      pwait();
	    }
	  else
	    {
	      *errc = errno;
	      *xname = strdup(output_fname);
	    }
	}
      else
	{
	  *errc = errno;
	  *xname = strdup(output_fname);
	}
    }
  else
    *errc = errno;
}

/*
  k-NN model selection. The function chooses optimal number of nearest
  neighbors.
*/
void pcp_knn_xpar(int *errc, int dbg, char **xname)
{
  int   status;
  int   i;
  int   kmin;
  int   kmax;
  int   kstep;
  int   ndim;
  int   iter;
  float fval;
  float grid_point[2];
  struct xpar_crit_parameters *xpar_parameters;
  struct knn_options *knn_options;
  FILE  *fdbg;

  ndim = 1;
  if (dbg > 0)
    fdbg = fopen(PCP_DBG, "a");
  else
    fdbg = (FILE *) 0;
  status = 0;
  clear_screen();
  cursor_on();
  xpar_parameters = init_spar(PALG_KNN, tds, fdbg);
  knn_options = calloc(1, sizeof(struct knn_options));
  knn_options->dist = input_knn_dist(stdout);
  xpar_parameters->options = knn_options;
  input_xpar(tds, xpar_parameters);
  input_xpar_knn(tds, xpar_parameters->nxval, &kmin, &kmax, &kstep);
  /*
    Compute optimal feature transformations/subsets once, reuse them
    later.
  */
  if (xpar_parameters->dr_method != PDR_NONE)
    status = compute_dr(xpar_parameters, xpar_parameters->dr_method, 
			xpar_parameters->idr, xpar_parameters->fscrit, fdbg, errc);
  iter = 1;
  unlink(PCP_MSL);
  for (i = kmin; i <= kmax && !*errc && !status; i += kstep)
    {
      grid_point[0] = i;
      fval = xpar_func(grid_point, ndim, iter, xpar_parameters, errc);
      if (!*errc)
	{
	  viprint_line(5, 1, "Current k: %10d; error rate: %7.2f%%.", (int) grid_point[0], fval);
	  viprint_line(5, 1, "Optimal k: %10d; error rate: %7.2f%%.", (int) xpar_parameters->x1, 
		       xpar_parameters->eval);
	  printf("\n");
	  iter++;
	}
    }
  /*
    We have the optimal k - now apply to the test set, if defined.
  */
  if (!*errc && !status)
    {
      if (teds)
	{
	  viprint_line(2, 1, "Applying the optimal classifier to the test dataset...");
	  xtest_optimal(xpar_parameters, errc, xname);
	}
      else
	pwait();
    }
  xpar_free(xpar_parameters);
}



syntax highlighted by Code2HTML, v. 0.9.1