/*
  File name: fselect.c
  Created by: Ljubomir Buturovic
  Created: 01/10/2003
  Purpose: feature selection functions.
*/

/*
  Copyright 2004-2005 Ljubomir J. Buturovic, Sasha Jaksic

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation files
  (the "Software"), to deal in the Software without restriction,
  including without limitation the rights to use, copy, modify, merge,
  publish, distribute, sublicense, and/or sell copies of the Software,
  and to permit persons to whom the Software is furnished to do so,
  subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <math.h>
#include "pcp.h"
#include "pau.h"
#include "lau.h"
#include "lmat.h"
#include "heap.h"
#include "estimate_error.h"
#include "fselect.h"
#include "xlearn.h"

static char rcsid[] = "$Id: fselect.c,v 1.73 2005/11/22 06:17:00 ljubomir Exp $";

#define FSELECT_LINE_WIDTH   100

#define ODD(x)  (x&1)

/*
  Calculate mean of feature 'idx' in matrix 'x'.
*/
static float calc_mean(float **x, int nv, int idx)
{
  int    i;
  float  mean;
  double dsum = 0.0;

  for (i = 0; i < nv; i++)
      dsum += x[i][idx];
  mean = dsum/nv;
  return mean;
}

/*
  Calculate standard deviation of feature 'idx' in matrix 'x', given
  'mean'.
*/
static float calc_sd(float **x, int nv, int idx, float mean)
{
  int    i;
  float  sd;
  double diff;
  double dsum = 0.0;

  for (i = 0; i < nv; i++)
    {
      diff = x[i][idx]-mean;
      dsum += diff*diff;
    }
  sd = sqrt(dsum/(nv-1));
  return sd;
}

/*
  Calculate Pearson correlation coefficient between 'ivec' and feature
  'idx' in matrix 'x'.
*/
static double calc_pearson(int *ivec, float **x, int nv, int idx)
{
  int    i;
  double sxx;
  double syy;
  double sxy;
  double meanx;
  double meany;
  double pearson;
  double dsum1;
  double dsum2;

  dsum1 = 0.0;
  dsum2 = 0.0;
  for (i = 0; i < nv; i++)
    {
      dsum1 += ivec[i]*ivec[i];
      dsum2 += ivec[i];
    }
  meanx = dsum2/nv;
  sxx = dsum1-nv*meanx*meanx;
  dsum1 = 0.0;
  dsum2 = 0.0;
  for (i = 0; i < nv; i++)
    {
      dsum1 += x[i][idx]*x[i][idx];
      dsum2 += x[i][idx];
    }
  meany = dsum2/nv;
  syy = dsum1-nv*meany*meany;
  dsum1 = 0.0;
  for (i = 0; i < nv; i++)
    dsum1 += ivec[i]*x[i][idx];
  sxy = dsum1-nv*meanx*meany;
  pearson = sxy/sqrt(sxx*syy);
  return pearson;
}

/*
  Return ranking of features in 'dset' according to criterion
  `fscrit'.  The available feature evaluation criteria are defined in
  pcp.h.

  The returned array has sorted criterion values for all input
  features (best comes first; thus array[0] is the best feature,
  array[1] next best, etc.).  Output array 'index' contains the
  original indexes of these features.

  In case of failure, return NULL and set errno.  
*/
static float *rank_features(struct dataset *dset, int fscrit, int **index)
{
  int    i;
  int    j;
  int    status;
  int    offset;
  int    idx;
  int    kmin;
  int    kmax;
  int    nn;
  int    errc;
  float  mean1;
  float  mean2;
  float  sd1;
  float  sd2;
  double dsum1;
  double dsum2;
  int    *isx;
  float  *det;
  float  **x;
  float  *ranking;
  float  **sig;
  float  *bayes_l;
  unsigned int seed;

  status = 0;
  ranking = malloc(dset->d*sizeof(float));
  if (ranking)
    {
      if (fscrit == PCP_FSEL_EUCLIDEAN)
	{
	  for (i = 0; i < dset->d; i++)
	    {
	      dsum2 = 0.0;
	      for (j = 0; j < dset->nv; j++)
		{
		  dsum1 = dset->x[j][i]-dset->label[j];
		  dsum2 += dsum1*dsum1;
		}
	      ranking[i] = sqrt(dsum2);
	    }
	}
      else if (fscrit == PCP_FSEL_GOLUB)
	{
	  /*
	    NOTE: this method only works for two classes. It uses
	    'neighborhood analysis' described in Golub, T.R. et al.,
	    'Molecular Classification of Cancer: Class Discovery and
	    Class Prediction by Gene Expression Monitoring'. Science,
	    vol. 286, 15 October 1999, pp. 531-537.
	  */
	  for (i = 0; i < dset->d; i++)
	    {
	      offset = dset->nd[0];
	      mean1 = calc_mean(dset->x, dset->nd[0], i);
	      mean2 = calc_mean(&dset->x[offset], dset->nd[1], i);
	      sd1 = calc_sd(dset->x, dset->nd[0], i, mean1);
	      sd2 = calc_sd(&dset->x[offset], dset->nd[1], i, mean2);
	      ranking[i] = -fabs((mean1-mean2)/(sd1+sd2));
	    }
	}
      else if (fscrit == PCP_FSEL_PEARSON)
	/*
	  This criterion computes Pearson correlation coefficient
	  between a vector of feature values, for all input vectors,
	  and the corresponding class labels. The class labels go from
	  0 to c-1.

	  The criterion often provides very good performance. However,
	  there are still some open questions regarding this
	  criterion.  Namely, the class labels are arbitrary
	  integers. We happen to use [0..c-1], but that is arbitrary
	  choice. The value of the criterion depends on this arbitrary
	  assignment of the class labels. It is unclear how much using
	  different class labels would affect classification
	  performance of this criterion. This is an important issue
	  which should be investigated.
	*/
	for (i = 0; i < dset->d; i++)
	  ranking[i] = -fabs(calc_pearson(dset->label, dset->x, dset->nv, i));
      else if (fscrit == PCP_FSEL_BAYES)
	for (i = 0; (i < dset->d) && !status; i++)
	  {
	    x = fmx_alloc(dset->nv, 1);
	    for (j = 0; j < dset->nv; j++)
	      x[j][0] = dset->x[j][i];
	    seed = 1;
	    kmin = 1;
	    kmax = ivec_min(dset->nd, dset->c)-1;
	    nn = kmax-kmin+1;
	    sig = calc_sig(x, dset->c, 1, dset->nd, &det, &errc);
	    bayes_l = L_error(x, dset->nv, 1, dset->c, dset->nd, 
			      sig, det, kmin, kmax, (FILE *) 0, (FILE *) 0, &errc);
	    if (bayes_l)
	      ranking[i] = fvec_sum(bayes_l, nn)/nn;
	    else
	      {
		status = -1;
		errno = errc;
	      }
	  }
      if (!status)
	{
	  status = fheap(ranking, dset->d, index);
	  isx = malloc(dset->d*sizeof(int));
	  if (isx)
	    {
	      for (i = 0; i < dset->d; i++)
		{
		  idx = (*index)[i];
		  isx[idx] = i;
		}
	      ivec_copy(*index, isx, dset->d);
	      free(isx);
	    }
	  else
	    status = -1;
	  if (status == -1)
	    ranking = vx_free(ranking);
	  else if ((fscrit == PCP_FSEL_GOLUB) || (fscrit == PCP_FSEL_PEARSON))
	    {
	      /*
		Restore sign for similarity-like measures.
	      */
	      for (i = 0; i < dset->d; i++)
		ranking[i] = -ranking[i];
	    }
	}
    }
  return ranking;
}

/*
  Select optimal feature subset in TDS. The function collects
  parameters from the user (output file name and criterion), computes
  the optimal feature subset, and saves it in the output file.
*/
void p_fselect(int *errc, char **xname, int dbg)
{
  int    i;
  int    fscrit;
  int    nfeat;
  int    dr_method;
  double val_crit;
  int    *index;
  char   *fname;
  float  *ranking;
  char   **names;
  FILE   *outdev;
  FILE   *fptr;

  if (tds)
    {
      *errc = 0;
      clear_screen();
      outdev = stdout;
      cursor_on();
      dr_method = input_fsel(outdev);
      if (dr_method == PDR_RANKING)
	{
	  fname = input_filename(PCP_UMSG_FNAME_RNK, PCP_RNK, outdev);
	  fscrit = input_fscrit(outdev, dr_method, tds->c);
	  if ((ranking = rank_features(tds, fscrit, &index)) == (float *) 0)
	    *errc = errno;
	  else
	    {
	      fptr = fopen(fname, "w");
	      if (fptr)
		{
		  names = tds->alab;
		  for (i = 0; i < tds->d; i++)
		    {
		      fprintf(fptr, "%d\t%f", index[i]+1, ranking[i]);
		      if (names)
			fprintf(fptr, "\t%s", names[index[i]]);
		      fprintf(fptr, "\n");
		    }
		  if (fclose(fptr))
		    {
		      *errc = errno;
		      *xname = strdup(fname);
		    }
		}
	      else
		{
		  *errc = errno;
		  *xname = strdup(fname);
		}
	    }
	}
      else
	{
	  fname = input_filename(PCP_UMSG_FNAME_SEL, PCP_SET, outdev);
	  fscrit = input_fscrit(outdev, dr_method, tds->c);
	  nfeat = input_nfeat(stdin, stdout, tds->d, PCP_DFLT_NFSEL);
	  if (dr_method == PDR_FORWARD)
	    index = forward_select(tds, nfeat, fscrit, 1, &val_crit, errc);
	  else if(dr_method == PDR_BACKWARD)
	    index = backward_select(tds, nfeat, fscrit, 1, &val_crit, errc);
	  else if(dr_method == PDR_FLOAT_FORWARD)
	    {
	      int real_featcount;
	      index = float_fwd_select(tds, nfeat, fscrit, &val_crit, 
				       &real_featcount, errc);
	      nfeat = real_featcount;
	    }
	  if (index)
	    {
	      fptr = fopen(fname, "w");
	      if (fptr)
		{
		  names = tds->alab;
		  for (i = 0; i < nfeat; i++)
		    {
		      fprintf(fptr, "%d\t%f", index[i]+1, val_crit);
		      if (names)
			fprintf(fptr, "\t%s", names[index[i]]);
		      fprintf(fptr, "\n");
		    }
		  if (fclose(fptr))
		    {
		      *errc = errno;
		      *xname = strdup(fname);
		    }
		}
	      else
		{
		  *errc = errno;
		  *xname = strdup(fname);
		}
	    }
	}
    }
  else
    *errc = PERR_UNDEFINED_TDS;
}

static void fsel_display(int *index, float *ranking, int d)
{
  int   i;
  int   idx;
  char  *msg;
  char  *name;
  char  **names;

  clear_screen();
  inverse_video();
  cursor_off();
  printf(PCP_WLINE);
  printf(PCP_EMPTY_WLINE);
  printf("|                                 S E L E C T E D  F E A T U R E S                                 |\n");
  printf(PCP_EMPTY_WLINE);
  printf(PCP_WLINE);
  names = tds->alab;
  msg = malloc((PCP_WLEN+1)*sizeof(char));
  for (i = 0; i < d; i++)
    {
      idx = index[i];
      if (names)
	{
	  name = strdup(names[idx]);
	  if (strlen(name) > 74)
	    name[74] = '\0';
	  sprintf(msg, "| %6d | %10.5f | %-74s |", index[i]+1, ranking[i], name);
	  free(name);
	}
      else
	sprintf(msg, "| %6d | %10.5f | unnamed feature                                                            |",
		index[i], ranking[i]);
      print_line(stdout, msg, PCP_WLEN);
    }
  inverse_video();
  printf(PCP_WLINE);
  free(msg);
  reset_video();
  pwait();
}

/*
  Extract top 'd' ranking features from the feature ranking file
  'fname' into 'index'.  'frel' is the feature relevance (i.e., the
  quantity upon which the ranking is based upon).
*/
static int read_ranking(char *fname, int d, int fmax, int **index, float **frel, int *errc, char **xname)
{
  int   i;
  int   idx;
  int   imax;
  int   dnx;
  int   status;
  int   fscrit;
  int   *ifx;
  float *rx;
  char  *line;
  char  **tokens;
  FILE  *fptr;

  fscrit = 0;
  ifx = (int *) 0;
  rx = (float *) 0;
  /*
    Read indexes.
  */
  status = 0;
  idx = file_info(fname, &imax, &dnx, '\0');
  if ((idx != -1) && (dnx >= 2))
    {
      if (d == 0)
	d = idx;
      ifx = malloc(idx*sizeof(int));
      if (frel)
	rx = malloc(idx*sizeof(float));
      fptr = fopen(fname, "r");
      if (fptr)
	{
	  line = malloc(imax+2);
	  i = 0;
	  while (fgets(line, imax+2, fptr) && (i < idx) && !status)
	    {
	      tokens = str_tokenize(line, WHITESPACE);
	      ifx[i] = atoi(tokens[0])-1;
	      if (ifx[i]+1 > fmax)
		{
		  status = -1;
		  *errc = PERR_ILLEGAL_RNK_FILE;
		  *xname = strdup(fname);
		}
	      else
		{
		  if (frel)
		    rx[i] = atof(tokens[1]);
		  i++;
		  str_free(tokens);
		}
	    }
	  free(line);
	  free(fname);
	  if (!status)
	    status = fclose(fptr);
	  else
	    fclose(fptr);
	}
      else
	{
	  status = -1;
	  *errc = errno;
	  *xname = strdup(fname);
	}
    }
  else if (!idx)
    {
      status = -2;
      *errc = PERR_BAD_INPUT_FILE;
      *xname = strdup(fname);
    }
  else
    {
      status = -1;
      *errc = errno;
      *xname = strdup(fname);
    }
  if (!status)
    {
      *index = ifx;
      if (frel)
	*frel = rx;
    }
  return status;
}

/*
  Display the ranking of features in a user-defined file.
*/
void p_fdisp(int *errc, char **xname, int dbg)
{
  int   d;
  int   status;
  int   min_range;
  int   imax;
  int   *index;
  char  *fname;
  float *frel;
  FILE  *outdev;

  status = 0;
  *errc = 0;
  clear_screen();
  outdev = stdout;
  cursor_on();
  fname = input_filename(PCP_UMSG_FNAME_RNK, PCP_RNK, outdev);
  min_range = 1;
  imax = file_info(fname, (int *) 0, (int *) 0, '\0');
  if (imax != -1)
    {
      d = input_nfeat(stdin, outdev, imax, PCP_DFLT_NFSEL);
      status = read_ranking(fname, d, tds->d, &index, &frel, errc, xname);
      if (!status)
	fsel_display(index, frel, d);
    }
  else
    {
      *errc = errno;
      *xname = strdup(fname);
    }
}

  
/*
  Extract and save a user-defined subset of features from current data
  sets. Optionally, replace the current data sets with the subset.
*/
void p_f_subset(int *errc, char **xname, int dbg)
{
  int   d;
  int   min_range;
  int   imax;
  int   int_default;
  int   status;
  int   mode;
  int   *index;
  char  *msg;
  char  *fname;
  float *frel;
  struct dataset *dset1 = (struct dataset *) 0;
  struct dataset *dset2 = (struct dataset *) 0;
  FILE  *outdev;

  *errc = 0;
  if (tds)
    {
      status = 0;
      clear_screen();
      outdev = stdout;
      cursor_on();
      msg = malloc((PCP_QLEN+1)*sizeof(char));
      fname = input_filename(PCP_UMSG_FNAME_RNK_2, PCP_RNK, outdev);
      mode = input_replace(&dset1, &dset2, "sel");
      min_range = 1;
      int_default = 2;
      imax = tds->d;
      sprintf(msg, PCP_UMSG_NSEL, int_default);
      d = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &imax);
      free(msg);
      status = read_ranking(fname, d, imax, &index, &frel, errc, xname);
      if (!status)
	{
	  /*
	    Input file read. Now subselect dset1, dset2.
	  */
	  if (dset1)
	    status = dataset_inset(dset1, index, d);
	  if (!status)
	    {
	      status = dataset_inset(dset2, index, d);
	      if (!status)
		status = save_datasets(dset1, dset2, mode, errc, xname);
	      vx_free(index);
	      vx_free(frel);
	    }
	  else
	    *errc = errno;
	}
      else if (status == -1)
	{
	  *errc = errno;
	  *xname = strdup(fname);
	}
    }
  else
    *errc = PERR_UNDEFINED_TDS;
}

/* subtracts vec from mat row and multiplies result with transposed self
repeats for each row accumulating sum in result matrix. If multipliers is
non-null multiplies result of each row iteration with corresponding multipliers
element. Non-null multiplier must have rows elements. Result matrix needs to be 
pre-alocated as cols by cols and initialized if desired */

static int fmx_sum_sq_sub(float** mat, float* vec, int rows, int cols, float** result,
			  float* multipliers)
{
  float* tmp_vec = (float*) malloc(sizeof(float) * cols);
  int retval = -1;
  if(tmp_vec)
    {
      int i, j, k;
      for(i = 0; i != rows; i++)
	{
	  for(j = 0; j != cols; j++)
	    tmp_vec[j] = mat[i][j] - vec[j];
	  if(multipliers != NULL) /*pulled outside of loop to save time*/
	    for(j = 0; j != cols; j++)
	      {
		result[j][j] += tmp_vec[j] * tmp_vec[j] * multipliers[i];
		for(k = j+1; k != cols; k++)
		  {
		    float tres = tmp_vec[j] * tmp_vec[k] * multipliers[i];
		    result[j][k] += tres;
		    result[k][j] += tres;
		  }
	      }
	  else /*same thing but without multipliers*/
	    for(j = 0; j != cols; j++)
	      {
		result[j][j] += tmp_vec[j] * tmp_vec[j];
		for(k = j+1; k != cols; k++)
		  {
		    float tres = tmp_vec[j] * tmp_vec[k];
		    result[j][k] += tres;
		    result[k][j] += tres;
		  }
	      }
	}
      free(tmp_vec);
      retval = 0;
    }
  return retval;
}

/*
  Compute the value of `select_crit' discrimination criterion for
  subset of `nfeat' features in `dset'. The subset is defined by
  `index'.

  All criteria implemented in this function have to satisfy: the
  larger the value of the criterion, the better.
  
  In case of success, set *errc to 0; otherwise, set error code in
  *errc.
*/
static double eval_crit(struct dataset *dset, int nfeat, int *index, int select_crit, int *errc)
{
  int     i;
  int     j;
  int     idx;
  int     kmin;
  int     kmax;
  int     nn;
  int     x_offset;
  int     nxval;
  double  crit;
  float   *det;
  float   **x;
  float   **sig;
  float** scat_b;
  float** scat_w;
  float   *bayes_l;
  float*  cardinalities;
  float** inv_w = NULL;
  float** int_intra = NULL;
  struct dataset *xset;
  unsigned int seed;

  crit = 0.0;
  if (errc)
    *errc = 0;
  if (select_crit == PCP_FSUB_KNN)
    {
      xset = dataset_select(dset, index, nfeat);
      if (xset)
	{
	  /*
	    NOTE: nexp fixed to 1. nxval set to min(smallest class,
	    5).
	  */
	  nxval = 5;
	  i = ivec_min(xset->nd, xset->c);
	  if (i < nxval)
	    nxval = i;
	  crit = xlearn_machine(xset, PALG_KNN, 1, nxval, errc);
	  dataset_free(xset);
	  crit = 100-crit;
	}
      else
	*errc = errno;
    }
  else
    {
      x = fmx_alloc(dset->nv, nfeat);
      if (x)
	{
	  for (i = 0; i < nfeat; i++)
	    {
	      idx = index[i];
	      for (j = 0; j < dset->nv; j++)
		x[j][i] = dset->x[j][idx];
	    }
	  if (select_crit == PCP_FSUB_BAYES)
	    {
	      seed = 1;
	      kmin = 1;
	      kmax = ivec_min(dset->nd, dset->c)-1;
	      nn = kmax-kmin+1;
	      sig = calc_sig(x, dset->c, nfeat, dset->nd, &det, errc);
	      if (sig)
		{
		  bayes_l = L_error(x, dset->nv, nfeat, dset->c, dset->nd, 
				    sig, det, kmin, kmax, (FILE *) 0, (FILE *) 0, errc);
		  if (bayes_l)
		    crit = 100-fvec_sum(bayes_l, nn)/nn;
		}
	    }
	  else if (select_crit == PCP_FSUB_IN_IN)
	    {
	      float** class_avgs = fmx_alloc(dset->c, nfeat);
	      float* set_avg = (float*) calloc(nfeat, sizeof(float));
	      int curr_vec, i, j, k;
	      if(class_avgs && set_avg)
		{
		  fmx_set(class_avgs, dset->c, nfeat, 0.);
		  curr_vec = 0;
		  for(i = 0; i != dset->c; i++)
		    {
		      for(j = 0; j != (dset->nd)[i]; j++)
			{
			  for(k = 0; k != nfeat; k++)
			    {
			      class_avgs[i][k] += x[curr_vec][k];
			      set_avg[k] += x[curr_vec][k];
			    }
			  curr_vec++;
			}
		      for(k = 0; k != nfeat; k++)
			class_avgs[i][k] /= (dset->nd)[i];
		    }
		  for(k = 0; k != nfeat; k++)
		    set_avg[k] /= curr_vec;
		  scat_b = fmx_alloc(nfeat, nfeat);
		  scat_w = NULL;
		  if(scat_b)
		    {
		      fmx_set(scat_b, nfeat, nfeat, 0.);
		      cardinalities = (float*) malloc(sizeof(float) * dset->c);
		      for(i = 0; i != dset->c; i++)
			cardinalities[i] = (dset->nd)[i];
		      fmx_sum_sq_sub(class_avgs, set_avg, dset->c, nfeat, scat_b, 
				     cardinalities);
		      free(cardinalities);
		      scat_w = fmx_alloc(nfeat, nfeat);
		    }
		  if(scat_w)
		    {
		      fmx_set(scat_w, nfeat, nfeat, 0.);
		      x_offset = 0;
		      for(i = 0; i != dset->c; i++)
			{
			  fmx_sum_sq_sub(x+x_offset, class_avgs[i], (dset->nd)[i], nfeat,
					 scat_w, NULL);
			  x_offset += (dset->nd)[i];
			}
		      for(i = 0; i != nfeat; i++)
			for(j = 0; j != nfeat; j++)
			  {
			    scat_b[i][j] /= (float)x_offset;
			    scat_w[i][j] /= (float)x_offset;
			  }
		      
		      inv_w = fmx_inv(scat_w, nfeat, (float*) NULL, (float*)NULL, errc);
		      mx_free((void **) scat_w, nfeat);
		    }
		  int_intra = NULL;
		  if (inv_w)
		    {
		      int_intra = fmx_mult(inv_w, nfeat, nfeat, scat_b, nfeat, 0);
		      if(int_intra)
			{
			  crit = 0;
			  for(i = 0; i != nfeat; i++)
			    crit += int_intra[i][i];
			  mx_free((void **) int_intra, nfeat);
			}
		      else if(errc)
			*errc = errno;
		    }
		  mx_free((void **) inv_w, nfeat);
		  mx_free((void **) scat_b, nfeat);
		  free(set_avg);
		  mx_free((void **) class_avgs, dset->c);
		}
	      else if(errc)
		*errc = errno;
	    }
	  mx_free((void **) x, dset->nv);
	}
      else if (errc)
	*errc = errno;
    }
  return crit;
}

/* Allocates bit set of size bits
*/
static unsigned char* alloc_bitset(int size)
{
  int char_size;
  unsigned char* ret;
  char_size = size / 8;
  if(size % 8 != 0)
    char_size++;
  ret = calloc(char_size, 8);
  return ret;
}

/*  Test if bit at position is set. Returns 0 if not set, unspecified non-zero
    otherwise.
*/
static unsigned char test_bitset(unsigned char* bset, int position)
{
  return  bset[position / 8] & ((unsigned char)'\x80' >> (position % 8));
}

/* Sets bit at position
*/
static void set_bitset(unsigned char* bset, int position)
{	  
  bset[position / 8] |= ((unsigned char)'\x80' >> (position % 8));
}
/* Resets bit at position */
static void unset_bitset(unsigned char* bset, int position)
{	  
  bset[position / 8] &= ~((unsigned char)'\x80' >> (position % 8));
}
/* Inverts bit at position */
static void flip_bitset(unsigned char* bset, int position)
{	  
  bset[position / 8] ^= ((unsigned char)'\x80' >> (position % 8));
}

/* Initialize combination enumerator to generate combinations of k elements 
   from set of n elements. Generated combination is returned in array *a which 
   represents combination as 0s and 1s. Successive combinations are obtained 
   thry call to combination_next. In addition it to a it returns r and *w which 
   are intended for use with combination_next. *w to be disposed by caller 
   when done generating combinations as well as a*. Fails if n < k
*/
static void combination_init(int n, int k, int *r, unsigned char **a, unsigned char **w)
{
  int i;
  *r = n-k;
  if(*r >= 0)
    {
      *a = alloc_bitset(n);
      *w = alloc_bitset(n+1);
      
      for (i=0; i < *r; i++) 
	{
	  unset_bitset(*a, i); 
	  set_bitset(*w, i); 
	}
      for (i=*r; i < n; i++) 
	{ 
	  set_bitset(*a, i); 
	  set_bitset(*w, i); 
	}
      set_bitset(*w, n);
    }
}

/* Generate next combination of k elements from set of n elements using r *a 
   and *w initialized with combination_init. Next combination is found in *a,
   with r and *w being updated for use with next call. Returns 1 if next 
   combination exists and output values were updated. Otherwise it returns 0 in
   which case output values are unchanged.
   Uses chase sequence algorithm created by Donald Knuth. This implementation 
   based on C implementation by Glenn C. Rhoads.
*/
static int combination_next(int n, int k, int *r, unsigned char *a, unsigned char *w)
{
  /*      
	  for (i = n-1; i >= 0; i--) printf( "%3d", a[i] );
	  printf( "\n" );
  */  
  int i, ret;
  for (i=*r; !test_bitset(w, i); i++) 
    set_bitset(w,i);
  if (i == n) 
    ret = 0;
  else
    {
      unset_bitset(w, i);
      if (test_bitset(a, i))
	{
	  unset_bitset(a, i);
	  if (ODD(i) || test_bitset(a, i-2))
	    {
	      set_bitset(a, i-1);
	      if (*r == i-1) 
		*r = i;
	      else if (*r == i && i > 1) 
		*r = i-1;
	    }
	  else
	    {
	      set_bitset(a, i-2);
	      if (*r == i-2) 
		*r = i-1;
	      else if (*r == i)
		{
		  if (i > 3) 
		    *r = i-2;
		  else 
		    *r = 1;
		}
	    }
	}
      else
	{
	  set_bitset(a, i);
	  if (ODD(i) && !test_bitset(a, i-1))
	    {
	      unset_bitset(a, i-2);
	      if (*r == i-2) 
		*r = i;
	      else if (*r == i-1) 
		*r = i-2;
	    }
	  else
	    {
	      unset_bitset(a, i-1);
	      if (*r == i-1) 
		*r = i;
	      else if (*r == i && i > 1) 
		*r = i-1;
	    }
	}
      ret = 1;
    }
  return ret;
}

/*----*/
static void fill_index_set(unsigned char* included, int* index, int size)
{
  int i;
  int j = 0;
  for(i = 0; i != size; i++)
    if(test_bitset(included, i))
      index[j++] = i;
}

/* finds optimal combination of features to add to dataset using specified 
   criteria. As side effect index array is sorted.
   dset - (in) full dataset 
   i - (in) number of currently included features
   add_attrs - (in) number of features to remove
   index - (in) pre allocated array to temporary store feature indexes size dset->d
   included - (in) pre populated bitset with all features currently included same size as index
   tmp_index - (in) preallocated array of feature indices size sub_attrs
   max_index - (in, out) preallocated array to hold optimal combination of features to remove on return
   max_crit (in, out) value of criteria if features from max_index are added
   All combinations producing criteria below initial value are disregarded.
   select_crit - (in) number of criteria to apply
   errc - (out) error code if any
   return - number of features in optimal combination == sub_attrs or 0 if error
*/

static int fwd_step(struct dataset *dset, int i, int add_attrs, int* index, 
		    unsigned char *included, int* tmp_index, int* max_index, 
		    double* max_crit, int select_crit, int* errc)
    {
      int max_feat = 0;
      double curr_crit;
      int remaining_attrs = dset->d - i;
      unsigned char* to_include;
      unsigned char * util_set;
      int util_comb;
      combination_init(remaining_attrs, add_attrs, &util_comb, &to_include,
		       &util_set);
      do
	{
	  int addset_index = 0;
	  int currset_index = 0;
	  int attrs_added = 0;
	  int tmp_count;
	  fill_index_set(included, index, dset->d);
	  while((currset_index != dset->d) && (attrs_added != add_attrs) && 
		(addset_index != remaining_attrs))
	    {
	      if(!test_bitset(included, currset_index))
		{
		  if(test_bitset(to_include, addset_index++))
		    {
		      tmp_index[attrs_added++] = currset_index;
		      set_bitset(included, currset_index);
		    }
		}
	      currset_index++;
	    }
	  fill_index_set(included, index, dset->d);
	  curr_crit = eval_crit(dset, i + add_attrs, index, select_crit, errc);
	  for(tmp_count = 0; tmp_count != attrs_added; tmp_count++)
	    unset_bitset(included, tmp_index[tmp_count]);
	  if(*errc != 0) 
	    break;
	  if(curr_crit > *max_crit)
	    {
	      *max_crit = curr_crit;
	      max_feat = 1;
	      for(tmp_count = 0; tmp_count != add_attrs; tmp_count++)
		max_index[tmp_count] = tmp_index[tmp_count];
	    }
/*	  if(!(++comb_tried % 1000))
	    fprintf(stderr, "Tried %d combinations, max criteria = %f\n",
		    comb_tried, max_crit);
*/	}
      while(combination_next(remaining_attrs, add_attrs, &util_comb, to_include,
			     util_set));
      fill_index_set(included, index, dset->d);
      free(to_include);
      free(util_set);
      if(!max_feat)
	add_attrs = 0;
/*      fprintf(stderr, "Features = %d. crit = %f\n", i+add_attrs, max_crit);*/
      return add_attrs;
    }

/*
  Applies forward feature selection algorithm on the dataset
  dset - datset to select features from
  nfeat - number of features to select
  select_crit - ID of selection criteria
  l - number of features to simultaneously insert - complexity of this function 
  is exponentialy dependent on this value so supplying anything more than 1 
  should not be considered lightly
  val_crit - value of criteria function for resulting feature set
  errc - error code from criteria function if any

  returns - internaly allocated array of 0-based feature indices or NULL
  in case of failure. Caller is responsible for deallocation.
 */
int *forward_select(struct dataset *dset, int nfeat, int select_crit, 
		    int l, double *val_crit, int *errc)
{
  int* index = calloc(nfeat, sizeof(int));
  int* tmp_index = calloc(l, sizeof(int));
  int* max_index = calloc(l, sizeof(int));
  unsigned char* included = alloc_bitset(dset->d);
  int i, j;
  int add_attrs = l;
  int attrs_added = 0;
  double max_crit;
  *errc = 0;
  for(i = 0; i != nfeat; i+= add_attrs)
    {
      if(i + add_attrs > nfeat)
	add_attrs = nfeat - i;
      max_crit = 0;
      attrs_added = fwd_step(dset, i, add_attrs, index, included, 
			     tmp_index, max_index, &max_crit, select_crit, errc);
      for(j = 0; j != attrs_added; j++)
	{
	  viprint_line(6, 1, "Added feature %7d (%4d/%4d); criterion value: %f", 
		       max_index[j]+1, i+j+1, nfeat, max_crit);
	  set_bitset(included, max_index[j]);
	}
    }
  fill_index_set(included, index, dset->d);
  free(included);
  free(max_index);
  free(tmp_index);
  if(*errc != 0)
    {
      fprintf(stderr, "Error %d in forward select\n", *errc);
      free(index);
      index = NULL;
    }
  else if (val_crit)
    *val_crit = max_crit;
  return index;
}

/* finds optimal combination of features to remove from dataset using specified 
   criteria. As side effect index array is sorted.
   dset - (in) full dataset 
   i - (in) number of currently included features
   sub_attrs - (in) number of features to remove
   index - (in) pre allocated array to temporary store feature indexes size dset->d
   included - (in) pre populated bitset with all features currently incluses same size as index
   tmp_index - (in) preallocated array of feature indices size sub_attrs
   max_index - (in, out) preallocated array to hold optimal combination of features to remove on return
   max_crit (in, out) value of criteria if features from max_index are removed. 
   All combinations producing criteria below initial value are disregarded.
   select_crit - (in) number of criteria to apply
   errc - (out) error code if any
   return - number of features in optimal combination == sub_attrs or 0 if error
*/

static int backward_step(struct dataset *dset, int i, int sub_attrs, int* index, 
			 unsigned char *included, int* tmp_index,  int* max_index, 
			 double* max_crit, int select_crit, int* errc)
{
      int max_feat = 0;
      double curr_crit;
      unsigned char* to_exclude;
      unsigned char* util_set;
      int util_comb;
      combination_init(i, sub_attrs, &util_comb, &to_exclude, &util_set);
      do
	{
	  int subset_index = 0;
	  int currset_index = 0;
	  int attrs_subed = 0;
	  int tmp_count;
	  fill_index_set(included, index, dset->d);
	  while((currset_index != dset->d) && (attrs_subed != sub_attrs) && 
		(subset_index != i))
	    {
	      if(test_bitset(included, currset_index))
		{
		  if(test_bitset(to_exclude, subset_index++))
		    {
		      unset_bitset(included, currset_index);
		      tmp_index[attrs_subed++] = currset_index;
		    }
		}
	      currset_index++;
	    }
	  fill_index_set(included, index, dset->d);
	  curr_crit = eval_crit(dset, i - attrs_subed, index, select_crit, errc);
	  for(tmp_count = 0; tmp_count != attrs_subed; tmp_count++)
	    {
	      set_bitset(included, tmp_index[tmp_count]);
	    }
	  if(*errc != 0)
	    break;
	  if(curr_crit > *max_crit)
	    {
	      *max_crit = curr_crit;
	      max_feat = 1;
	      for(tmp_count = 0; tmp_count != attrs_subed; tmp_count++)
		max_index[tmp_count] = tmp_index[tmp_count];
	    }
	}
      while(combination_next(i, sub_attrs, &util_comb, to_exclude, util_set));
      fill_index_set(included, index, dset->d);
      free(to_exclude);
      free(util_set);
      if(!max_feat)
	sub_attrs = 0;
      return sub_attrs;
}

/*
  Applies backward feature selection algorithm on the dataset
  dset - datset to select features from
  nfeat - number of features to select
  select_crit - ID of selection criteria
  l - number of features to simultaneously remove - complexity of this function 
  is exponentialy dependent on this value so supplying anything more than 1 
  should not be considered lightly
  val_crit - value of criteria function for resulting feature set
  errc - error code from criteria function if any

  returns - internaly allocated array of 0-based feature indices or NULL
  in case of failure. Caller is responsible for deallocation.
 */

int *backward_select(struct dataset *dset, int nfeat, int select_crit, 
			    int r, double *val_crit, int *errc)
{
  int* index = calloc(dset->d, sizeof(int));
  int* tmp_index = calloc(r, sizeof(int));
  int* max_index = calloc(r, sizeof(int));
  unsigned char* included = alloc_bitset(dset->d);
  int inc_size = dset->d / 8;
  int i, j;
  double max_crit;
  int sub_attrs = r;
  if((dset->d % 8) != 0)
    inc_size++; 
  for(i = 0; i != inc_size; i++)
    included[i] = ~(included[i]);
  *errc = 0;
  for(i = dset->d; i != nfeat; i-= sub_attrs)
    {
      int attrs_subed;
      max_crit = 0.;
      if(i - sub_attrs < nfeat)
	sub_attrs = i - nfeat;
      attrs_subed = backward_step(dset, i, sub_attrs, index, included, 
				  tmp_index, max_index, &max_crit, select_crit, 
				  errc);
      if(*errc != 0)
	break;
      for(j = 0; j != attrs_subed; j++)
	{
	  viprint_line(6, 1, "Removed feature %d (%d/%d); criterion value: %f", 
		       max_index[j], dset->d - i + 1, dset->d - nfeat, max_crit);
	  unset_bitset(included, max_index[j]);
	}
    }
  if(*errc != 0)
    {
      free(index);
      index = NULL;
      fprintf(stderr, "Error %d in backward select\n", *errc);
    }
  else
    fill_index_set(included, index, dset->d);
  free(included);
  free(max_index);
  free(tmp_index);
  if (val_crit)
    *val_crit = max_crit;
  return index;
}
/*
  Applies Pudil's floating forward feature selection algorithm on the dataset
  dset - datset to select features from
  nfeat - number of features to select
  select_crit - ID of selection criteria
  val_crit (out) - value of criteria function for resulting feature set
  current_attrs (out) - actual number of attributes in result set
  errc (out) - error code from criteria function if any

  returns - internaly allocated array of 0-based feature indices or NULL
  in case of failure. Caller is responsible for deallocation.
 */

int *float_fwd_select(struct dataset *dset, int nfeat, int select_crit, 
		      double *val_crit, int* current_attrs, int *errc)
{
  int* index = calloc(dset->d, sizeof(int));
  int tmp_index;
  int add_index;
  int sub_index;
  unsigned char* included = alloc_bitset(dset->d);
  double max_crit = 0.;
  double* crit_vals = calloc(nfeat + 1, sizeof(double));
  int progress = 1;
  *errc = 0;
  *current_attrs = 0;
  while(*current_attrs != nfeat && progress)
    {
      int attrs_added = 0;
      int attrs_subed = 0;
      progress = 0;
      max_crit = crit_vals[*current_attrs];
      attrs_added = fwd_step(dset, *current_attrs, 1, index, included, &tmp_index, 
			  &add_index, &max_crit, select_crit, errc);
      if(*errc)
	break;
      if(attrs_added)
	{
	  set_bitset(included, add_index);
	  fill_index_set(included, index, dset->d);
	  crit_vals[++(*current_attrs)] =  max_crit;
#if defined(_FSELECT_DEBUG)
	  fprintf(stderr, "Added feature %d\n", add_index);
#endif
	  progress = 1;
	}
      while(*current_attrs > 2)
	{
	  max_crit = crit_vals[*current_attrs - 1];
	  attrs_subed = backward_step(dset, *current_attrs, 1, index, included, 
				      &tmp_index, &sub_index, &max_crit, 
				      select_crit, errc);
	  if(*errc)
	    break;
	  if(attrs_subed && (max_crit > crit_vals[*current_attrs - 1]))
	    {
	      unset_bitset(included, sub_index);
	      (*current_attrs)--;
	      crit_vals[*current_attrs] = max_crit;
#if defined(_FSELECT_DEBUG)
	      fprintf(stderr, "Removed feature %d\n", sub_index);
#endif
	      progress = 1;
	    }
	  else
	    break;
	}
    }
  if(*errc)
    {
      free(index);
      index = NULL;
    }
  else
    {
      *val_crit = crit_vals[*current_attrs - 1];
      fill_index_set(included, index, dset->d);
    }
  free(crit_vals);
  return index;
}

/*
  Select optimal `nfeat' features of `dset', using `select_crit'
  criterion and plus_l_minus_r method with parameters `l' and `r'.

  The function returns the indices of the optimal subset. The optimal
  value of the criterion is set in `val_crit'.
*/
static int *plus_l_minus_r(struct dataset *dset, int nfeat, int select_crit, int l, int r, double *val_crit)
{
  int *index;

  index = (int *) 0;
  return index;
}

/*
  Compute optimal subset of `d' features in `dset' using feature
  selection method `fsel_method' and feature subset evaluation
  criterion `fscrit'. If `dsflag' is 1, return the optimal-feature
  subset of `dset'.  If `index' is not NULL, return 0-based indices of
  selected features in 'index'.
  
  In case of error, return NULL and set errc.  However, if `dsflag' is
  0, the function returns NULL, so you have to check errc for errors.
*/
struct dataset *select_subset(struct dataset *dset, int d, int fsel_method, int fscrit, 
			      int **index, int dsflag, int *errc)
{
  int    l = 1;
  int    r = 1;
  double val_crit;
  int    *l_index;
  float  *ranking;
  struct dataset *subset;
  
  if (errc)
    *errc = 0;
  subset = (struct dataset *) 0;
  if ((d > 0) && (d <= dset->d))
    {
      if (fsel_method == PDR_RANKING)
	{
	  ranking = rank_features(dset, fscrit, &l_index);
	  if (ranking)
	    {
	      subset = dataset_select(dset, l_index, d);
	      free(ranking);
	      if (index)
		*index = l_index;
	      else
		free(l_index);
	    }
	}
      else if ((fsel_method == PDR_FORWARD) || (fsel_method == PDR_BACKWARD) || 
	       (fsel_method == PDR_PLUS_L_MINUS_R) || (fsel_method == PDR_FLOAT_FORWARD) || 
	       (fsel_method == PDR_BB))
	{
	  if (fsel_method == PDR_FORWARD)
	    {
	      l_index = forward_select(dset, d, fscrit, l, &val_crit, errc);
	    }
	  else if (fsel_method == PDR_BACKWARD)
	    {
	      l_index = backward_select(dset, d, fscrit, r, &val_crit, errc);
	    }
	  else if (fsel_method == PDR_FLOAT_FORWARD)
	    {
	      int really_selected;
	      l_index = float_fwd_select(dset, d, fscrit, &val_crit, 
					 &really_selected, errc);
	      d = really_selected;
	    }
	  else if (fsel_method == PDR_BB)
	    {
	      ;
	    }
	  else if (fsel_method == PDR_PLUS_L_MINUS_R)
	    {
	      l = PCP_FSEL_LPAR;
	      r = PCP_FSEL_RPAR;
	      l_index = plus_l_minus_r(dset, d, fscrit, l, r, &val_crit);
	    }
	  if (l_index)
	    {
	      if (dsflag == 1)
		subset = dataset_select(dset, l_index, d);
	      if (index)
		*index = l_index;
	      else
		free(l_index);
	    }
	}
    }
  return subset;
}





syntax highlighted by Code2HTML, v. 0.9.1