/*
  File name: dataset.c
  Created by: Ljubomir Buturovic
  Created: 03/10/2004
  Purpose: API for struct dataset.
*/

/*
  Copyright 2004 Ljubomir J. Buturovic

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation files
  (the "Software"), to deal in the Software without restriction,
  including without limitation the rights to use, copy, modify, merge,
  publish, distribute, sublicense, and/or sell copies of the Software,
  and to permit persons to whom the Software is furnished to do so,
  subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <float.h>
#include "hash_util.h"
#include "dataset.h"
#include "lau.h"
#include "lmat.h"

static char rcsid[] = "$Id: dataset.c,v 1.37 2006/03/27 17:45:39 ljubomir Exp $";

/*
  dataset_new() without the 'prediction' and 'ps' members.
*/
struct dataset *dataset_lt(int d, int c, int *nd, int nv, char **fnames, float **x)
{
  int i;
  int offset;
  int idx;
  struct dataset *dset;

  dset = calloc(1, sizeof(struct dataset));
  if (dset)
    {
      dset->label = malloc(nv*sizeof(int));
      if (dset->label)
	{
	  dset->d = d;
	  dset->c = c;
	  dset->nd = nd;
	  dset->nv = nv;
	  if (fnames)
	    dset->fnames = fnames;
	  if (x)
	    dset->x = x;
	  /*
	    Assign class labels.
	  */
	  idx = 0;
	  offset = nd[0];
	  for (i = 0; i < nv; i++)
	    {
	      dset->label[i] = idx;
	      if ((i == offset-1) && (i != nv-1))
		{
		  idx++;
		  offset += nd[idx];
		}
	    }
	}
    }
  return dset;
}  

/*
  Load contents of 'c' files, named in 'names', in format 'fmt', into
  a struct dataset. Return the struct.

  Return (struct dataset *) 0 in case of error, and set
  errc/fname. The possible values for errc are malloc() and file I/O
  error codes, or LERR_FILE_FORMAT for unrecognized input file format.
*/
struct dataset *load_dataset(int d, int c, int *nd, char **names, int fmt, 
			     int *errc, char **fname)
{
  int   i;
  int   status;
  int   offset;
  int   j;
  int   errno_save;
  char  **alab;
  float **x;
  struct dataset *dset;

  *errc = 0;
  status = 0;
  dset = calloc(1, sizeof(struct dataset));
  if (dset)
    {
      dset->c = c;
      dset->format = fmt;
      dset->fnames = string_copy(names, c);
      dset->d = d;
      dset->nd = ivec_clone(nd, c);
      if (dset->nd == (int *) 0)
	status = -1;
      if (dset->nd)
	{
	  dset->nv = ivec_sum(dset->nd, dset->c);
	  dset->x = malloc((dset->nv)*sizeof(float *));
	  if (dset->x)
	    {
	      if ((fmt == DATASET_FF_VEC) || (fmt == DATASET_FF_COLVEC))
		{
		  dset->xlab = calloc(dset->nv, sizeof(char *));
		  if (!dset->xlab)
		    status = -1;
		}
	      if (!status)
		{
		  for (i = 0; (i < dset->nv) && (status == 0); i++)
		    {
		      dset->x[i] = malloc((dset->d)*sizeof(float));
		      if (!dset->x[i])
			status = -1;
		    }
		  if (!status)
		    {
		      dset->label = malloc(dset->nv*sizeof(int));
		      if (dset->label)
			{
			  offset = 0;
			  x = dset->x;
			  for (i = 0; (i < dset->c) && !status; i++)
			    {
			      if ((i == 0) && ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC)))
				{
				  alab = calloc(dset->d, sizeof(char *));
				  if (!alab)
				    status = -1;
				}
			      else
				alab = (char **) 0;
			      if (!status)
				{
				  if (dset->xlab)
				    status = load_file(dset->fnames[i], &(x[offset]),
						       dset->format, &(dset->xlab[offset]), &dset->alab0,
						       alab, dset->d, dset->nd[i], errc);
				  else
				    status = load_file(dset->fnames[i], &(x[offset]),
						       dset->format, (char **) 0, &dset->alab0,
						       alab, dset->d, dset->nd[i], errc);
				  if (!status)
				    {
				      if ((i == 0) && ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC)))
					dset->alab = alab;
				      for (j = offset; j < offset+nd[i]; j++)
					dset->label[j] = i;
				      offset += dset->nd[i];
				    }
				  else
				    *fname = strdup(dset->fnames[i]);
				}
			    }
			}
		      else
			status = -1;
		    }
		}
	    }
	  else
	    status = -1;
	}
      else
	status = -1;
    }
  else
    status = -1;
  if (status == -1)
    {
      errno_save = errno;
      dset = dataset_free(dset);
      if (errc && !*errc)
	*errc = errno_save;
    }
  return dset;
}

/*
  Free 'dset'. Preserves the value of errno.
*/
struct dataset *dataset_free(struct dataset *dset)
{
  int errno_save;

  errno_save = errno;
  if (dset)
    {
      mx_free((void **) dset->fnames, dset->c);
      mx_free((void **) dset->x, dset->nv);
      mx_free((void **) dset->xlab, dset->nv);
      mx_free((void **) dset->alab, dset->d);
      mx_free((void **) dset->sigma, dset->c);
      vx_free(dset->det);
      free(dset->prediction);
      free(dset->ps);
      free(dset->nd);
      free(dset->label);
      free(dset);
    }
  errno = errno_save;
  return (struct dataset *) 0;
}  

/*
  Dataset resampling function, normally used for bagging
  algorithms. Samples 'bag_size' vectors from 'dset' and places them
  into 'bag', with 'bnd' vectors per class. 'mdl' is the model index
  (used for logging only).
*/
int resample(int mdl, struct dataset *dset, int bag_size, float **bag, int *bnd, FILE *fdbg)
{
  int   j;
  int   cx;
  int   offset;
  int   cbx;
  int   c_bag_size;
  int   idx;
  int   nvec;
  int   resampling_mode = POPULATION_RESAMPLING;
  float cf;
  int   *ibag;
  
  if (resampling_mode == CLASS_RESAMPLING)
    {
      ivec_set(bnd, dset->c, 0);
      offset = 0;
      cbx = 0;
      for (cx = 0; cx < dset->c; cx++)
	{
	  cf = dset->nd[cx];
	  cf = cf/(dset->nv);
	  c_bag_size = cf*bag_size;
	  for (j = 0; j < c_bag_size; j++)
	    {
	      idx = rand_int(0, dset->nd[cx]-1);
	      bag[cbx] = dset->x[idx+offset];
	      /* Use for testing. */
	      /*bag[cbx] = dset->x[cbx];*/
	      cbx++;
	      bnd[cx]++;
	      if (fdbg)
		fprintf(fdbg, "bagging(); model %d; bag element %d is %d.\n", 
			mdl, cbx-1, idx+offset);
	    }
	  offset += dset->nd[cx];
	}
      nvec = ivec_sum(bnd, dset->c);
    }
  else if (resampling_mode == POPULATION_RESAMPLING)
    {
      ibag = malloc((bag_size+1)*sizeof(int));
      ivec_set(bnd, dset->c, 0);
      for (cbx = 0; cbx < bag_size; cbx++)
	ibag[cbx] = rand_int(0, dset->nv-1);
      intsort(ibag, bag_size);
      if (fdbg)
	fprintf(fdbg, "bag:\t%d\t", mdl+1);
      for (cbx = 0; cbx < bag_size; cbx++)
	{
	  idx = ibag[cbx];
	  bag[cbx] = dset->x[idx];
	  /* Use for testing. */
	  /*bag[cbx] = dset->x[cbx];*/
	  cx = dataset_class(idx, dset->c, dset->nd);
	  bnd[cx]++;
	  if (fdbg)
	    {
	      fprintf(fdbg, "%d\t", idx+1);
	      /*fprintf(fdbg, "bagging(); model %d; bag element %d is %d.\n", 
		mdl, cbx, idx);*/
	    }
	}
      if (fdbg)
	fprintf(fdbg, "\n");
      free(ibag);
      nvec = ivec_sum(bnd, dset->c);
    }
  return nvec;
}

/*
  Return class index (0..c-1) for vector 'ivx' (0-based).
*/
int dataset_class(int ivx, int c, int *nd)
{
  int i;
  int icx;
  int ncx;
  int done;
  
  icx = 0;
  ncx = nd[0];
  done = 0;
  for (i = 1; (i < c) && (done == 0); i++)
    {
      if (ivx < ncx)
	{
	  done = 1;
	}
      else
	{
	  ncx += nd[i];
	  icx++;
	}
    }
  return icx;
}

/*
  Utility function for partitioning set of integers in [low..high]
  interval into 'nxval' disjoint subsets. The subsets are returned in
  'sx' and 'lx' integer arrays. Local variable 'tsx' has elements of
  the subsets, local variable 'tlx[i]' has cardinality of 'i'-th
  subset. tlx[i] is either [range/nxval] or [range/nxval]+1. So,
  tsx[0..lx[0]-1] are elements of the first subset;
  tsx[lx[0]..lx[0]+lx[1]-1] are elements of the second subset,
  etc. Addresses of 'tsx' and 'tlx' are returned in '*sx' and '*lx'.

  The partitioning is performed using pseudo-random number generator
  rand().

  Return -1 in case of malloc() error, 0 otherwise. If 'nxval' is
  greater than 'range', or less than 1, set 'sx' and 'lx' to NULL.

  NOTE: notation [low..high] designates set of integers between 'low'
  and 'high', including both 'low' and 'high'.
*/
static int partition_range(int low, int high, int nxval, int **sx, int **lx)
{
  int status;
  int range;
  int i;
  int j;
  int ivx;
  int jx;
  int tcx;
  int idx;
  int lsmall;
  int lbig;
  int *tsx;
  int *tlx;
  int *xc;
  float rcc;

  tsx = (int *) 0;
  tlx = (int *) 0;
  status = 0;
  range = high-low+1;
  if ((nxval >= 1) && (nxval <= range))
    {
      rcc = range/nxval;
      idx = 0;
      /*
	xc has a list of remaining integers from the 0..range-1.
	Initially xc contains all integers in the range.
      */
      xc = malloc(range*sizeof(int));
      for (j = 0; j < range; j++)
	xc[j] = j;
      
      tsx = malloc(range*sizeof(int));
      tlx = malloc(nxval*sizeof(int));
      if ((xc == (int *) 0) || (tsx == (int *) 0) || (tlx == (int *) 0))
	status = -1;
      else
	{
	  /*
	    Among the 'nxval' subsets, lsmall will have cardinality
	    [range/nxval]. The remaining lbig = nxval-lsmall subsets
	    will have one more element.
	  */
	  lbig = rcc;
	  lbig = range-lbig*nxval;
	  lsmall = nxval-lbig;
	  for (j = 0; j < lsmall; j++)
	    tlx[j] = rcc;
	  for (j = lsmall; j < nxval; j++)
	    tlx[j] = rcc+1;
	  tcx = range;
	  i = 0;
	  for (j = 0; j < range; j++)
	    {
	      /*
		Choose an integer, pseudo-randomly, and insert it into
		tsx.
	      */
	      ivx = rand_int(0, tcx-1);
	      tsx[i] = xc[ivx]+low;
	      i++;
	      for (jx = ivx; jx < tcx-1; jx++)
		xc[jx] = xc[jx+1];
	      tcx--;
	    }
	  free(xc);
	}
    }
  else
    status = -1;
  *sx = tsx;
  *lx = tlx;
  return status;
}

/*
  Partition each of the 'c' classes with 'nd[i]' vectors per class,
  into 'nxval' disjoint subsets. The resulting partition is specified
  in 'sxc' and 'lxc' arrays.
  
  Local arrays 'tsxc' and 'tlxc' have list of vectors in each subset,
  and subset cardinalities, respectively. 'tsxc[i]' and 'tlxc[i]' are
  pointers to 'sx' and 'lx' arrays, respectively, for class 'i', as
  described in comments for xss(). The addresses of 'tsxc' and 'tlxc'
  are returned in '*sxc' and '*lxc'.

  The function may be used for cross-validation experiments, to return
  all subsets. Then the subsets may be analyzed one at a time.

  The function returns -1 and sets errno in case of malloc() failure,
  0 otherwise. If 'nxval' is not in a correct range for any class, set
  'sxc' and 'lxc' to NULL. 
*/
struct subset **dataset_partition(struct dataset *dset, int nxval)
{
  int i;
  int j;
  int k;
  int length;
  int cl;
  int sl;
  int done;
  int status;
  int ccc;
  int *sx;
  int *lx;
  struct subset **xset;

  status = 0;
  done = 0;
  xset = malloc(nxval*sizeof(struct subset *));
  length = dset->nv/nxval+1;
  for (i = 0; i < nxval; i++)
    {
      xset[i] = calloc(1, sizeof(struct subset));
      xset[i]->idx = malloc(length*sizeof(int));
    }
  if (xset)
    {
      for (i = 0; (i < dset->c) && !done; i++)
	{
	  ccc = ivec_sum(dset->nd, i);
	  status = partition_range(ccc, ccc+dset->nd[i]-1, nxval, &sx, &lx);
	  if (!status && sx && lx)
	  {
	    sl = 0;
	    for (j = 0; j < nxval; j++)
	      {
		length = lx[j];
		cl = xset[j]->size;
		for (k = 0; k < length; k++)
		  xset[j]->idx[k+cl] = sx[sl++];
		xset[j]->size += length;
	      }
	    free(lx);
	    free(sx);
	  }
	  else
	    {
	      for (j = 0; j < nxval; j++)
		free(xset[j]);
	      xset = (struct subset **) 0;
	      done = 1;
	    }
	}
    }
  return xset;
}

/*
  Integer comparison function for qsort().
*/
static int compare_integers(const void *int1, const void *int2)
{
  int retval = -1;
  int *i1;
  int *i2;
  
  i1 = (int *) int1;
  i2 = (int *) int2;
  if (*i1 > (int) *i2)
    retval = 1;
  else if (*i1 == *i2)
    retval = 0;
  return retval;
}

/*
  Extract dataset defined by 'xset' from 'dset'. If 'complement' is 1,
  extract complement of 'xsubset'.

  The function returns the subset. If 'dset' or 'xset' are NULL,
  return NULL and set errno to 0. In case of memory allocation error,
  return NULL and set errno to the corresponding system function errno
  value.
*/
struct dataset *dataset_subset(struct dataset *dset, struct subset *xset, int complement)
{
  int i;
  int icl;
  int index;
  int *ind;
  int *isx;
  struct dataset *sset;

  sset = (struct dataset *) 0;
  if (dset && xset)
    {
      sset = calloc(1, sizeof(struct dataset));
      if (sset)
	{
	  sset->d = dset->d;
	  sset->c = dset->c;
	  if (complement == 1)
	    sset->nv = dset->nv-xset->size;
	  else
	    sset->nv = xset->size;
	  /*
	    Sort input vectors indices.
	  */
	  if (complement == 1)
	    {
	      ind = malloc(dset->nv*sizeof(int));
	      for (i = 0; i < dset->nv; i++)
		ind[i] = i;
	      isx = ivec_diff(ind, dset->nv, xset->idx, xset->size, &icl);
	      free(ind);
	    }
	  else
	    isx = ivec_clone(xset->idx, xset->size);
	  qsort(isx, xset->size, sizeof(int), compare_integers);
	  /*
	    Copy vectors.
	  */
	  sset->x = calloc(sset->nv, sizeof(float *));
	  if (sset->x)
	    {
	      for (i = 0; i < sset->nv; i++)
		{
		  index = isx[i];
		  sset->x[i] = fvec_clone(dset->x[index], dset->d);
		}
	      if (dset->xlab)
		{
		  sset->xlab = calloc(sset->nv, sizeof(char *));
		  if (sset->xlab)
		    {
		      for (i = 0; i < sset->nv; i++)
			{
			  index = isx[i];
			  sset->xlab[i] = strdup(dset->xlab[index]);
			}
		    }
		  else
		    sset = (struct dataset *) vx_free(sset); /* TBD: need to free() the whole struct */
		}
	    }
	  else
	    sset = (struct dataset *) vx_free(sset);
	  if (sset)
	    {
	      /*
		Compute class cardinalities.
	      */
	      ind = calloc(dset->c, sizeof(int));
	      for (i = 0; i < sset->nv; i++)
		{
		  icl = dataset_class(isx[i], dset->c, dset->nd);
		  ind[icl]++;
		}
	      free(isx);
	      sset->nd = ind;
	    }
	}
    }
  else
    errno = 0;
  return sset;
}

/*
  Compute covariance matrices and determinants for 'dset'. The
  matrices are stored in dset->sigma, in Symmetric Storage Mode, and
  the determinants are stored in dset->det.

  In case of success, return 0. In case of failure, return -1 and set
  'errc'.
*/
int dataset_sigma(struct dataset *dset, int *errc)
{
  int   status;
  int   j;
  int   idx;
  int   offset;
  float dsign;
  float dexp;
  float *det;
  float **sigma;
  float **cov;
  float **inverse;

  status = 0;
  *errc = 0;
  if (!dset->sigma)
    {
      sigma = calloc(dset->c, sizeof(float *));
      if (sigma)
	{
	  det = malloc(2*dset->c*sizeof(float));
	  if (det)
	    {
	      offset = 0;
	      idx = 0;
	      for (j = 0; (j < dset->c) && !status; j++)
		{
		  cov = cest(&dset->x[offset], dset->d, dset->nd[j], COVARIANCE_MATRIX);
		  if (cov)
		    {
		      inverse = fmx_inv(cov, dset->d, &dsign, &dexp, errc);
		      mx_free((void **) cov, dset->d);
		      if (inverse)
			{
			  sigma[j] = fmx_ssm(inverse, dset->d);
			  if (sigma[j])
			    {
			      offset += dset->nd[j];
			      det[idx] = dsign;
			      idx++;
			      det[idx] = dexp;
			      idx++;
			    }
			  else
			    {
			      status = -1;
			      vx_free(det);
			      mx_free((void **) sigma, j);
			    }
			  mx_free((void **) inverse, dset->d);
			}
		      else
			{
			  status = -1;
			  vx_free(det);
			  mx_free((void **) sigma, dset->c);
			}
		    }
		  else
		    {
		      status = -1;
		      *errc = errno;
		      vx_free(det);
		      mx_free((void **) sigma, dset->c);
		    }
		}
	      if (!status)
		{
		  dset->sigma = sigma;
		  dset->det = det;
		}
	    }
	  else
	    {
	      status = -1;
	      *errc = errno;
	    }
	}
      else
	{
	  status = -1;
	  *errc = errno;
	}
    }
  return status;
}

struct dataset *dataset_new(int d, int c, int *nd, int nv, char **fnames, float **x)
{
  struct dataset *dset;

  dset = calloc(1, sizeof(struct dataset));
  if (dset != (struct dataset *) 0)
    {
      dset->d = d;
      dset->c = c;
      dset->nd = nd;
      dset->nv = nv;
      dset->fnames = fnames;
      dset->prediction = calloc(nv, sizeof(float));
      if (dset->prediction)
	dset->x = x;
      else
	dset = dataset_free(dset);
    }
  return dset;
}  

/*
  Return clone of 'dset'.

  In case of failure, return NULL and set errno. The possible errors
  are memory allocation errors.
*/
struct dataset *dataset_clone(struct dataset *dset)
{
  int i;
  int len;
  int status;
  struct dataset *clone = (struct dataset *) 0;

  status = 0;
  if (dset)
    {
      clone = calloc(1, sizeof(struct dataset));
      if (clone)
	{
	  clone->d = dset->d;
	  clone->c = dset->c;
	  clone->nv = dset->nv;
	  clone->format = dset->format;
	  clone->nd = ivec_clone(dset->nd, dset->c);
	  if (clone->nd)
	    {
	      if (dset->fnames)
		{
		  clone->fnames = malloc(dset->c*sizeof(char *));
		  status = 0;
		  for (i = 0; (i < dset->c) && !status; i++)
		    {
		      clone->fnames[i] = strdup(dset->fnames[i]);
		      if (!clone->fnames[i])
			status = -1;
		    }
		}
	      if (!status && (dset->x))
		{
		  clone->x = fmx_alloc(dset->nv, dset->d);
		  if (clone->x)
		    for (i = 0; i < dset->nv; i++)
		      fvec_copy(clone->x[i], dset->x[i], dset->d);
		  else
		    status = -1;
		}
	      if (!status && dset->xlab)
		{
		  clone->xlab = calloc(dset->nv, sizeof(char *));
		  if (clone->xlab)
		    for (i = 0; i < dset->nv; i++)
		      clone->xlab[i] = strdup(dset->xlab[i]);
		  else
		    status = -1;
		}
	      if (!status && dset->alab)
		{
		  clone->alab = calloc(dset->d, sizeof(char *));
		  if (clone->alab)
		    for (i = 0; i < dset->d; i++)
		      clone->alab[i] = strdup(dset->alab[i]);
		  else
		    status = -1;
		}
	      if (!status && dset->alab0)
		{
		  clone->alab0 = strdup(dset->alab0);
		  if (!clone->alab0)
		    status = -1;
		}
	      if (!status && dset->label)
		{
		  clone->label = ivec_clone(dset->label, dset->nv);
		  if (!clone->label)
		    status = -1;
		}
	      if (!status && dset->prediction)
		{
		  clone->prediction = ivec_clone(dset->prediction, dset->nv);
		  if (!clone->prediction)
		    status = -1;
		}
	      if (!status && dset->ps)
		{
		  clone->ps = dmx_clone(dset->ps, dset->nv, dset->d);
		  if (!clone->ps)
		    status = -1;
		}
	      if (!status && dset->sigma)
		{
		  clone->sigma = malloc(dset->c*sizeof(float *));
		  if (!clone->sigma)
		    status = -1;
		  else
		    {
		      len = dset->d*(dset->d+1)/2;
		      for (i = 0; i < dset->c; i++)
			clone->sigma[i] = fvec_clone(dset->sigma[i], len);
		    }
		}
	      if (!status && dset->det)
		clone->det = fvec_clone(dset->det, 2*dset->d);
	    }
	  else
	    status = -1;
	}
    }
  if (status == -1)
    clone = dataset_free(clone);
  return clone;
}  

/*
  Map 'dset' into 'd'-dimensional space using 'n' by dset->d matrix
  'fmx'. 'n' has to be >= d, but the function does not check that
  (since it doesn't know n).

  Return NULL and set errno in case of failure. Possible errors are
  memory allocation errors and EINVAL for bad input arguments.
*/
struct dataset *dataset_map(struct dataset *dset, int d, float **fmx)
{
  struct dataset *mapped_dset;
  float **x;

  mapped_dset = (struct dataset *) 0;
  if (dset && fmx)
    {
      x = fmx_mult(dset->x, dset->nv, dset->d, fmx, d, 1);
      if (x)
	{
	  mapped_dset = calloc(1, sizeof(struct dataset));
	  if (mapped_dset)
	    {
	      mapped_dset->x = x;
	      mapped_dset->nd = ivec_clone(dset->nd, dset->c);
	      if (mapped_dset->nd)
		{
		  mapped_dset->label = ivec_clone(dset->label, dset->nv);
		  if (dset->label && !mapped_dset->label)
		    mapped_dset = dataset_free(mapped_dset);
		  else
		    {
		      mapped_dset->nv = dset->nv;
		      mapped_dset->d = d;
		      mapped_dset->c = dset->c;
		    }
		}
	      else
		mapped_dset = dataset_free(mapped_dset);
	    }
	}
    }
  else
    errno = EINVAL;
  return mapped_dset;
}

/*
  In-place mapping of 'dset' into 'd'-dimensional space using 'n' by
  dset->d matrix 'fmx'. 'n' has to be >= d, but the function does not
  check that (since it doesn't know n).

  Return -1 in case of inconsistent input. Return -1 and set errno
  in case of failure.
*/
int dataset_mapx(struct dataset *dset, int d, float **fmx)
{
  int   status;
  float **x;

  status = 0;
  x = fmx_mult(dset->x, dset->nv, dset->d, fmx, d, 1);
  if (x)
    {
      mx_free((void **) dset->x, dset->nv);
      dset->x = x;
      dset->d = d;
    }
  else
    status = -1;
  return status;
}

/*
  Save 'matrix' in 'fname'. Store `alab[i]' as column names in the
  first row. Store 'xlab[i]' in the first column of the i-th row. If
  `alab0' is not NULL, it is stored as first column in the first row.

  Return -1 in case of error and set errno.

  ljb, 08/10/2005: allow zero rows and/or columns. It is conceivable
  that we may have an empty test class.
*/

static int dataset_save(float **matrix, char *alab0, char **alab, char **xlab, 
			int rows, int columns, char *fname)
{
  int status = 0;
  int i;
  FILE *fptr;

  fptr = fopen(fname, "w");
  if (fptr)
    {
      if (alab0)
	fprintf(fptr, "%s\t", alab0);
      if (alab)
	{
	  for (i = 0; i < columns-1; i++)
	    fprintf(fptr, "%s\t", alab[i]);
	  if (columns > 1)
	    fprintf(fptr, "%s", alab[columns-1]);
	  fprintf(fptr, "\n");
	}
      status = fmx_nwrite(fptr, matrix, xlab, rows, columns);
      if (!status)
	status = fclose(fptr);
    }
  else
    status = -1;
  return status;
}

/*
  Write `dset->x' into `dset->fnames' in `dset->format'.

  Return -1 in case of error and set errno. In case of file error,
  copy the offending file name to 'xname'.
*/
int dataset_write(struct dataset *dset, char **xname)
{
  int  i;
  int  offset;
  int  status;
  char **xlab;

  status = 0;
  if (dset && dset->x && dset->nd)
    {
      offset = 0;
      for (i = 0; (i < dset->c) && !status; i++)
	{
	  if (dset->xlab)
	    xlab = &dset->xlab[offset];
	  else
	    xlab = (char **) 0;
	  status = dataset_save(&dset->x[offset], dset->alab0, dset->alab, 
				xlab, dset->nd[i], dset->d, dset->fnames[i]);
	  if ((status == -1) && xname)
	    *xname = strdup(dset->fnames[i]);
	  offset += dset->nd[i];
	}
    }
  return status;
}

/*
  Write `dset->x' into filenames prefix1.dmp, prefix2.dmp, etc., in
  `dset->format'.

  Return -1 in case of error and set errno. In case of file error,
  copy the offending file name to 'xname'.
*/
int dataset_dump(struct dataset *dset, char *prefix, char **xname)
{
  int  i;
  int  len;
  int  status;
  char *name;
  char **names;
  char **dmp_names;

  names = dset->fnames;
  dmp_names = calloc(dset->c+1, sizeof(char *));
  len = strlen(prefix)+20;
  for (i = 0; i < dset->c; i++)
    {
      name = malloc((len+1)*sizeof(char));
      sprintf(name, "%s%d.dmp", prefix, i+1);
      dmp_names[i] = name;
    }
  dset->fnames = dmp_names;
  status = dataset_write(dset, xname);
  str_free(dmp_names);
  dset->fnames = names;
  return status;
}

/*
  Apply linear classifier 'wmx' to 'dset'. 'wmx' is assumed to be a
  dset->c by dset->d+1 matrix whose last column contains the bias term
  of the linear classifier. The predictions are stored in
  dset->prediction.

  Return 0 in case of success. In case of error, return the error
  code.  Error codes are EINVAL for bad input arguments and malloc()
  error codes.
*/
int dataset_lin_predict(struct dataset *dset, float **wmx)
{
  int    i;
  int    j;
  int    errc;
  double gmax;
  double gx;

  errc = 0;
  if (dset && wmx)
    {
      if (!dset->prediction)
	dset->prediction = malloc(dset->nv*sizeof(int));
      if (dset->prediction)
	{
	  for (i = 0; i < dset->nv; i++)
	    {
	      gmax = -FLT_MAX;
	      for (j = 0; j < dset->c; j++)
		{
		  gx = fvec_dot(wmx[j], dset->x[i], dset->d, (int *) 0);
		  gx += wmx[j][dset->d];
		  if (gx > gmax)
		    {
		      dset->prediction[i] = j;
		      gmax = gx;
		    }
		}
	    }
	}
      else
	errc = errno;
    }
  else
    errc = EINVAL;
  return errc;
}

/*
  Apply parametric quadratic classifier defined by 'wmx' and 'sigma'
  to 'dset'. 'wmx' is assumed to be a dset->c by dset->d+1 matrix
  whose last column contains the bias term. 'sigma' are inverted
  covariance matrices of the training data set in SSM. They define the
  quadratic term of the classifier. The predictions are stored in
  dset->prediction.

  Return 0 in case of success. In case of error, return the error
  code.  Error codes are EINVAL for bad input arguments and malloc()
  error codes.
*/
int dataset_pqc_predict(struct dataset *dset, float **wmx, float **sigma)
{
  int    i;
  int    j;
  int    d;
  int    errc;
  double gmax;
  double gx;
  float  *fv;

  errc = 0;
  d = dset->d;
  if (dset && wmx)
    {
      if (!dset->prediction)
	dset->prediction = malloc(dset->nv*sizeof(int));
      if (dset->prediction)
	{
	  fv = malloc(d*sizeof(float));
	  if (fv)
	    {
	      for (i = 0; i < dset->nv; i++)
		{
		  gmax = -FLT_MAX;
		  for (j = 0; j < dset->c; j++)
		    {
		      gx = fvec_dot(wmx[j], dset->x[i], d, (int *) 0);
		      gx += wmx[j][d];
		      /*
			Add quadratic term.
		      */
		      fvec_smx(dset->x[i], dset->sigma[j], d, fv);
		      gx += -0.5*fvec_dot(dset->x[i], fv, d, (int *) 0);
		      if (gx > gmax)
			{
			  dset->prediction[i] = j;
			  gmax = gx;
			}
		    }
		}
	      free(fv);
	    }
	  else
	    errc = errno;
	}
      else
	errc = errno;
    }
  else
    errc = EINVAL;
  return errc;
}

/*
  Create dataset which has a subset of features in 'dset'. The subset
  is defined by 'index'.

  In case of failure, return NULL and set errno.  
*/
struct dataset *dataset_select(struct dataset *dset, int *index, int d)
{
  int  i;
  int  j;
  int  idx;
  int  status;
  struct dataset *sub_dset;

  sub_dset = (struct dataset *) 0;
  if (dset && index && (d > 0))
    {
      sub_dset = calloc(1, sizeof(struct dataset));
      if (sub_dset)
	{
	  sub_dset->d = d;
	  sub_dset->c = dset->c;
	  sub_dset->nv = dset->nv;
	  sub_dset->nd = ivec_clone(dset->nd, dset->c);
	  if (sub_dset->nd)
	    {
	      sub_dset->x = malloc(dset->nv*sizeof(float *));
	      if (sub_dset->x)
		{
		  status = 0;
		  for (i = 0; (i < dset->nv) && !status; i++)
		    {
		      sub_dset->x[i] = fvec_subset(dset->x[i], dset->d, index, d);
		      if (!sub_dset->x[i])
			status = -1;
		    }
		  if (!status)
		    {
		      if (dset->alab)
			{
			  sub_dset->alab = calloc(d, sizeof(char *));
			  for (j = 0; j < d; j++)
			    {
			      idx = index[j];
			      sub_dset->alab[j] = strdup(dset->alab[idx]);
			    }
			  if (!sub_dset->alab)
			    status = -1;
			}
		      if (!status)
			{
			  if (sub_dset->alab0)
			    {
			      sub_dset->alab0 = strdup(dset->alab0);
			      if (!sub_dset->alab0)
				status = -1;
			    }
			  if (dset->xlab)
			    {
			      sub_dset->xlab = str_clone(dset->xlab, dset->nv);
			      if (!sub_dset->xlab)
				status = -1;
			    }
			}
		      else
			status = -1;
		    }
		  if (status == -1)
		    sub_dset = dataset_free(sub_dset);
		}
	    }
	}
    }
  return sub_dset;
}

/*
  In-place version of dataset_select() - replace vectors in 'dset'
  with the subset of `d' features whose indexed are given in 'index'.

  In case of success, return 0. In case of memory allocation failure,
  return -1 and set errno. 

  Note that bad arguments _are not_ considered an error. The caller
  has to check the arguments himself.
*/
int dataset_inset(struct dataset *dset, int *index, int d)
{
  int   i;
  int   j;
  int   status;
  char  **alab;
  float **x;

  status = 0;
  alab = (char **) 0;
  if (dset && index && (d > 0))
    {
      x = malloc(dset->nv*sizeof(float *));
      if (x)
	{
	  for (i = 0; (i < dset->nv) && !status; i++)
	    {
	      x[i] = fvec_subset(dset->x[i], dset->d, index, d);
	      if (!x[i])
		{
		  status = -1;
		  mx_free((void **) x, dset->nv);
		}
	      else
		{
		  if (dset->alab)
		    {
		      alab = calloc(d, sizeof(char *));
		      if (alab)
			{
			  for (j = 0; (j < d) && !status; j++)
			    {
			      alab[j] = strdup(dset->alab[index[j]]);
			      if (!alab[j])
				{
				  status = -1;
				  mx_free((void **) alab, j);
				}
			    }
			}
		      else
			status = -1;
		    }
		}
	    }
	  if (!status)
	    {
	      mx_free((void **) dset->x, dset->nv);
	      mx_free((void **) dset->alab, dset->d);
	      dset->x = x;
	      dset->d = d;
	      dset->alab = alab;
	    }
	}
    }
  else
    {
      status = -1;
      errno = EINVAL;
    }
  return status;
}



syntax highlighted by Code2HTML, v. 0.9.1