/*
  File name: dataset.h
  Created by: Ljubomir Buturovic
  Created: 03/10/2004
  Purpose: declaration of dataset structure and associated API.
*/

/*
  Copyright 2004 Ljubomir J. Buturovic

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation files
  (the "Software"), to deal in the Software without restriction,
  including without limitation the rights to use, copy, modify, merge,
  publish, distribute, sublicense, and/or sell copies of the Software,
  and to permit persons to whom the Software is furnished to do so,
  subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
*/

#ifndef DATASET_H
#define DATASET_H

#include <stdio.h>

/*
  Resampling strategies used to select a bagging set.

  POPULATION_RESAMPLING selects the bagging set from the entire
  population, using uniform distribution with resampling, without
  regard to the class memberships. CLASS_RESAMPLING maintains class
  proportions in the selected bagging set. Note that
  POPULATION_RESAMPLING can in theory generate an empty class
  (because, unlike CLASS_RESAMPLING, it has no built-in guarantee that
  each class will be represented in the bagging set).

  On 12/08/2002 it was decided to use POPULATION_RESAMPLING, based on
  the available literature.

  TBD: need to test how POPULATION_RESAMPLING behaves when an empty
  training set class is created.
*/
#define CLASS_RESAMPLING       1
#define POPULATION_RESAMPLING  2

/*
  Recognized dataset file formats.

  DATASET_FF_RAW         raw vectors
  DATASET_FF_VEC         first column is vector name
  DATASET_FF_COL         first row has column names
  DATASET_FF_COLVEC      first row has column names, first column is vector name

  DATASET_FF_DEFAULT     the default format

  DATASET_FF_N           number of supported formats

  TBD (not yet supported, see the disabled definitions below):

  DATASET_FF_CLASS       first column is class name
  DATASET_FF_CV          first column is class name, second column is vector name

*/
#define DATASET_FF_RAW                 0
#define DATASET_FF_VEC                 1
#define DATASET_FF_COL                 2
#define DATASET_FF_COLVEC              3

/*
  Placeholders for the TBD formats; compiled out. Note that the values
  given here coincide with DATASET_FF_VEC (1) and DATASET_FF_COLVEC
  (3), so they cannot be enabled as written without making the formats
  indistinguishable.
*/
#if 0
#define DATASET_FF_CLASS               1
#define DATASET_FF_CV                  3
#endif

/* Count of the enabled formats above (RAW, VEC, COL, COLVEC). */
#define DATASET_FF_N                   4

#define DATASET_FF_DEFAULT             DATASET_FF_RAW

/*
  dataset_new() without the 'prediction' and 'ps' members. Takes the
  same argument list as dataset_new().

  NOTE(review): the return value on failure is not documented here;
  presumably NULL, as for the other constructors in this file -
  confirm against the implementation.
*/
struct dataset *dataset_lt(int d, int c, int *nd, int nv, char **fnames, float **x);

/*
  Representation of a set of (optionally labeled) data vectors.
*/
struct dataset
{
  int   d;         /* dimension (length) of data vectors */
  int   c;         /* number of categories (classes) */
  char  **cnames;  /* optional class names */
  int   *nd;       /* number of vectors per class */
  int   nv;        /* total number of vectors (i.e., sum of elements in 'nd') */
  char  **fnames;  /* data files */
  int   format;    /* format of data in 'fnames' (presumably a DATASET_FF_* value) */
  int   nfiles;    /* number of data files */
  float **x;       /* data vectors */
  char  **alab;    /* optional labels (names) for attributes of 'x' */
  char  *alab0;    /* column 0 label for DATASET_FF_COLVEC format */
  char  **xlab;    /* optional labels (names) for data vectors in 'x' */
  /*
    Actual (truth) class labels for vectors in 'x'. The length of
    'label' is 'nv'.

    In PCP, the range of values in 'label' is 0..c-1, plus
    PCP_UNASSIGNED, which indicates an unassigned class label.

    NOTE: this field is currently not used universally. Most PCP
    functions presently assume that the first nd[0] vectors belong to
    class 0, the next nd[1] belong to class 1, etc. TBD: convert all
    functions to use 'label' instead.
  */
  int   *label;
  /*
    Predicted class labels for vectors in 'x'. The length of
    'prediction' is 'nv' if classification has been performed.

    In PCP, the range of values in 'prediction' is 0..c-1, plus
    PCP_UNASSIGNED. The values represent class labels assigned as a
    result of a classification. The value PCP_UNASSIGNED indicates
    that the classification algorithm did not assign a class label to
    the vector.
  */
  int   *prediction;
  /*
    nv by c matrix of class prediction strengths corresponding to
    'prediction'.

    Why is this double? Because the LIBSVM function
    svm_predict_probability(), where it was first used, wants a
    double. And anyway, all other float fields should be converted to
    double.
  */
  double **ps;
  /*
     Inverted class-covariance matrices in SSM (IMSL Symmetric Storage
     Mode).
  */
  float **sigma; 
  /*
     Determinants of the class-covariance matrices in the
     dsign*exp(dexp) mode, where dsign, dexp for class i, 0 <= i <=
     c-1, are det[2*i], det[2*i+1], respectively. The array therefore
     holds 2*c elements.

     Frequently, the log of the determinant is used in machine
     learning algorithms. For example, the natural logarithm of the
     determinant of the covariance matrix of the first class can be
     computed as:

     log(det[0])+det[1]

     Second class:

     log(det[2])+det[3]

     etc.
  */
  float *det;
};

/*
  Structure describing a subset of struct dataset, as a list of
  element indices.
*/
struct subset 
{
  int size; /* number of elements in the subset (length of 'idx') */
  int *idx; /* indices of elements in the subset */
};

/*
  Load contents of 'c' files, named in 'names', in format 'fmt', into
  a struct dataset. Return the struct.

  NOTE(review): 'd' and 'nd' are not described here; presumably they
  are the vector dimension and the per-class vector counts, as in the
  like-named members of struct dataset - confirm. 'fmt' is presumably
  one of the DATASET_FF_* values.

  Return (struct dataset *) 0 in case of error, and set
  errc/fname. The possible values for errc are malloc() and file I/O
  error codes, or LERR_FILE_FORMAT for unrecognized input file format.
*/
struct dataset *load_dataset(int d, int c, int *nd, char **names, int fmt, 
			     int *errc, char **fname);

/*
  Free 'dset'. Preserves the value of errno.

  NOTE(review): the meaning of the returned pointer is not documented
  here; presumably it is NULL so that callers can write
  dset = dataset_free(dset) - confirm against the implementation.
*/
struct dataset *dataset_free(struct dataset *dset);

/*
  Dataset resampling function, normally used for bagging
  algorithms. Samples 'bag_size' vectors from 'dset' and places them
  into 'bag', with 'bnd' vectors per class. 'mdl' is the model index
  (used for logging only).

  NOTE(review): 'fdbg' and the return value are not described here.
  'fdbg' is presumably the stream the log output is written to -
  confirm against the implementation.
*/
int resample(int mdl, struct dataset *dset, int bag_size, float **bag, int *bnd, FILE *fdbg);

/*
  Return class index (0..c-1) for vector 'ivx' (0-based).

  'c' is the number of classes and 'nd' the number of vectors per
  class. NOTE(review): presumably this relies on the layout in which
  the first nd[0] vectors belong to class 0, the next nd[1] to class
  1, etc. The result for an out-of-range 'ivx' is not documented
  here - confirm against the implementation.
*/
int dataset_class(int ivx, int c, int *nd);

/*
  Extract the dataset defined by 'xsubset' from 'dset'. If
  'complement' is 1, extract the complement of 'xsubset' instead.

  In case of error, return NULL and set errno. Errors are EINVAL and
  memory allocation errors.
*/
struct dataset *dataset_subset(struct dataset *dset, struct subset *xsubset, int complement);

/*
  Partition 'dset' into 'nxval' disjoint subsets per class.

  The function may be used for cross-validation experiments, to return
  all subsets. Then the subsets may be analyzed one at a time.

  NOTE(review): the description below appears to document an earlier
  interface. It refers to arguments 'c', 'nd', 'sxc' and 'lxc' and to
  an int return value of -1/0, none of which match the current
  prototype, which takes only 'dset' and 'nxval' and returns
  struct subset **. The layout of the returned array and the way
  errors are reported need to be confirmed against the
  implementation. The original text is kept for reference:

    Partition each of the 'c' classes with 'nd[i]' vectors per class,
    into 'nxval' disjoint subsets. The resulting partition is
    specified in 'sxc' and 'lxc' arrays.

    Local arrays 'tsxc' and 'tlxc' have list of vectors in each
    subset, and subset cardinalities, respectively. 'tsxc[i]' and
    'tlxc[i]' are pointers to 'sx' and 'lx' arrays, respectively, for
    class 'i', as described in comments for xss(). The addresses of
    'tsxc' and 'tlxc' are returned in '*sxc' and '*lxc'.

    The function returns -1 and sets errno in case of malloc()
    failure, 0 otherwise. If 'nxval' is not in a correct range for any
    class, set 'sxc' and 'lxc' to NULL.
*/
struct subset **dataset_partition(struct dataset *dset, int nxval);

/*
  Compute covariance matrices and determinants for 'dset'. The
  matrices are stored in dset->sigma, in Symmetric Storage Mode, and
  the determinants are stored in dset->det.

  NOTE(review): the comment on the 'sigma' member of struct dataset
  describes it as holding the inverted class-covariance matrices.
  Confirm whether this function stores the matrices themselves or
  their inverses.

  In case of success, return 0. In case of failure, return -1 and set
  'errc'.
*/
int dataset_sigma(struct dataset *dset, int *errc);

/*
  Create a new struct dataset.

  NOTE(review): this prototype has no description in this header. The
  arguments presumably initialize the like-named members of struct
  dataset ('d', 'c', 'nd', 'nv', 'fnames', 'x'), and, judging by the
  comment on dataset_lt(), the 'prediction' and 'ps' members are set
  up as well. Ownership of the passed pointers and the return value
  on failure need to be confirmed against the implementation.
*/
struct dataset *dataset_new(int d, int c, int *nd, int nv, char **fnames, float **x);

/*
  Return a clone of 'dset'.

  In case of failure, return NULL and set errno. The possible errors
  are memory allocation errors.
*/
struct dataset *dataset_clone(struct dataset *dset);

/*
  Map 'dset' into 'd'-dimensional space using the 'n' by dset->d
  matrix 'fmx', and return the result as a dataset. 'n' has to be
  >= d, but the function does not check that (since it is not told
  n).

  Return NULL and set errno in case of failure. Possible errors are
  memory allocation errors and EINVAL for bad input arguments.
*/
struct dataset *dataset_map(struct dataset *dset, int d, float **fmx);

/*
  In-place mapping of 'dset' into 'd'-dimensional space using the 'n'
  by dset->d matrix 'fmx'. 'n' has to be >= d, but the function does
  not check that (since it is not told n).

  Return -1 in case of inconsistent input. Return -1 and set errno
  in case of failure.

  NOTE(review): the return value on success is not stated here;
  presumably 0 - confirm against the implementation.
*/
int dataset_mapx(struct dataset *dset, int d, float **fmx);

/*
  Write 'dset->x' into the files named in 'dset->fnames', in
  'dset->format'.

  Return -1 in case of error and set errno. In case of file error,
  copy the offending file name to 'xname'.

  NOTE(review): the return value on success and the ownership of the
  string stored through 'xname' are not stated here - confirm against
  the implementation.
*/
int dataset_write(struct dataset *dset, char **xname);

/*
  Write 'dset->x' into files named prefix1.dmp, prefix2.dmp, etc.,
  where 'prefix' is the given file name prefix, in 'dset->format'.

  Return -1 in case of error and set errno. In case of file error,
  copy the offending file name to 'xname'.

  NOTE(review): the return value on success and the ownership of the
  string stored through 'xname' are not stated here - confirm against
  the implementation.
*/
int dataset_dump(struct dataset *dset, char *prefix, char **xname);

/*
  Apply the linear classifier 'wmx' to 'dset'. 'wmx' is assumed to be
  a dset->c by dset->d+1 matrix whose last column contains the bias
  term of the linear classifier. The predictions are stored in
  dset->prediction.

  Return 0 in case of success. In case of error, return the error
  code. Error codes are EINVAL for bad input arguments and malloc()
  error codes.
*/
int dataset_lin_predict(struct dataset *dset, float **wmx);

/*
  Apply the parametric quadratic classifier defined by 'wmx' and
  'sigma' to 'dset'. 'wmx' is assumed to be a dset->c by dset->d+1
  matrix whose last column contains the bias term. 'sigma' are the
  inverted covariance matrices of the training data set, in SSM
  (Symmetric Storage Mode). They define the quadratic term of the
  classifier. The predictions are stored in dset->prediction.

  Return 0 in case of success. In case of error, return the error
  code. Error codes are EINVAL for bad input arguments and malloc()
  error codes.
*/
int dataset_pqc_predict(struct dataset *dset, float **wmx, float **sigma);

/*
  Create a dataset which has a subset of the features in 'dset'. The
  subset is defined by 'index'.

  NOTE(review): 'd' is not described here; presumably it is the
  number of entries in 'index', i.e. the number of selected features,
  as for dataset_inset() - confirm against the implementation.

  In case of failure, return NULL and set errno.
*/
struct dataset *dataset_select(struct dataset *dset, int *index, int d);

/*
  In-place version of dataset_select() - replace vectors in 'dset'
  with the subset of 'd' features whose indices are given in 'index'.

  In case of success, return 0. In case of memory allocation failure,
  return -1 and set errno.

  Note that bad arguments _are not_ considered an error. The caller
  is responsible for checking the arguments.
*/
int dataset_inset(struct dataset *dset, int *index, int d);

#endif