/*
  File name: dataset.h
  Created by: Ljubomir Buturovic
  Created: 03/10/2004
  Purpose: declaration of dataset structure and associated API.
*/

/*
  Copyright 2004 Ljubomir J. Buturovic

  Permission is hereby granted, free of charge, to any person obtaining a
  copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to permit
  persons to whom the Software is furnished to do so, subject to the
  following conditions:

  The above copyright notice and this permission notice shall be included
  in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
  NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef DATASET_H
#define DATASET_H

/* Needed for FILE, which appears in the prototype of resample(). */
#include <stdio.h>

/*
  Resampling modes.

  POPULATION_RESAMPLING selects bagging set from entire population, using
  uniform distribution with resampling without regard to the class
  memberships. CLASS_RESAMPLING maintains class proportions in the selected
  bagging set.

  Note that POPULATION_RESAMPLING can in theory generate an empty class
  (because, unlike CLASS_RESAMPLING, it has no built-in guarantee that each
  class will be represented in the bagging set).

  On 12/08/2002 decided to use POPULATION_RESAMPLING, based on the
  available literature.

  TBD: need to test POPULATION_RESAMPLING if it happens that an empty
  training set class is created.
*/
#define CLASS_RESAMPLING 1
#define POPULATION_RESAMPLING 2

/*
  Recognized dataset file formats.

  DATASET_FF_RAW      raw vectors
  DATASET_FF_VEC      first column is vector name
  DATASET_FF_COL      first row has column names
  DATASET_FF_COLVEC   first row has column names, first column is vector
                      name
  DATASET_FF_DEFAULT  the default format
  DATASET_FF_N        number of supported formats

  TBD:

  DATASET_FF_CLASS    first column is class name
  DATASET_FF_CV       first column is class name, second column is vector
                      name
*/
#define DATASET_FF_RAW 0
#define DATASET_FF_VEC 1
#define DATASET_FF_COL 2
#define DATASET_FF_COLVEC 3

/*
  Placeholders for the TBD formats listed above. They are compiled out;
  note that the values shown collide with DATASET_FF_VEC and
  DATASET_FF_COLVEC and would have to be renumbered before being enabled.
*/
#if 0
#define DATASET_FF_CLASS 1
#define DATASET_FF_CV 3
#endif

#define DATASET_FF_N 4
#define DATASET_FF_DEFAULT DATASET_FF_RAW

/*
  dataset_new() without the 'prediction' and 'ps' members.

  NOTE(review): presumably this means the returned dataset is built like
  the one from dataset_new() but with 'prediction' and 'ps' left
  unallocated - confirm against the implementation.
*/
struct dataset *dataset_lt(int d, int c, int *nd, int nv, char **fnames,
                           float **x);

/*
  Representation of a set of (optionally labeled) data vectors.
*/
struct dataset {
  int d;          /* dimension (length) of data vectors */
  int c;          /* number of categories (classes) */
  char **cnames;  /* optional class names */
  int *nd;        /* number of vectors per class */
  int nv;         /* total number of vectors (i.e., sum of elements in
                     'nd') */
  char **fnames;  /* data files */
  int format;     /* format of data in `fnames' (one of DATASET_FF_*) */
  int nfiles;     /* number of data files */
  float **x;      /* data vectors */
  char **alab;    /* optional labels (names) for attributes of `x' */
  char *alab0;    /* column 0 label for DATASET_FF_COLVEC format */
  char **xlab;    /* optional labels (names) for data vectors in `x' */

  /*
    Actual (truth) class labels for vectors in 'x'. The length of 'label'
    is 'nv'. In PCP, the range of values in 'label' is 0..c-1, plus
    PCP_UNASSIGNED, which indicates unassigned class label.

    NOTE: this field is currently not used universally. Most PCP functions
    presently assume that the first nd[0] vectors belong to class 0, the
    next nd[1] belong to class 1, etc.

    TBD: convert all functions to use 'label' instead.
  */
  int *label;

  /*
    Predicted class labels for vectors in `x'. The length of 'prediction'
    is 'nv' if classification has been performed. In PCP, the range of
    values in 'prediction' is 0..c-1, plus PCP_UNASSIGNED. The values
    represent class labels assigned as a result of a classification. The
    value PCP_UNASSIGNED indicates that the classification algorithm did
    not assign a class label to the vector.
  */
  int *prediction;

  /*
    nv by c matrix of class prediction strengths corresponding to
    'prediction'.

    Why is this double? Because LIBSVM function svm_predict_probability(),
    where it has first been used, wants a double. And anyway, all other
    float fields should be converted to double.
  */
  double **ps;

  /*
    Inverted class-covariance matrices in SSM (IMSL Symmetric Storage
    Mode).
  */
  float **sigma;

  /*
    Determinants of the class-covariance matrices in the dsign*exp(dexp)
    mode, where dsign, dexp for class i, 0 <= i <= c-1, are det[2*i],
    det[2*i+1], respectively.

    Frequently, log of the determinant is used in machine learning
    algorithms. For example, natural logarithm of a determinant of
    covariance matrix of the first class can be computed as:

      log(det[0])+det[1]

    Second class:

      log(det[2])+det[3]

    etc.
  */
  float *det;
};

/*
  Structure describing a subset of struct dataset.
*/
struct subset {
  int size;  /* number of elements in the subset */
  int *idx;  /* indices of elements in the subset */
};

/*
  Load contents of 'c' files, named in 'names', in format 'fmt', into a
  struct dataset. Return the struct.

  Return (struct dataset *) 0 in case of error, and set errc/fname. The
  possible values for errc are malloc() and file I/O error codes, or
  LERR_FILE_FORMAT for unrecognized input file format.
*/
struct dataset *load_dataset(int d, int c, int *nd, char **names, int fmt,
                             int *errc, char **fname);

/*
  Free 'dset'. Preserves the value of errno.

  NOTE(review): the meaning of the returned pointer is not documented
  here; presumably it is NULL so that callers can write
  dset = dataset_free(dset) - confirm against the implementation.
*/
struct dataset *dataset_free(struct dataset *dset);

/*
  Dataset resampling function, normally used for bagging algorithms.

  Samples 'bag_size' vectors from 'dset' and places them into 'bag', with
  'bnd' vectors per class. 'mdl' is the model index (used for logging
  only).

  NOTE(review): 'fdbg' and the return value are not described here;
  presumably 'fdbg' is the stream the log output goes to - confirm.
*/
int resample(int mdl, struct dataset *dset, int bag_size, float **bag,
             int *bnd, FILE *fdbg);

/*
  Return class index (0..c-1) for vector 'ivx' (0-based).
*/
int dataset_class(int ivx, int c, int *nd);

/*
  Extract dataset defined by 'xsubset' from 'dset'. If 'complement' is 1,
  extract complement of 'xsubset'.

  In case of error, return NULL and set errno. Errors are EINVAL and memory
  allocation errors.
*/
struct dataset *dataset_subset(struct dataset *dset, struct subset *xsubset,
                               int complement);

/*
  Partition each of the 'c' classes with 'nd[i]' vectors per class, into
  'nxval' disjoint subsets. The resulting partition is specified in 'sxc'
  and 'lxc' arrays.

  Local arrays 'tsxc' and 'tlxc' have list of vectors in each subset, and
  subset cardinalities, respectively. 'tsxc[i]' and 'tlxc[i]' are pointers
  to 'sx' and 'lx' arrays, respectively, for class 'i', as described in
  comments for xss(). The addresses of 'tsxc' and 'tlxc' are returned in
  '*sxc' and '*lxc'.

  The function may be used for cross-validation experiments, to return all
  subsets. Then the subsets may be analyzed one at a time.

  The function returns -1 and sets errno in case of malloc() failure, 0
  otherwise. If 'nxval' is not in a correct range for any class, set 'sxc'
  and 'lxc' to NULL.

  NOTE(review): the description above refers to 'sxc'/'lxc' output
  arguments and an int return code, neither of which exists in the
  prototype below, which takes a struct dataset and returns
  struct subset **. The comment appears to predate the current interface
  and should be rewritten against the implementation.
*/
struct subset **dataset_partition(struct dataset *dset, int nxval);

/*
  Compute covariance matrices and determinants for 'dset'. The matrices are
  stored in dset->sigma, in Symmetric Storage Mode, and the determinants
  are stored in dset->det.

  In case of success, return 0. In case of failure, return -1 and set
  'errc'.

  NOTE(review): the 'sigma' member of struct dataset is documented as
  holding *inverted* covariance matrices; confirm whether this function
  stores the inverses.
*/
int dataset_sigma(struct dataset *dset, int *errc);

/*
  Create a new dataset from the given dimension 'd', class count 'c',
  per-class vector counts 'nd', total vector count 'nv', file names
  'fnames' and data vectors 'x'.

  NOTE(review): this prototype carried no comment originally; the
  description is inferred from the argument names, which match the members
  of struct dataset. Ownership of the arguments and the error behaviour
  are not stated here - confirm against the implementation.
*/
struct dataset *dataset_new(int d, int c, int *nd, int nv, char **fnames,
                            float **x);

/*
  Return clone of 'dset'. In case of failure, return NULL and set errno.
  The possible errors are memory allocation errors.
*/
struct dataset *dataset_clone(struct dataset *dset);

/*
  Map 'dset' into 'd'-dimensional space using 'n' by dset->d matrix 'fmx'.
  'n' has to be >= d, but the function does not check that (since it
  doesn't know n).

  Return NULL and set errno in case of failure. Possible errors are memory
  allocation errors and EINVAL for bad input arguments.
*/
struct dataset *dataset_map(struct dataset *dset, int d, float **fmx);

/*
  In-place mapping of 'dset' into 'd'-dimensional space using 'n' by
  dset->d matrix 'fmx'. 'n' has to be >= d, but the function does not check
  that (since it doesn't know n).

  Return -1 in case of inconsistent input. Return -1 and set errno in case
  of failure.
*/
int dataset_mapx(struct dataset *dset, int d, float **fmx);

/*
  Write `dset->x' into `dset->fnames' in `dset->format'.

  Return -1 in case of error and set errno. In case of file error, copy the
  offending file name to 'xname'.
*/
int dataset_write(struct dataset *dset, char **xname);

/*
  Write `dset->x' into filenames prefix1.dmp, prefix2.dmp, etc., in
  `dset->format'.

  Return -1 in case of error and set errno. In case of file error, copy the
  offending file name to 'xname'.
*/
int dataset_dump(struct dataset *dset, char *prefix, char **xname);

/*
  Apply linear classifier 'wmx' to 'dset'. 'wmx' is assumed to be a dset->c
  by dset->d+1 matrix whose last column contains the bias term of the
  linear classifier.

  The predictions are stored in dset->prediction.

  Return 0 in case of success. In case of error, return the error code.
  Error codes are EINVAL for bad input arguments and malloc() error codes.
*/
int dataset_lin_predict(struct dataset *dset, float **wmx);

/*
  Apply parametric quadratic classifier defined by 'wmx' and 'sigma' to
  'dset'. 'wmx' is assumed to be a dset->c by dset->d+1 matrix whose last
  column contains the bias term. 'sigma' are inverted covariance matrices
  of the training data set in SSM. They define the quadratic term of the
  classifier.

  The predictions are stored in dset->prediction.

  Return 0 in case of success. In case of error, return the error code.
  Error codes are EINVAL for bad input arguments and malloc() error codes.
*/
int dataset_pqc_predict(struct dataset *dset, float **wmx, float **sigma);

/*
  Create dataset which has a subset of features in 'dset'. The subset is
  defined by 'index', which holds 'd' feature indices.

  In case of failure, return NULL and set errno.
*/
struct dataset *dataset_select(struct dataset *dset, int *index, int d);

/*
  In-place version of dataset_select() - replace vectors in 'dset' with the
  subset of `d' features whose indices are given in 'index'.

  In case of success, return 0. In case of memory allocation failure,
  return -1 and set errno. Note that bad arguments _are not_ considered an
  error. The caller has to check the arguments themselves.
*/
int dataset_inset(struct dataset *dset, int *index, int d);

#endif