/* File name: dataset.c Created by: Ljubomir Buturovic Created: 03/10/2004 Purpose: API for struct dataset. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include "hash_util.h" #include "dataset.h" #include "lau.h" #include "lmat.h" static char rcsid[] = "$Id: dataset.c,v 1.37 2006/03/27 17:45:39 ljubomir Exp $"; /* dataset_new() without the 'prediction' and 'ps' members. */ struct dataset *dataset_lt(int d, int c, int *nd, int nv, char **fnames, float **x) { int i; int offset; int idx; struct dataset *dset; dset = calloc(1, sizeof(struct dataset)); if (dset) { dset->label = malloc(nv*sizeof(int)); if (dset->label) { dset->d = d; dset->c = c; dset->nd = nd; dset->nv = nv; if (fnames) dset->fnames = fnames; if (x) dset->x = x; /* Assign class labels. 
*/ idx = 0; offset = nd[0]; for (i = 0; i < nv; i++) { dset->label[i] = idx; if ((i == offset-1) && (i != nv-1)) { idx++; offset += nd[idx]; } } } } return dset; } /* Load contents of 'c' files, named in 'names', in format 'fmt', into a struct dataset. Return the struct. Return (struct dataset *) 0 in case of error, and set errc/fname. The possible values for errc are malloc() and file I/O error codes, or LERR_FILE_FORMAT for unrecognized input file format. */ struct dataset *load_dataset(int d, int c, int *nd, char **names, int fmt, int *errc, char **fname) { int i; int status; int offset; int j; int errno_save; char **alab; float **x; struct dataset *dset; *errc = 0; status = 0; dset = calloc(1, sizeof(struct dataset)); if (dset) { dset->c = c; dset->format = fmt; dset->fnames = string_copy(names, c); dset->d = d; dset->nd = ivec_clone(nd, c); if (dset->nd == (int *) 0) status = -1; if (dset->nd) { dset->nv = ivec_sum(dset->nd, dset->c); dset->x = malloc((dset->nv)*sizeof(float *)); if (dset->x) { if ((fmt == DATASET_FF_VEC) || (fmt == DATASET_FF_COLVEC)) { dset->xlab = calloc(dset->nv, sizeof(char *)); if (!dset->xlab) status = -1; } if (!status) { for (i = 0; (i < dset->nv) && (status == 0); i++) { dset->x[i] = malloc((dset->d)*sizeof(float)); if (!dset->x[i]) status = -1; } if (!status) { dset->label = malloc(dset->nv*sizeof(int)); if (dset->label) { offset = 0; x = dset->x; for (i = 0; (i < dset->c) && !status; i++) { if ((i == 0) && ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC))) { alab = calloc(dset->d, sizeof(char *)); if (!alab) status = -1; } else alab = (char **) 0; if (!status) { if (dset->xlab) status = load_file(dset->fnames[i], &(x[offset]), dset->format, &(dset->xlab[offset]), &dset->alab0, alab, dset->d, dset->nd[i], errc); else status = load_file(dset->fnames[i], &(x[offset]), dset->format, (char **) 0, &dset->alab0, alab, dset->d, dset->nd[i], errc); if (!status) { if ((i == 0) && ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC))) 
dset->alab = alab; for (j = offset; j < offset+nd[i]; j++) dset->label[j] = i; offset += dset->nd[i]; } else *fname = strdup(dset->fnames[i]); } } } else status = -1; } } } else status = -1; } else status = -1; } else status = -1; if (status == -1) { errno_save = errno; dset = dataset_free(dset); if (errc && !*errc) *errc = errno_save; } return dset; } /* Free 'dset'. Preserves the value of errno. */ struct dataset *dataset_free(struct dataset *dset) { int errno_save; errno_save = errno; if (dset) { mx_free((void **) dset->fnames, dset->c); mx_free((void **) dset->x, dset->nv); mx_free((void **) dset->xlab, dset->nv); mx_free((void **) dset->alab, dset->d); mx_free((void **) dset->sigma, dset->c); vx_free(dset->det); free(dset->prediction); free(dset->ps); free(dset->nd); free(dset->label); free(dset); } errno = errno_save; return (struct dataset *) 0; } /* Dataset resampling function, normally used for bagging algorithms. Samples 'bag_size' vectors from 'dset' and places them into 'bag', with 'bnd' vectors per class. 'mdl' is the model index (used for logging only). */ int resample(int mdl, struct dataset *dset, int bag_size, float **bag, int *bnd, FILE *fdbg) { int j; int cx; int offset; int cbx; int c_bag_size; int idx; int nvec; int resampling_mode = POPULATION_RESAMPLING; float cf; int *ibag; if (resampling_mode == CLASS_RESAMPLING) { ivec_set(bnd, dset->c, 0); offset = 0; cbx = 0; for (cx = 0; cx < dset->c; cx++) { cf = dset->nd[cx]; cf = cf/(dset->nv); c_bag_size = cf*bag_size; for (j = 0; j < c_bag_size; j++) { idx = rand_int(0, dset->nd[cx]-1); bag[cbx] = dset->x[idx+offset]; /* Use for testing. 
*/ /*bag[cbx] = dset->x[cbx];*/ cbx++; bnd[cx]++; if (fdbg) fprintf(fdbg, "bagging(); model %d; bag element %d is %d.\n", mdl, cbx-1, idx+offset); } offset += dset->nd[cx]; } nvec = ivec_sum(bnd, dset->c); } else if (resampling_mode == POPULATION_RESAMPLING) { ibag = malloc((bag_size+1)*sizeof(int)); ivec_set(bnd, dset->c, 0); for (cbx = 0; cbx < bag_size; cbx++) ibag[cbx] = rand_int(0, dset->nv-1); intsort(ibag, bag_size); if (fdbg) fprintf(fdbg, "bag:\t%d\t", mdl+1); for (cbx = 0; cbx < bag_size; cbx++) { idx = ibag[cbx]; bag[cbx] = dset->x[idx]; /* Use for testing. */ /*bag[cbx] = dset->x[cbx];*/ cx = dataset_class(idx, dset->c, dset->nd); bnd[cx]++; if (fdbg) { fprintf(fdbg, "%d\t", idx+1); /*fprintf(fdbg, "bagging(); model %d; bag element %d is %d.\n", mdl, cbx, idx);*/ } } if (fdbg) fprintf(fdbg, "\n"); free(ibag); nvec = ivec_sum(bnd, dset->c); } return nvec; } /* Return class index (0..c-1) for vector 'ivx' (0-based). */ int dataset_class(int ivx, int c, int *nd) { int i; int icx; int ncx; int done; icx = 0; ncx = nd[0]; done = 0; for (i = 1; (i < c) && (done == 0); i++) { if (ivx < ncx) { done = 1; } else { ncx += nd[i]; icx++; } } return icx; } /* Utility function for partitioning set of integers in [low..high] interval into 'nxval' disjoint subsets. The subsets are returned in 'sx' and 'lx' integer arrays. Local variable 'tsx' has elements of the subsets, local variable 'tlx[i]' has cardinality of 'i'-th subset. tlx[i] is either [range/nxval] or [range/nxval]+1. So, tsx[0..lx[0]-1] are elements of the first subset; tsx[lx[0]..lx[0]+lx[1]-1] are elements of the second subset, etc. Addresses of 'tsx' and 'tlx' are returned in '*sx' and '*lx'. The partitioning is performed using pseudo-random number generator rand(). Return -1 in case of malloc() error, 0 otherwise. If 'nxval' is greater than 'range', or less than 1, set 'sx' and 'lx' to NULL. NOTE: notation [low..high] designates set of integers between 'low' and 'high', including both 'low' and 'high'. 
*/
/*
  Ownership: on success the caller must free() both '*sx' and '*lx'.  On
  any failure both are set to NULL and nothing is left allocated.
*/
static int partition_range(int low, int high, int nxval, int **sx, int **lx)
{
  int j;
  int jx;
  int pick;
  int base;
  int nbig;
  int nsmall;
  int range;
  int remaining;
  int errno_save;
  int *xc;
  int *tsx;
  int *tlx;

  *sx = NULL;
  *lx = NULL;
  range = high-low+1;
  if ((nxval < 1) || (nxval > range))
    return -1;

  /* xc has a list of remaining integers from the 0..range-1.  Initially xc
     contains all integers in the range. */
  xc = malloc(range*sizeof(int));
  tsx = malloc(range*sizeof(int));
  tlx = malloc(nxval*sizeof(int));
  if (!xc || !tsx || !tlx) {
    errno_save = errno;
    free(xc);
    free(tsx);
    free(tlx);
    errno = errno_save;
    return -1;
  }
  for (j = 0; j < range; j++)
    xc[j] = j;

  /* Among the 'nxval' subsets, nsmall will have cardinality
     [range/nxval].  The remaining nbig = nxval-nsmall subsets will have
     one more element. */
  base = range/nxval;
  nbig = range-base*nxval;
  nsmall = nxval-nbig;
  for (j = 0; j < nsmall; j++)
    tlx[j] = base;
  for (j = nsmall; j < nxval; j++)
    tlx[j] = base+1;

  remaining = range;
  for (j = 0; j < range; j++) {
    /* Choose an integer, pseudo-randomly, and insert it into tsx; then
       close the gap in xc, keeping the order of the remaining values. */
    pick = rand_int(0, remaining-1);
    tsx[j] = xc[pick]+low;
    for (jx = pick; jx < remaining-1; jx++)
      xc[jx] = xc[jx+1];
    remaining--;
  }
  free(xc);

  *sx = tsx;
  *lx = tlx;
  return 0;
}

/*
  Partition each of the 'c' classes with 'nd[i]' vectors per class, into
  'nxval' disjoint subsets.  The resulting partition is specified in 'sxc'
  and 'lxc' arrays.

  Local arrays 'tsxc' and 'tlxc' have list of vectors in each subset, and
  subset cardinalities, respectively.  'tsxc[i]' and 'tlxc[i]' are pointers
  to 'sx' and 'lx' arrays, respectively, for class 'i', as described in
  comments for xss().  The addresses of 'tsxc' and 'tlxc' are returned in
  '*sxc' and '*lxc'.

  The function may be used for cross-validation experiments, to return all
  subsets.  Then the subsets may be analyzed one at a time.

  The function returns -1 and sets errno in case of malloc() failure, 0
  otherwise.
If 'nxval' is not in a correct range for any class, set 'sxc' and 'lxc' to NULL. */ struct subset **dataset_partition(struct dataset *dset, int nxval) { int i; int j; int k; int length; int cl; int sl; int done; int status; int ccc; int *sx; int *lx; struct subset **xset; status = 0; done = 0; xset = malloc(nxval*sizeof(struct subset *)); length = dset->nv/nxval+1; for (i = 0; i < nxval; i++) { xset[i] = calloc(1, sizeof(struct subset)); xset[i]->idx = malloc(length*sizeof(int)); } if (xset) { for (i = 0; (i < dset->c) && !done; i++) { ccc = ivec_sum(dset->nd, i); status = partition_range(ccc, ccc+dset->nd[i]-1, nxval, &sx, &lx); if (!status && sx && lx) { sl = 0; for (j = 0; j < nxval; j++) { length = lx[j]; cl = xset[j]->size; for (k = 0; k < length; k++) xset[j]->idx[k+cl] = sx[sl++]; xset[j]->size += length; } free(lx); free(sx); } else { for (j = 0; j < nxval; j++) free(xset[j]); xset = (struct subset **) 0; done = 1; } } } return xset; } /* Integer comparison function for qsort(). */ static int compare_integers(const void *int1, const void *int2) { int retval = -1; int *i1; int *i2; i1 = (int *) int1; i2 = (int *) int2; if (*i1 > (int) *i2) retval = 1; else if (*i1 == *i2) retval = 0; return retval; } /* Extract dataset defined by 'xset' from 'dset'. If 'complement' is 1, extract complement of 'xsubset'. The function returns the subset. If 'dset' or 'xset' are NULL, return NULL and set errno to 0. In case of memory allocation error, return NULL and set errno to the corresponding system function errno value. */ struct dataset *dataset_subset(struct dataset *dset, struct subset *xset, int complement) { int i; int icl; int index; int *ind; int *isx; struct dataset *sset; sset = (struct dataset *) 0; if (dset && xset) { sset = calloc(1, sizeof(struct dataset)); if (sset) { sset->d = dset->d; sset->c = dset->c; if (complement == 1) sset->nv = dset->nv-xset->size; else sset->nv = xset->size; /* Sort input vectors indices. 
*/ if (complement == 1) { ind = malloc(dset->nv*sizeof(int)); for (i = 0; i < dset->nv; i++) ind[i] = i; isx = ivec_diff(ind, dset->nv, xset->idx, xset->size, &icl); free(ind); } else isx = ivec_clone(xset->idx, xset->size); qsort(isx, xset->size, sizeof(int), compare_integers); /* Copy vectors. */ sset->x = calloc(sset->nv, sizeof(float *)); if (sset->x) { for (i = 0; i < sset->nv; i++) { index = isx[i]; sset->x[i] = fvec_clone(dset->x[index], dset->d); } if (dset->xlab) { sset->xlab = calloc(sset->nv, sizeof(char *)); if (sset->xlab) { for (i = 0; i < sset->nv; i++) { index = isx[i]; sset->xlab[i] = strdup(dset->xlab[index]); } } else sset = (struct dataset *) vx_free(sset); /* TBD: need to free() the whole struct */ } } else sset = (struct dataset *) vx_free(sset); if (sset) { /* Compute class cardinalities. */ ind = calloc(dset->c, sizeof(int)); for (i = 0; i < sset->nv; i++) { icl = dataset_class(isx[i], dset->c, dset->nd); ind[icl]++; } free(isx); sset->nd = ind; } } } else errno = 0; return sset; } /* Compute covariance matrices and determinants for 'dset'. The matrices are stored in dset->sigma, in Symmetric Storage Mode, and the determinants are stored in dset->det. In case of success, return 0. In case of failure, return -1 and set 'errc'. 
*/ int dataset_sigma(struct dataset *dset, int *errc) { int status; int j; int idx; int offset; float dsign; float dexp; float *det; float **sigma; float **cov; float **inverse; status = 0; *errc = 0; if (!dset->sigma) { sigma = calloc(dset->c, sizeof(float *)); if (sigma) { det = malloc(2*dset->c*sizeof(float)); if (det) { offset = 0; idx = 0; for (j = 0; (j < dset->c) && !status; j++) { cov = cest(&dset->x[offset], dset->d, dset->nd[j], COVARIANCE_MATRIX); if (cov) { inverse = fmx_inv(cov, dset->d, &dsign, &dexp, errc); mx_free((void **) cov, dset->d); if (inverse) { sigma[j] = fmx_ssm(inverse, dset->d); if (sigma[j]) { offset += dset->nd[j]; det[idx] = dsign; idx++; det[idx] = dexp; idx++; } else { status = -1; vx_free(det); mx_free((void **) sigma, j); } mx_free((void **) inverse, dset->d); } else { status = -1; vx_free(det); mx_free((void **) sigma, dset->c); } } else { status = -1; *errc = errno; vx_free(det); mx_free((void **) sigma, dset->c); } } if (!status) { dset->sigma = sigma; dset->det = det; } } else { status = -1; *errc = errno; } } else { status = -1; *errc = errno; } } return status; } struct dataset *dataset_new(int d, int c, int *nd, int nv, char **fnames, float **x) { struct dataset *dset; dset = calloc(1, sizeof(struct dataset)); if (dset != (struct dataset *) 0) { dset->d = d; dset->c = c; dset->nd = nd; dset->nv = nv; dset->fnames = fnames; dset->prediction = calloc(nv, sizeof(float)); if (dset->prediction) dset->x = x; else dset = dataset_free(dset); } return dset; } /* Return clone of 'dset'. In case of failure, return NULL and set errno. The possible errors are memory allocation errors. 
*/
/*
  Every member of 'dset' that is set is duplicated, so the clone can be
  released with dataset_free() independently of the original.  A NULL
  'dset' yields NULL (errno is not modified in that case).
*/
struct dataset *dataset_clone(struct dataset *dset)
{
  int i;
  int len;
  struct dataset *clone;

  if (!dset)
    return NULL;

  clone = calloc(1, sizeof(struct dataset));
  if (!clone)
    return NULL;

  clone->d = dset->d;
  clone->c = dset->c;
  clone->nv = dset->nv;
  clone->format = dset->format;

  clone->nd = ivec_clone(dset->nd, dset->c);
  if (!clone->nd)
    goto fail;

  /* All pointer arrays below are calloc()'ed so that dataset_free() sees
     NULL in the slots that were not reached when a copy fails midway. */
  if (dset->fnames) {
    clone->fnames = calloc(dset->c, sizeof(char *));
    if (!clone->fnames)
      goto fail;
    for (i = 0; i < dset->c; i++) {
      clone->fnames[i] = strdup(dset->fnames[i]);
      if (!clone->fnames[i])
        goto fail;
    }
  }

  if (dset->x) {
    clone->x = fmx_alloc(dset->nv, dset->d);
    if (!clone->x)
      goto fail;
    for (i = 0; i < dset->nv; i++)
      fvec_copy(clone->x[i], dset->x[i], dset->d);
  }

  if (dset->xlab) {
    clone->xlab = calloc(dset->nv, sizeof(char *));
    if (!clone->xlab)
      goto fail;
    for (i = 0; i < dset->nv; i++) {
      if (dset->xlab[i]) {   /* a label may be missing */
        clone->xlab[i] = strdup(dset->xlab[i]);
        if (!clone->xlab[i])
          goto fail;
      }
    }
  }

  if (dset->alab) {
    clone->alab = calloc(dset->d, sizeof(char *));
    if (!clone->alab)
      goto fail;
    for (i = 0; i < dset->d; i++) {
      if (dset->alab[i]) {
        clone->alab[i] = strdup(dset->alab[i]);
        if (!clone->alab[i])
          goto fail;
      }
    }
  }

  if (dset->alab0) {
    clone->alab0 = strdup(dset->alab0);
    if (!clone->alab0)
      goto fail;
  }

  if (dset->label) {
    clone->label = ivec_clone(dset->label, dset->nv);
    if (!clone->label)
      goto fail;
  }

  if (dset->prediction) {
    clone->prediction = ivec_clone(dset->prediction, dset->nv);
    if (!clone->prediction)
      goto fail;
  }

  if (dset->ps) {
    clone->ps = dmx_clone(dset->ps, dset->nv, dset->d);
    if (!clone->ps)
      goto fail;
  }

  if (dset->sigma) {
    clone->sigma = calloc(dset->c, sizeof(float *));
    if (!clone->sigma)
      goto fail;
    /* Symmetric Storage Mode: d*(d+1)/2 elements per matrix. */
    len = dset->d*(dset->d+1)/2;
    for (i = 0; i < dset->c; i++) {
      if (dset->sigma[i]) {
        clone->sigma[i] = fvec_clone(dset->sigma[i], len);
        if (!clone->sigma[i])
          goto fail;
      }
    }
  }

  if (dset->det) {
    /* 'det' holds two floats per class (see dataset_sigma()), i.e. 2*c
       elements, not 2*d. */
    clone->det = fvec_clone(dset->det, 2*dset->c);
    if (!clone->det)
      goto fail;
  }

  return clone;

fail:
  /* dataset_free() preserves errno and returns NULL. */
  return dataset_free(clone);
}

/* Map 'dset' into
'd'-dimensional space using 'n' by dset->d matrix 'fmx'. 'n' has to be >= d, but the function does not check that (since it doesn't know n). Return NULL and set errno in case of failure. Possible errors are memory allocation errors and EINVAL for bad input arguments. */ struct dataset *dataset_map(struct dataset *dset, int d, float **fmx) { struct dataset *mapped_dset; float **x; mapped_dset = (struct dataset *) 0; if (dset && fmx) { x = fmx_mult(dset->x, dset->nv, dset->d, fmx, d, 1); if (x) { mapped_dset = calloc(1, sizeof(struct dataset)); if (mapped_dset) { mapped_dset->x = x; mapped_dset->nd = ivec_clone(dset->nd, dset->c); if (mapped_dset->nd) { mapped_dset->label = ivec_clone(dset->label, dset->nv); if (dset->label && !mapped_dset->label) mapped_dset = dataset_free(mapped_dset); else { mapped_dset->nv = dset->nv; mapped_dset->d = d; mapped_dset->c = dset->c; } } else mapped_dset = dataset_free(mapped_dset); } } } else errno = EINVAL; return mapped_dset; } /* In-place mapping of 'dset' into 'd'-dimensional space using 'n' by dset->d matrix 'fmx'. 'n' has to be >= d, but the function does not check that (since it doesn't know n). Return -1 in case of inconsistent input. Return -1 and set errno in case of failure. */ int dataset_mapx(struct dataset *dset, int d, float **fmx) { int status; float **x; status = 0; x = fmx_mult(dset->x, dset->nv, dset->d, fmx, d, 1); if (x) { mx_free((void **) dset->x, dset->nv); dset->x = x; dset->d = d; } else status = -1; return status; } /* Save 'matrix' in 'fname'. Store `alab[i]' as column names in the first row. Store 'xlab[i]' in the first column of the i-th row. If `alab0' is not NULL, it is stored as first column in the first row. Return -1 in case of error and set errno. ljb, 08/10/2005: allow zero rows and/or columns. It is conceivable that we may have an empty test class. 
*/ static int dataset_save(float **matrix, char *alab0, char **alab, char **xlab, int rows, int columns, char *fname) { int status = 0; int i; FILE *fptr; fptr = fopen(fname, "w"); if (fptr) { if (alab0) fprintf(fptr, "%s\t", alab0); if (alab) { for (i = 0; i < columns-1; i++) fprintf(fptr, "%s\t", alab[i]); if (columns > 1) fprintf(fptr, "%s", alab[columns-1]); fprintf(fptr, "\n"); } status = fmx_nwrite(fptr, matrix, xlab, rows, columns); if (!status) status = fclose(fptr); } else status = -1; return status; } /* Write `dset->x' into `dset->fnames' in `dset->format'. Return -1 in case of error and set errno. In case of file error, copy the offending file name to 'xname'. */ int dataset_write(struct dataset *dset, char **xname) { int i; int offset; int status; char **xlab; status = 0; if (dset && dset->x && dset->nd) { offset = 0; for (i = 0; (i < dset->c) && !status; i++) { if (dset->xlab) xlab = &dset->xlab[offset]; else xlab = (char **) 0; status = dataset_save(&dset->x[offset], dset->alab0, dset->alab, xlab, dset->nd[i], dset->d, dset->fnames[i]); if ((status == -1) && xname) *xname = strdup(dset->fnames[i]); offset += dset->nd[i]; } } return status; } /* Write `dset->x' into filenames prefix1.dmp, prefix2.dmp, etc., in `dset->format'. Return -1 in case of error and set errno. In case of file error, copy the offending file name to 'xname'. */ int dataset_dump(struct dataset *dset, char *prefix, char **xname) { int i; int len; int status; char *name; char **names; char **dmp_names; names = dset->fnames; dmp_names = calloc(dset->c+1, sizeof(char *)); len = strlen(prefix)+20; for (i = 0; i < dset->c; i++) { name = malloc((len+1)*sizeof(char)); sprintf(name, "%s%d.dmp", prefix, i+1); dmp_names[i] = name; } dset->fnames = dmp_names; status = dataset_write(dset, xname); str_free(dmp_names); dset->fnames = names; return status; } /* Apply linear classifier 'wmx' to 'dset'. 
'wmx' is assumed to be a dset->c by dset->d+1 matrix whose last column contains the bias term of the linear classifier. The predictions are stored in dset->prediction. Return 0 in case of success. In case of error, return the error code. Error codes are EINVAL for bad input arguments and malloc() error codes. */ int dataset_lin_predict(struct dataset *dset, float **wmx) { int i; int j; int errc; double gmax; double gx; errc = 0; if (dset && wmx) { if (!dset->prediction) dset->prediction = malloc(dset->nv*sizeof(int)); if (dset->prediction) { for (i = 0; i < dset->nv; i++) { gmax = -FLT_MAX; for (j = 0; j < dset->c; j++) { gx = fvec_dot(wmx[j], dset->x[i], dset->d, (int *) 0); gx += wmx[j][dset->d]; if (gx > gmax) { dset->prediction[i] = j; gmax = gx; } } } } else errc = errno; } else errc = EINVAL; return errc; } /* Apply parametric quadratic classifier defined by 'wmx' and 'sigma' to 'dset'. 'wmx' is assumed to be a dset->c by dset->d+1 matrix whose last column contains the bias term. 'sigma' are inverted covariance matrices of the training data set in SSM. They define the quadratic term of the classifier. The predictions are stored in dset->prediction. Return 0 in case of success. In case of error, return the error code. Error codes are EINVAL for bad input arguments and malloc() error codes. */ int dataset_pqc_predict(struct dataset *dset, float **wmx, float **sigma) { int i; int j; int d; int errc; double gmax; double gx; float *fv; errc = 0; d = dset->d; if (dset && wmx) { if (!dset->prediction) dset->prediction = malloc(dset->nv*sizeof(int)); if (dset->prediction) { fv = malloc(d*sizeof(float)); if (fv) { for (i = 0; i < dset->nv; i++) { gmax = -FLT_MAX; for (j = 0; j < dset->c; j++) { gx = fvec_dot(wmx[j], dset->x[i], d, (int *) 0); gx += wmx[j][d]; /* Add quadratic term. 
*/ fvec_smx(dset->x[i], dset->sigma[j], d, fv); gx += -0.5*fvec_dot(dset->x[i], fv, d, (int *) 0); if (gx > gmax) { dset->prediction[i] = j; gmax = gx; } } } free(fv); } else errc = errno; } else errc = errno; } else errc = EINVAL; return errc; } /* Create dataset which has a subset of features in 'dset'. The subset is defined by 'index'. In case of failure, return NULL and set errno. */ struct dataset *dataset_select(struct dataset *dset, int *index, int d) { int i; int j; int idx; int status; struct dataset *sub_dset; sub_dset = (struct dataset *) 0; if (dset && index && (d > 0)) { sub_dset = calloc(1, sizeof(struct dataset)); if (sub_dset) { sub_dset->d = d; sub_dset->c = dset->c; sub_dset->nv = dset->nv; sub_dset->nd = ivec_clone(dset->nd, dset->c); if (sub_dset->nd) { sub_dset->x = malloc(dset->nv*sizeof(float *)); if (sub_dset->x) { status = 0; for (i = 0; (i < dset->nv) && !status; i++) { sub_dset->x[i] = fvec_subset(dset->x[i], dset->d, index, d); if (!sub_dset->x[i]) status = -1; } if (!status) { if (dset->alab) { sub_dset->alab = calloc(d, sizeof(char *)); for (j = 0; j < d; j++) { idx = index[j]; sub_dset->alab[j] = strdup(dset->alab[idx]); } if (!sub_dset->alab) status = -1; } if (!status) { if (sub_dset->alab0) { sub_dset->alab0 = strdup(dset->alab0); if (!sub_dset->alab0) status = -1; } if (dset->xlab) { sub_dset->xlab = str_clone(dset->xlab, dset->nv); if (!sub_dset->xlab) status = -1; } } else status = -1; } if (status == -1) sub_dset = dataset_free(sub_dset); } } } } return sub_dset; } /* In-place version of dataset_select() - replace vectors in 'dset' with the subset of `d' features whose indexed are given in 'index'. In case of success, return 0. In case of memory allocation failure, return -1 and set errno. Note that bad arguments _are not_ considered an error. The caller has to check the arguments himself. 
*/
/*
  The new vectors and attribute labels are built completely before 'dset'
  is touched, so a failed call leaves 'dset' unchanged and nothing
  allocated.  Invalid arguments (NULL 'dset' or 'index', d <= 0) yield -1
  with errno set to EINVAL.
*/
int dataset_inset(struct dataset *dset, int *index, int d)
{
  int i;
  int j;
  int errno_save;
  char **alab;
  float **x;

  if (!dset || !index || (d <= 0)) {
    errno = EINVAL;
    return -1;
  }

  alab = (char **) 0;
  /* calloc() so that the rows not reached are NULL if we bail out; ask
     for at least one slot so an empty dataset is not reported as an
     allocation failure. */
  x = calloc(dset->nv > 0 ? dset->nv : 1, sizeof(float *));
  if (!x)
    return -1;

  for (i = 0; i < dset->nv; i++) {
    x[i] = fvec_subset(dset->x[i], dset->d, index, d);
    if (!x[i])
      goto fail;
  }

  /* Attribute labels are selected once, not once per vector. */
  if (dset->alab) {
    alab = calloc(d, sizeof(char *));
    if (!alab)
      goto fail;
    for (j = 0; j < d; j++) {
      alab[j] = strdup(dset->alab[index[j]]);
      if (!alab[j])
        goto fail;
    }
  }

  /* Everything is in place: swap the new data in. */
  mx_free((void **) dset->x, dset->nv);
  mx_free((void **) dset->alab, dset->d);
  dset->x = x;
  dset->d = d;
  dset->alab = alab;

  return 0;

fail:
  errno_save = errno;
  mx_free((void **) x, dset->nv);
  if (alab)
    mx_free((void **) alab, d);
  errno = errno_save;
  return -1;
}