/*
File name: dataset.c
Created by: Ljubomir Buturovic
Created: 03/10/2004
Purpose: API for struct dataset.
*/
/*
Copyright 2004 Ljubomir J. Buturovic
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <float.h>
#include "hash_util.h"
#include "dataset.h"
#include "lau.h"
#include "lmat.h"
static char rcsid[] = "$Id: dataset.c,v 1.37 2006/03/27 17:45:39 ljubomir Exp $";
/*
dataset_new() without the 'prediction' and 'ps' members.
*/
struct dataset *dataset_lt(int d, int c, int *nd, int nv, char **fnames, float **x)
{
int i;
int offset;
int idx;
struct dataset *dset;
dset = calloc(1, sizeof(struct dataset));
if (dset)
{
dset->label = malloc(nv*sizeof(int));
if (dset->label)
{
dset->d = d;
dset->c = c;
dset->nd = nd;
dset->nv = nv;
if (fnames)
dset->fnames = fnames;
if (x)
dset->x = x;
/*
Assign class labels.
*/
idx = 0;
offset = nd[0];
for (i = 0; i < nv; i++)
{
dset->label[i] = idx;
if ((i == offset-1) && (i != nv-1))
{
idx++;
offset += nd[idx];
}
}
}
}
return dset;
}
/*
Load contents of 'c' files, named in 'names', in format 'fmt', into
a struct dataset. Return the struct.
Return (struct dataset *) 0 in case of error, and set
errc/fname. The possible values for errc are malloc() and file I/O
error codes, or LERR_FILE_FORMAT for unrecognized input file format.
*/
struct dataset *load_dataset(int d, int c, int *nd, char **names, int fmt,
int *errc, char **fname)
{
int i;
int status;
int offset;
int j;
int errno_save;
char **alab;
float **x;
struct dataset *dset;
*errc = 0;
status = 0;
dset = calloc(1, sizeof(struct dataset));
if (dset)
{
dset->c = c;
dset->format = fmt;
dset->fnames = string_copy(names, c);
dset->d = d;
dset->nd = ivec_clone(nd, c);
if (dset->nd == (int *) 0)
status = -1;
if (dset->nd)
{
dset->nv = ivec_sum(dset->nd, dset->c);
dset->x = malloc((dset->nv)*sizeof(float *));
if (dset->x)
{
if ((fmt == DATASET_FF_VEC) || (fmt == DATASET_FF_COLVEC))
{
dset->xlab = calloc(dset->nv, sizeof(char *));
if (!dset->xlab)
status = -1;
}
if (!status)
{
for (i = 0; (i < dset->nv) && (status == 0); i++)
{
dset->x[i] = malloc((dset->d)*sizeof(float));
if (!dset->x[i])
status = -1;
}
if (!status)
{
dset->label = malloc(dset->nv*sizeof(int));
if (dset->label)
{
offset = 0;
x = dset->x;
for (i = 0; (i < dset->c) && !status; i++)
{
if ((i == 0) && ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC)))
{
alab = calloc(dset->d, sizeof(char *));
if (!alab)
status = -1;
}
else
alab = (char **) 0;
if (!status)
{
if (dset->xlab)
status = load_file(dset->fnames[i], &(x[offset]),
dset->format, &(dset->xlab[offset]), &dset->alab0,
alab, dset->d, dset->nd[i], errc);
else
status = load_file(dset->fnames[i], &(x[offset]),
dset->format, (char **) 0, &dset->alab0,
alab, dset->d, dset->nd[i], errc);
if (!status)
{
if ((i == 0) && ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC)))
dset->alab = alab;
for (j = offset; j < offset+nd[i]; j++)
dset->label[j] = i;
offset += dset->nd[i];
}
else
*fname = strdup(dset->fnames[i]);
}
}
}
else
status = -1;
}
}
}
else
status = -1;
}
else
status = -1;
}
else
status = -1;
if (status == -1)
{
errno_save = errno;
dset = dataset_free(dset);
if (errc && !*errc)
*errc = errno_save;
}
return dset;
}
/*
Free 'dset'. Preserves the value of errno.
*/
struct dataset *dataset_free(struct dataset *dset)
{
int errno_save;
errno_save = errno;
if (dset)
{
mx_free((void **) dset->fnames, dset->c);
mx_free((void **) dset->x, dset->nv);
mx_free((void **) dset->xlab, dset->nv);
mx_free((void **) dset->alab, dset->d);
mx_free((void **) dset->sigma, dset->c);
vx_free(dset->det);
free(dset->prediction);
free(dset->ps);
free(dset->nd);
free(dset->label);
free(dset);
}
errno = errno_save;
return (struct dataset *) 0;
}
/*
Dataset resampling function, normally used for bagging
algorithms. Samples 'bag_size' vectors from 'dset' and places them
into 'bag', with 'bnd' vectors per class. 'mdl' is the model index
(used for logging only).
*/
int resample(int mdl, struct dataset *dset, int bag_size, float **bag, int *bnd, FILE *fdbg)
{
int j;
int cx;
int offset;
int cbx;
int c_bag_size;
int idx;
int nvec;
int resampling_mode = POPULATION_RESAMPLING;
float cf;
int *ibag;
if (resampling_mode == CLASS_RESAMPLING)
{
ivec_set(bnd, dset->c, 0);
offset = 0;
cbx = 0;
for (cx = 0; cx < dset->c; cx++)
{
cf = dset->nd[cx];
cf = cf/(dset->nv);
c_bag_size = cf*bag_size;
for (j = 0; j < c_bag_size; j++)
{
idx = rand_int(0, dset->nd[cx]-1);
bag[cbx] = dset->x[idx+offset];
/* Use for testing. */
/*bag[cbx] = dset->x[cbx];*/
cbx++;
bnd[cx]++;
if (fdbg)
fprintf(fdbg, "bagging(); model %d; bag element %d is %d.\n",
mdl, cbx-1, idx+offset);
}
offset += dset->nd[cx];
}
nvec = ivec_sum(bnd, dset->c);
}
else if (resampling_mode == POPULATION_RESAMPLING)
{
ibag = malloc((bag_size+1)*sizeof(int));
ivec_set(bnd, dset->c, 0);
for (cbx = 0; cbx < bag_size; cbx++)
ibag[cbx] = rand_int(0, dset->nv-1);
intsort(ibag, bag_size);
if (fdbg)
fprintf(fdbg, "bag:\t%d\t", mdl+1);
for (cbx = 0; cbx < bag_size; cbx++)
{
idx = ibag[cbx];
bag[cbx] = dset->x[idx];
/* Use for testing. */
/*bag[cbx] = dset->x[cbx];*/
cx = dataset_class(idx, dset->c, dset->nd);
bnd[cx]++;
if (fdbg)
{
fprintf(fdbg, "%d\t", idx+1);
/*fprintf(fdbg, "bagging(); model %d; bag element %d is %d.\n",
mdl, cbx, idx);*/
}
}
if (fdbg)
fprintf(fdbg, "\n");
free(ibag);
nvec = ivec_sum(bnd, dset->c);
}
return nvec;
}
/*
Return class index (0..c-1) for vector 'ivx' (0-based).
*/
int dataset_class(int ivx, int c, int *nd)
{
int i;
int icx;
int ncx;
int done;
icx = 0;
ncx = nd[0];
done = 0;
for (i = 1; (i < c) && (done == 0); i++)
{
if (ivx < ncx)
{
done = 1;
}
else
{
ncx += nd[i];
icx++;
}
}
return icx;
}
/*
Utility function for partitioning set of integers in [low..high]
interval into 'nxval' disjoint subsets. The subsets are returned in
'sx' and 'lx' integer arrays. Local variable 'tsx' has elements of
the subsets, local variable 'tlx[i]' has cardinality of 'i'-th
subset. tlx[i] is either [range/nxval] or [range/nxval]+1. So,
tsx[0..lx[0]-1] are elements of the first subset;
tsx[lx[0]..lx[0]+lx[1]-1] are elements of the second subset,
etc. Addresses of 'tsx' and 'tlx' are returned in '*sx' and '*lx'.
The partitioning is performed using pseudo-random number generator
rand().
Return -1 in case of malloc() error, 0 otherwise. If 'nxval' is
greater than 'range', or less than 1, set 'sx' and 'lx' to NULL.
NOTE: notation [low..high] designates set of integers between 'low'
and 'high', including both 'low' and 'high'.
*/
static int partition_range(int low, int high, int nxval, int **sx, int **lx)
{
int status;
int range;
int i;
int j;
int ivx;
int jx;
int tcx;
int idx;
int lsmall;
int lbig;
int *tsx;
int *tlx;
int *xc;
float rcc;
tsx = (int *) 0;
tlx = (int *) 0;
status = 0;
range = high-low+1;
if ((nxval >= 1) && (nxval <= range))
{
rcc = range/nxval;
idx = 0;
/*
xc has a list of remaining integers from the 0..range-1.
Initially xc contains all integers in the range.
*/
xc = malloc(range*sizeof(int));
for (j = 0; j < range; j++)
xc[j] = j;
tsx = malloc(range*sizeof(int));
tlx = malloc(nxval*sizeof(int));
if ((xc == (int *) 0) || (tsx == (int *) 0) || (tlx == (int *) 0))
status = -1;
else
{
/*
Among the 'nxval' subsets, lsmall will have cardinality
[range/nxval]. The remaining lbig = nxval-lsmall subsets
will have one more element.
*/
lbig = rcc;
lbig = range-lbig*nxval;
lsmall = nxval-lbig;
for (j = 0; j < lsmall; j++)
tlx[j] = rcc;
for (j = lsmall; j < nxval; j++)
tlx[j] = rcc+1;
tcx = range;
i = 0;
for (j = 0; j < range; j++)
{
/*
Choose an integer, pseudo-randomly, and insert it into
tsx.
*/
ivx = rand_int(0, tcx-1);
tsx[i] = xc[ivx]+low;
i++;
for (jx = ivx; jx < tcx-1; jx++)
xc[jx] = xc[jx+1];
tcx--;
}
free(xc);
}
}
else
status = -1;
*sx = tsx;
*lx = tlx;
return status;
}
/*
Partition each of the 'c' classes with 'nd[i]' vectors per class,
into 'nxval' disjoint subsets. The resulting partition is specified
in 'sxc' and 'lxc' arrays.
Local arrays 'tsxc' and 'tlxc' have list of vectors in each subset,
and subset cardinalities, respectively. 'tsxc[i]' and 'tlxc[i]' are
pointers to 'sx' and 'lx' arrays, respectively, for class 'i', as
described in comments for xss(). The addresses of 'tsxc' and 'tlxc'
are returned in '*sxc' and '*lxc'.
The function may be used for cross-validation experiments, to return
all subsets. Then the subsets may be analyzed one at a time.
The function returns -1 and sets errno in case of malloc() failure,
0 otherwise. If 'nxval' is not in a correct range for any class, set
'sxc' and 'lxc' to NULL.
*/
struct subset **dataset_partition(struct dataset *dset, int nxval)
{
int i;
int j;
int k;
int length;
int cl;
int sl;
int done;
int status;
int ccc;
int *sx;
int *lx;
struct subset **xset;
status = 0;
done = 0;
xset = malloc(nxval*sizeof(struct subset *));
length = dset->nv/nxval+1;
for (i = 0; i < nxval; i++)
{
xset[i] = calloc(1, sizeof(struct subset));
xset[i]->idx = malloc(length*sizeof(int));
}
if (xset)
{
for (i = 0; (i < dset->c) && !done; i++)
{
ccc = ivec_sum(dset->nd, i);
status = partition_range(ccc, ccc+dset->nd[i]-1, nxval, &sx, &lx);
if (!status && sx && lx)
{
sl = 0;
for (j = 0; j < nxval; j++)
{
length = lx[j];
cl = xset[j]->size;
for (k = 0; k < length; k++)
xset[j]->idx[k+cl] = sx[sl++];
xset[j]->size += length;
}
free(lx);
free(sx);
}
else
{
for (j = 0; j < nxval; j++)
free(xset[j]);
xset = (struct subset **) 0;
done = 1;
}
}
}
return xset;
}
/*
Integer comparison function for qsort().
*/
static int compare_integers(const void *int1, const void *int2)
{
int retval = -1;
int *i1;
int *i2;
i1 = (int *) int1;
i2 = (int *) int2;
if (*i1 > (int) *i2)
retval = 1;
else if (*i1 == *i2)
retval = 0;
return retval;
}
/*
Extract dataset defined by 'xset' from 'dset'. If 'complement' is 1,
extract complement of 'xsubset'.
The function returns the subset. If 'dset' or 'xset' are NULL,
return NULL and set errno to 0. In case of memory allocation error,
return NULL and set errno to the corresponding system function errno
value.
*/
struct dataset *dataset_subset(struct dataset *dset, struct subset *xset, int complement)
{
int i;
int icl;
int index;
int *ind;
int *isx;
struct dataset *sset;
sset = (struct dataset *) 0;
if (dset && xset)
{
sset = calloc(1, sizeof(struct dataset));
if (sset)
{
sset->d = dset->d;
sset->c = dset->c;
if (complement == 1)
sset->nv = dset->nv-xset->size;
else
sset->nv = xset->size;
/*
Sort input vectors indices.
*/
if (complement == 1)
{
ind = malloc(dset->nv*sizeof(int));
for (i = 0; i < dset->nv; i++)
ind[i] = i;
isx = ivec_diff(ind, dset->nv, xset->idx, xset->size, &icl);
free(ind);
}
else
isx = ivec_clone(xset->idx, xset->size);
qsort(isx, xset->size, sizeof(int), compare_integers);
/*
Copy vectors.
*/
sset->x = calloc(sset->nv, sizeof(float *));
if (sset->x)
{
for (i = 0; i < sset->nv; i++)
{
index = isx[i];
sset->x[i] = fvec_clone(dset->x[index], dset->d);
}
if (dset->xlab)
{
sset->xlab = calloc(sset->nv, sizeof(char *));
if (sset->xlab)
{
for (i = 0; i < sset->nv; i++)
{
index = isx[i];
sset->xlab[i] = strdup(dset->xlab[index]);
}
}
else
sset = (struct dataset *) vx_free(sset); /* TBD: need to free() the whole struct */
}
}
else
sset = (struct dataset *) vx_free(sset);
if (sset)
{
/*
Compute class cardinalities.
*/
ind = calloc(dset->c, sizeof(int));
for (i = 0; i < sset->nv; i++)
{
icl = dataset_class(isx[i], dset->c, dset->nd);
ind[icl]++;
}
free(isx);
sset->nd = ind;
}
}
}
else
errno = 0;
return sset;
}
/*
Compute covariance matrices and determinants for 'dset'. The
matrices are stored in dset->sigma, in Symmetric Storage Mode, and
the determinants are stored in dset->det.
In case of success, return 0. In case of failure, return -1 and set
'errc'.
*/
int dataset_sigma(struct dataset *dset, int *errc)
{
int status;
int j;
int idx;
int offset;
float dsign;
float dexp;
float *det;
float **sigma;
float **cov;
float **inverse;
status = 0;
*errc = 0;
if (!dset->sigma)
{
sigma = calloc(dset->c, sizeof(float *));
if (sigma)
{
det = malloc(2*dset->c*sizeof(float));
if (det)
{
offset = 0;
idx = 0;
for (j = 0; (j < dset->c) && !status; j++)
{
cov = cest(&dset->x[offset], dset->d, dset->nd[j], COVARIANCE_MATRIX);
if (cov)
{
inverse = fmx_inv(cov, dset->d, &dsign, &dexp, errc);
mx_free((void **) cov, dset->d);
if (inverse)
{
sigma[j] = fmx_ssm(inverse, dset->d);
if (sigma[j])
{
offset += dset->nd[j];
det[idx] = dsign;
idx++;
det[idx] = dexp;
idx++;
}
else
{
status = -1;
vx_free(det);
mx_free((void **) sigma, j);
}
mx_free((void **) inverse, dset->d);
}
else
{
status = -1;
vx_free(det);
mx_free((void **) sigma, dset->c);
}
}
else
{
status = -1;
*errc = errno;
vx_free(det);
mx_free((void **) sigma, dset->c);
}
}
if (!status)
{
dset->sigma = sigma;
dset->det = det;
}
}
else
{
status = -1;
*errc = errno;
}
}
else
{
status = -1;
*errc = errno;
}
}
return status;
}
struct dataset *dataset_new(int d, int c, int *nd, int nv, char **fnames, float **x)
{
struct dataset *dset;
dset = calloc(1, sizeof(struct dataset));
if (dset != (struct dataset *) 0)
{
dset->d = d;
dset->c = c;
dset->nd = nd;
dset->nv = nv;
dset->fnames = fnames;
dset->prediction = calloc(nv, sizeof(float));
if (dset->prediction)
dset->x = x;
else
dset = dataset_free(dset);
}
return dset;
}
/*
Return clone of 'dset'.
In case of failure, return NULL and set errno. The possible errors
are memory allocation errors.
*/
struct dataset *dataset_clone(struct dataset *dset)
{
int i;
int len;
int status;
struct dataset *clone = (struct dataset *) 0;
status = 0;
if (dset)
{
clone = calloc(1, sizeof(struct dataset));
if (clone)
{
clone->d = dset->d;
clone->c = dset->c;
clone->nv = dset->nv;
clone->format = dset->format;
clone->nd = ivec_clone(dset->nd, dset->c);
if (clone->nd)
{
if (dset->fnames)
{
clone->fnames = malloc(dset->c*sizeof(char *));
status = 0;
for (i = 0; (i < dset->c) && !status; i++)
{
clone->fnames[i] = strdup(dset->fnames[i]);
if (!clone->fnames[i])
status = -1;
}
}
if (!status && (dset->x))
{
clone->x = fmx_alloc(dset->nv, dset->d);
if (clone->x)
for (i = 0; i < dset->nv; i++)
fvec_copy(clone->x[i], dset->x[i], dset->d);
else
status = -1;
}
if (!status && dset->xlab)
{
clone->xlab = calloc(dset->nv, sizeof(char *));
if (clone->xlab)
for (i = 0; i < dset->nv; i++)
clone->xlab[i] = strdup(dset->xlab[i]);
else
status = -1;
}
if (!status && dset->alab)
{
clone->alab = calloc(dset->d, sizeof(char *));
if (clone->alab)
for (i = 0; i < dset->d; i++)
clone->alab[i] = strdup(dset->alab[i]);
else
status = -1;
}
if (!status && dset->alab0)
{
clone->alab0 = strdup(dset->alab0);
if (!clone->alab0)
status = -1;
}
if (!status && dset->label)
{
clone->label = ivec_clone(dset->label, dset->nv);
if (!clone->label)
status = -1;
}
if (!status && dset->prediction)
{
clone->prediction = ivec_clone(dset->prediction, dset->nv);
if (!clone->prediction)
status = -1;
}
if (!status && dset->ps)
{
clone->ps = dmx_clone(dset->ps, dset->nv, dset->d);
if (!clone->ps)
status = -1;
}
if (!status && dset->sigma)
{
clone->sigma = malloc(dset->c*sizeof(float *));
if (!clone->sigma)
status = -1;
else
{
len = dset->d*(dset->d+1)/2;
for (i = 0; i < dset->c; i++)
clone->sigma[i] = fvec_clone(dset->sigma[i], len);
}
}
if (!status && dset->det)
clone->det = fvec_clone(dset->det, 2*dset->d);
}
else
status = -1;
}
}
if (status == -1)
clone = dataset_free(clone);
return clone;
}
/*
Map 'dset' into 'd'-dimensional space using 'n' by dset->d matrix
'fmx'. 'n' has to be >= d, but the function does not check that
(since it doesn't know n).
Return NULL and set errno in case of failure. Possible errors are
memory allocation errors and EINVAL for bad input arguments.
*/
struct dataset *dataset_map(struct dataset *dset, int d, float **fmx)
{
struct dataset *mapped_dset;
float **x;
mapped_dset = (struct dataset *) 0;
if (dset && fmx)
{
x = fmx_mult(dset->x, dset->nv, dset->d, fmx, d, 1);
if (x)
{
mapped_dset = calloc(1, sizeof(struct dataset));
if (mapped_dset)
{
mapped_dset->x = x;
mapped_dset->nd = ivec_clone(dset->nd, dset->c);
if (mapped_dset->nd)
{
mapped_dset->label = ivec_clone(dset->label, dset->nv);
if (dset->label && !mapped_dset->label)
mapped_dset = dataset_free(mapped_dset);
else
{
mapped_dset->nv = dset->nv;
mapped_dset->d = d;
mapped_dset->c = dset->c;
}
}
else
mapped_dset = dataset_free(mapped_dset);
}
}
}
else
errno = EINVAL;
return mapped_dset;
}
/*
In-place mapping of 'dset' into 'd'-dimensional space using 'n' by
dset->d matrix 'fmx'. 'n' has to be >= d, but the function does not
check that (since it doesn't know n).
Return -1 in case of inconsistent input. Return -1 and set errno
in case of failure.
*/
int dataset_mapx(struct dataset *dset, int d, float **fmx)
{
int status;
float **x;
status = 0;
x = fmx_mult(dset->x, dset->nv, dset->d, fmx, d, 1);
if (x)
{
mx_free((void **) dset->x, dset->nv);
dset->x = x;
dset->d = d;
}
else
status = -1;
return status;
}
/*
Save 'matrix' in 'fname'. Store `alab[i]' as column names in the
first row. Store 'xlab[i]' in the first column of the i-th row. If
`alab0' is not NULL, it is stored as first column in the first row.
Return -1 in case of error and set errno.
ljb, 08/10/2005: allow zero rows and/or columns. It is conceivable
that we may have an empty test class.
*/
static int dataset_save(float **matrix, char *alab0, char **alab, char **xlab,
int rows, int columns, char *fname)
{
int status = 0;
int i;
FILE *fptr;
fptr = fopen(fname, "w");
if (fptr)
{
if (alab0)
fprintf(fptr, "%s\t", alab0);
if (alab)
{
for (i = 0; i < columns-1; i++)
fprintf(fptr, "%s\t", alab[i]);
if (columns > 1)
fprintf(fptr, "%s", alab[columns-1]);
fprintf(fptr, "\n");
}
status = fmx_nwrite(fptr, matrix, xlab, rows, columns);
if (!status)
status = fclose(fptr);
}
else
status = -1;
return status;
}
/*
Write `dset->x' into `dset->fnames' in `dset->format'.
Return -1 in case of error and set errno. In case of file error,
copy the offending file name to 'xname'.
*/
int dataset_write(struct dataset *dset, char **xname)
{
int i;
int offset;
int status;
char **xlab;
status = 0;
if (dset && dset->x && dset->nd)
{
offset = 0;
for (i = 0; (i < dset->c) && !status; i++)
{
if (dset->xlab)
xlab = &dset->xlab[offset];
else
xlab = (char **) 0;
status = dataset_save(&dset->x[offset], dset->alab0, dset->alab,
xlab, dset->nd[i], dset->d, dset->fnames[i]);
if ((status == -1) && xname)
*xname = strdup(dset->fnames[i]);
offset += dset->nd[i];
}
}
return status;
}
/*
Write `dset->x' into filenames prefix1.dmp, prefix2.dmp, etc., in
`dset->format'.
Return -1 in case of error and set errno. In case of file error,
copy the offending file name to 'xname'.
*/
int dataset_dump(struct dataset *dset, char *prefix, char **xname)
{
int i;
int len;
int status;
char *name;
char **names;
char **dmp_names;
names = dset->fnames;
dmp_names = calloc(dset->c+1, sizeof(char *));
len = strlen(prefix)+20;
for (i = 0; i < dset->c; i++)
{
name = malloc((len+1)*sizeof(char));
sprintf(name, "%s%d.dmp", prefix, i+1);
dmp_names[i] = name;
}
dset->fnames = dmp_names;
status = dataset_write(dset, xname);
str_free(dmp_names);
dset->fnames = names;
return status;
}
/*
Apply linear classifier 'wmx' to 'dset'. 'wmx' is assumed to be a
dset->c by dset->d+1 matrix whose last column contains the bias term
of the linear classifier. The predictions are stored in
dset->prediction.
Return 0 in case of success. In case of error, return the error
code. Error codes are EINVAL for bad input arguments and malloc()
error codes.
*/
int dataset_lin_predict(struct dataset *dset, float **wmx)
{
int i;
int j;
int errc;
double gmax;
double gx;
errc = 0;
if (dset && wmx)
{
if (!dset->prediction)
dset->prediction = malloc(dset->nv*sizeof(int));
if (dset->prediction)
{
for (i = 0; i < dset->nv; i++)
{
gmax = -FLT_MAX;
for (j = 0; j < dset->c; j++)
{
gx = fvec_dot(wmx[j], dset->x[i], dset->d, (int *) 0);
gx += wmx[j][dset->d];
if (gx > gmax)
{
dset->prediction[i] = j;
gmax = gx;
}
}
}
}
else
errc = errno;
}
else
errc = EINVAL;
return errc;
}
/*
Apply parametric quadratic classifier defined by 'wmx' and 'sigma'
to 'dset'. 'wmx' is assumed to be a dset->c by dset->d+1 matrix
whose last column contains the bias term. 'sigma' are inverted
covariance matrices of the training data set in SSM. They define the
quadratic term of the classifier. The predictions are stored in
dset->prediction.
Return 0 in case of success. In case of error, return the error
code. Error codes are EINVAL for bad input arguments and malloc()
error codes.
*/
int dataset_pqc_predict(struct dataset *dset, float **wmx, float **sigma)
{
int i;
int j;
int d;
int errc;
double gmax;
double gx;
float *fv;
errc = 0;
d = dset->d;
if (dset && wmx)
{
if (!dset->prediction)
dset->prediction = malloc(dset->nv*sizeof(int));
if (dset->prediction)
{
fv = malloc(d*sizeof(float));
if (fv)
{
for (i = 0; i < dset->nv; i++)
{
gmax = -FLT_MAX;
for (j = 0; j < dset->c; j++)
{
gx = fvec_dot(wmx[j], dset->x[i], d, (int *) 0);
gx += wmx[j][d];
/*
Add quadratic term.
*/
fvec_smx(dset->x[i], dset->sigma[j], d, fv);
gx += -0.5*fvec_dot(dset->x[i], fv, d, (int *) 0);
if (gx > gmax)
{
dset->prediction[i] = j;
gmax = gx;
}
}
}
free(fv);
}
else
errc = errno;
}
else
errc = errno;
}
else
errc = EINVAL;
return errc;
}
/*
Create dataset which has a subset of features in 'dset'. The subset
is defined by 'index'.
In case of failure, return NULL and set errno.
*/
struct dataset *dataset_select(struct dataset *dset, int *index, int d)
{
int i;
int j;
int idx;
int status;
struct dataset *sub_dset;
sub_dset = (struct dataset *) 0;
if (dset && index && (d > 0))
{
sub_dset = calloc(1, sizeof(struct dataset));
if (sub_dset)
{
sub_dset->d = d;
sub_dset->c = dset->c;
sub_dset->nv = dset->nv;
sub_dset->nd = ivec_clone(dset->nd, dset->c);
if (sub_dset->nd)
{
sub_dset->x = malloc(dset->nv*sizeof(float *));
if (sub_dset->x)
{
status = 0;
for (i = 0; (i < dset->nv) && !status; i++)
{
sub_dset->x[i] = fvec_subset(dset->x[i], dset->d, index, d);
if (!sub_dset->x[i])
status = -1;
}
if (!status)
{
if (dset->alab)
{
sub_dset->alab = calloc(d, sizeof(char *));
for (j = 0; j < d; j++)
{
idx = index[j];
sub_dset->alab[j] = strdup(dset->alab[idx]);
}
if (!sub_dset->alab)
status = -1;
}
if (!status)
{
if (sub_dset->alab0)
{
sub_dset->alab0 = strdup(dset->alab0);
if (!sub_dset->alab0)
status = -1;
}
if (dset->xlab)
{
sub_dset->xlab = str_clone(dset->xlab, dset->nv);
if (!sub_dset->xlab)
status = -1;
}
}
else
status = -1;
}
if (status == -1)
sub_dset = dataset_free(sub_dset);
}
}
}
}
return sub_dset;
}
/*
In-place version of dataset_select() - replace vectors in 'dset'
with the subset of `d' features whose indexed are given in 'index'.
In case of success, return 0. In case of memory allocation failure,
return -1 and set errno.
Note that bad arguments _are not_ considered an error. The caller
has to check the arguments himself.
*/
int dataset_inset(struct dataset *dset, int *index, int d)
{
int i;
int j;
int status;
char **alab;
float **x;
status = 0;
alab = (char **) 0;
if (dset && index && (d > 0))
{
x = malloc(dset->nv*sizeof(float *));
if (x)
{
for (i = 0; (i < dset->nv) && !status; i++)
{
x[i] = fvec_subset(dset->x[i], dset->d, index, d);
if (!x[i])
{
status = -1;
mx_free((void **) x, dset->nv);
}
else
{
if (dset->alab)
{
alab = calloc(d, sizeof(char *));
if (alab)
{
for (j = 0; (j < d) && !status; j++)
{
alab[j] = strdup(dset->alab[index[j]]);
if (!alab[j])
{
status = -1;
mx_free((void **) alab, j);
}
}
}
else
status = -1;
}
}
}
if (!status)
{
mx_free((void **) dset->x, dset->nv);
mx_free((void **) dset->alab, dset->d);
dset->x = x;
dset->d = d;
dset->alab = alab;
}
}
}
else
{
status = -1;
errno = EINVAL;
}
return status;
}
syntax highlighted by Code2HTML, v. 0.9.1