/* File name: emap.c Created by: Ljubomir Buturovic Created: 04/14/2002 Purpose: linear dimension reduction using E-mapping. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ static char rcsid[] = "$Id: emap.c,v 1.88 2005/06/18 07:36:18 ljubomir Exp $"; #include #include #include #include #include #include "emap.h" #include "dataset.h" #include "lau.h" #include "pau.h" #include "lmat.h" #include "simplex.h" #include "cda.h" #include "estimate_error.h" struct emap_crit_parameters { struct dataset *dset; int lxd; /* dimension of the transformed space */ int kmin; int kmax; int vid; /* vertex ID */ float eval; /* the lowest function value so far */ char *fname; /* file to store resulting transformation */ FILE *outdev; /* destination for output messages */ FILE *fdbg; /* destination for debug messages */ }; /* Calculate criterion function for E-mapping. Takes input dataset 'dset' and transformation matrix 'umx' and returns average leave-one-out Bayes error estimate in the transformed space, calculated for k between 'kmin' and 'kmax'. Dimensions of 'umx' are 'lxd' by 'd', where 'lxd' is dimension of transformed space. In case of failure, return -1 and set 'errc'. Possible error codes: ENOMEM; LERR_SINGCOV for singular covariance matrix. */ static float emap_crit(struct dataset *dset, float **umx, int lxd, int kmin, int kmax, int *errc, FILE *outdev, FILE *fdbg) { int i; int status; int knn; int offset; float avg_bayes; float dsign; float dexp; float **y; float **sigma; /* covariance matrix for one class, in full storage mode */ float **sig; /* covariance matrices for all classes in SSM */ float **inverse; /* inverted covariance matrix for one class */ float *det; /* array of natural log of determinants */ float *bayes_l; double dsum; avg_bayes = -1.0; status = 0; *errc = 0; /* Transform input vectors. */ y = fmx_mult(dset->x, dset->nv, dset->d, umx, lxd, 1); if (y) { det = malloc(dset->c*sizeof(float)); if (det) { sig = calloc(dset->c, sizeof(float *)); if (sig) { /* Estimate and invert covariance matrices in the transformed space. */ offset = 0; for (i = 0; (i < dset->c) && !status; i++) { sigma = cest(&y[offset], lxd, dset->nd[i], COVARIANCE_MATRIX); /* Invert. */ inverse = fmx_inv(sigma, lxd, &dsign, &dexp, errc); mx_free((void **) sigma, lxd); if (!inverse) status = -1; else { /* Convert 'sigma' to SSM, as required by L_error(). */ sig[i] = fmx_ssm(inverse, lxd); mx_free((void **) inverse, lxd); if (!sig[i]) { sig = (float **) mx_free((void **) sig, i); status = -1; *errc = errno; } else { det[i] = dexp; if (fdbg) fprintf(fdbg, "emap_crit(); class %d determinant: %g\n", i, dexp); } } offset += dset->nd[i]; } if (!status) { /* Estimate Bayes error in the transformed space. */ bayes_l = L_error(y, dset->nv, lxd, dset->c, dset->nd, sig, det, kmin, kmax, outdev, fdbg, errc); /* Return average. */ if (*errc == 0) { dsum = 0.0; knn = kmax-kmin+1; for (i = 0; i < knn; i++) dsum += bayes_l[i]; avg_bayes = dsum/knn; if (fdbg) fprintf(fdbg, "emap_crit(); Bayes in %d-dimensional space: %g\n", lxd, avg_bayes); } else status = -1; vx_free(bayes_l); } sig = (float **) mx_free((void **) sig, dset->c); } else *errc = errno; vx_free(det); } else *errc = errno; mx_free((void **) y, dset->nv); } else *errc = errno; return avg_bayes; } static int save_mapping(float **umx, int lxd, int d, float ecrit, char *fname) { int status = 0; int i; int j; float *vector; FILE *fptr; if (fname) { fptr = fopen(fname, "w"); if (fptr) { for (i = 0; i < lxd; i++) { vector = fvec_normalize(umx[i], d); if (vector) { for (j = 0; j < d; j++) fprintf(fptr, "%14.7g\t", vector[j]); fprintf(fptr, "\n"); vx_free(vector); } else status = -1; } fclose(fptr); } else status = -1; } return status; } #define EMAP_FUNC_MSG " Bayes error estimate, mean value: " /* Compute value of EMAP criterion function for the transformation vector 'fvec'. The vector is array representation of the transformation matrix which maps parameters->dset to parameters->d-dimensional space. The argument 'n' is not used - it is calculated from values contained in 'parameters'. But we have to accept it, since that is the required signature for the simplex() criterion function. The function converts 'fvec' to a matrix format, which is then passed to the actual criterion computation function emap_crit(), and saves results. In case of error, set 'errc'. The errors are malloc() errors and LERR_SINGCOV for singular covariance matrix. */ static float emap_func(float *fvec, int n, int iteration, void *parameters, int *errc) { int i; int idx; int lxd; int kmin; int kmax; int status; int vid; float ecrit; char *fname; FILE *outdev; FILE *fdbg; char *msg; float **umx; struct dataset *dset; struct emap_crit_parameters *emap_parameters; ecrit = -1.0; emap_parameters = (struct emap_crit_parameters *) parameters; dset = emap_parameters->dset; lxd = emap_parameters->lxd; kmin = emap_parameters->kmin; kmax = emap_parameters->kmax; vid = emap_parameters->vid; outdev = emap_parameters->outdev; fdbg = emap_parameters->fdbg; fname = emap_parameters->fname; status = 0; /* 'umx' is matrix-version of 'fvec'. */ umx = malloc(lxd*sizeof(float *)); if (umx) { idx = 0; for (i = 0; i < lxd; i++) { umx[i] = &fvec[idx]; idx += dset->d; } ecrit = emap_crit(dset, umx, lxd, kmin, kmax, errc, (FILE *) 0, fdbg); if (ecrit < 0) status = -1; else { /* Only save the mapping if the criterion function value is less than the current minimum eval. */ /*if ((iteration > 0) && (ecrit < emap_parameters->eval))*/ { emap_parameters->eval = ecrit; status = save_mapping(umx, lxd, dset->d, ecrit, fname); if (status == -1) *errc = errno; } } vx_free(umx); } else *errc = errno; if (!*errc) { msg = malloc(1000); if (msg) { if (iteration == 0) sprintf(msg, "Iteration %7d; vertex %7d; %s %7.2f", iteration, vid, EMAP_FUNC_MSG, ecrit); else sprintf(msg, "Iteration %7d; %s %7.2f", iteration, EMAP_FUNC_MSG, ecrit); print_line(outdev, msg, PCP_QLEN); } else *errc = errno; free(msg); } return ecrit; } /* Return the chosen dimension reduction method used to generate starting point for emap(). The methods are those listed in file pcp.h. */ static int input_init_method(FILE *outdev, int lxd, int nv, int d, int c) { int method; int default_method; int len; int *methods; char *msg; len = 0; methods = malloc(100*sizeof(int)); /* this should be more than we will ever need */ msg = malloc((PCP_QLEN+1)*sizeof(char)); if ((c == 2) && (nv <= d)) { len = 3; methods[0] = PDR_SVD; methods[1] = PDR_RANDOM; methods[2] = PDR_FILE; default_method = PDR_SVD; sprintf(msg, PCP_UMSG_EMAP_S1, PDR_SVD, PDR_RANDOM, PDR_FILE, default_method); } else if ((c == 2) && (nv > d)) { /* For Fisher's linear discriminant, the allowed values of mapping space dimension are <= c-1. */ if (lxd <= c-1) { len = 4; methods[0] = PDR_PCA; methods[1] = PDR_FISHER; methods[2] = PDR_RANDOM; methods[3] = PDR_FILE; default_method = PDR_PCA; sprintf(msg, PCP_UMSG_EMAP_S2, PDR_FISHER, PDR_PCA, PDR_RANDOM, PDR_FILE, default_method); } else { len = 3; methods[0] = PDR_PCA; methods[1] = PDR_RANDOM; methods[2] = PDR_FILE; default_method = PDR_PCA; sprintf(msg, PCP_UMSG_EMAP_S3, PDR_PCA, PDR_RANDOM, PDR_FILE, default_method); } } else if ((c > 2) && (nv <= d)) { len = 4; methods[0] = PDR_SVD; methods[1] = PDR_RANDOM; methods[2] = PDR_FILE; default_method = PDR_SVD; sprintf(msg, PCP_UMSG_EMAP_S1, PDR_SVD, PDR_RANDOM, PDR_FILE, default_method); } else if ((c > 2) && (nv > d)) { if (lxd <= c-1) { len = 4; methods[0] = PDR_PCA; methods[1] = PDR_FISHER; methods[2] = PDR_RANDOM; methods[3] = PDR_FILE; default_method = PDR_PCA; sprintf(msg, PCP_UMSG_EMAP_S2, PDR_FISHER, PDR_PCA, PDR_RANDOM, PDR_FILE, default_method); } else { len = 3; methods[0] = PDR_PCA; methods[1] = PDR_RANDOM; methods[2] = PDR_FILE; default_method = PDR_PCA; sprintf(msg, PCP_UMSG_EMAP_S3, PDR_PCA, PDR_RANDOM, PDR_FILE, default_method); } } method = input_choice(stdin, outdev, msg, &default_method, methods, len); free(msg); free(methods); return method; } /* Convert matrix 'mx' ('rows' by 'columns') into vector. */ static float *fvec_mx(float **mx, int rows, int columns, int *errc) { int i; int j; int idx; float *fvec; fvec = malloc(rows*columns*sizeof(float)); if (fvec) { idx = 0; for (i = 0; i < rows; i++) for (j = 0; j < columns; j++) fvec[idx++] = mx[i][j]; } else *errc = errno; return fvec; } /* Return initial transformation of 'dset' from original space to 'lxd' space, to be used as a starting point for optimization. The transformation is read from a file or generated pseudo-randomly. If it is read from a file, return file name in 'fname'. The transformation is represented as a one-dimensional vector of length d times lxd, as is required for the simplex() optimization function. In case of error, set 'errc'. In case of file error, return name of offending file in 'fname'. */ static float *init_vertex(FILE *indev, FILE *outdev, int lxd, struct dataset *dset, char **fname, int *errc) { int i; int dc; int status; int nx; float min; float max; char *xname; float *umx; /* initial vertex in vector form */ float **mapping; float **y; /* temp. var, used to store mapping read from a file */ umx = (float *) 0; status = input_init_method(outdev, lxd, dset->nv, dset->d, dset->c); if (status != EOF) { if (status == PDR_FILE) { xname = input_filename(PCP_UMSG_INIT_VERTEX, (char *) 0, outdev); if (xname && *xname) { nx = file_info(xname, (int *) 0, &dc, '\0'); if ((nx > 0) && (dc > 0)) { if (nx >= lxd) { y = fmx_alloc(lxd, dc); if (y) { status = load_file(xname, y, LAU_FF_RAW, (char **) 0, (char **) 0, (char **) 0, dc, lxd, errc); if (status == -1) { *fname = strdup(xname); if (!*fname) *errc = errno; } else { umx = fvec_mx(y, lxd, dc, errc); mx_free((void **) y, lxd); if (!umx) *errc = errno; } } else *errc = errno; } else { *fname = strdup(xname); *errc = PERR_INC_DIM; } } else { *fname = strdup(xname); if (nx < 0) *errc = errno; else if (dc == -1) *errc = PERR_BAD_INPUT_FILE; } } else { *errc = PERR_BAD_INPUT_FILE; if (xname && !(*xname)) *fname = strdup(""); } } else if (status == PDR_RANDOM) { min = 0.0; /* TBD: hardcoded. */ max = 1.0; umx = malloc(lxd*(dset->d)*sizeof(float)); if (umx) { /*rand_float_seed(&seed);*/ for (i = 0; i < lxd*(dset->d); i++) umx[i] = float_rand()*(max-min)+min; } else *errc = errno; } else if ((status == PDR_SVD) || (status == PDR_FISHER) || (status == PDR_PCA)) { if (status == PDR_SVD) mapping = svd_transform(dset->x, dset->nv, dset->d, errc); else if (status == PDR_FISHER) mapping = fld(dset, errc); else mapping = pca(dset, errc); if (mapping) { umx = fvec_mx(mapping, lxd, dset->d, errc); if (!umx) *errc = errno; if (status == PDR_SVD) mx_free((void **) mapping, dset->nv); else mx_free((void **) mapping, lxd); } } } else *errc = PERR_BAD_INPUT; if (*errc != 0) { free(umx); umx = (float *) 0; } return umx; } /* Implementation of EMAP functionality (linear feature transformation which minimizes Bayes error estimate in the transformed space). The function computes the transformation matrix using the training data set, and saves it in a file. In case of error, set 'errc'. If 'errc' is file access error, 'xname' is name of file associated with the error. If 'dbg' is > 0, send debugging information to debug file. */ void p_emap(int *errc, char **xname, int dbg) { int i; int j; int status; int lxd; int ndim; int seed; int min_range; int max_range; int int_default; int kmin; int kmax; int itmax; int iter; float lambda; float ftol = 1e-6; float val; float *smx0; char *fname = (char *) 0; char *emap_fname; char *msg; float **smx; float *fval; FILE *outdev; FILE *fdbg = (FILE *) 0; struct emap_crit_parameters *emap_parameters; *errc = 0; if (dbg > 0) fdbg = fopen(PCP_DBG, "a"); outdev = stdout; clear_screen(); cursor_on(); emap_parameters = (struct emap_crit_parameters *) 0; if (*errc == 0) { min_range = 1; max_range = tds->d; msg = malloc((PCP_QLEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_DIM, min_range, max_range); lxd = input_integer(stdin, outdev, msg, PCP_QLEN, (int *) 0, &min_range, &max_range); free(msg); emap_fname = input_filename(PMSG_LIN_OUTPUT_FNAME, PCP_EMP, stdout); /* The error estimate is calculated for k between 'kmin' and 'kmax', inclusive. */ min_range = 1; max_range = ivec_min(tds->nd, tds->c)-1; kmin = input_integer(stdin, outdev, PCP_UMSG_KMIN, PCP_QLEN, &min_range, &min_range, &max_range); msg = malloc((PCP_QLEN+1)*sizeof(char)); int_default = max_range/2; sprintf(msg, PCP_UMSG_KMAX, kmin, max_range, int_default); kmax = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &kmin, &max_range); free(msg); seed = input_seed(stdin, outdev); min_range = 1; int_default = EMAP_ITMAX; /* default max. number of iterations */ msg = malloc((PCP_QLEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_MAXIT, int_default); itmax = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); free(msg); ndim = lxd*(tds->d); /* Parameters ready, start processing. First create initial simplex 'smx': each row represents one vertex of the simplex. 'smx' has a total of ndim+1 rows, one per vertex. */ status = smplx_alloc(&fval, &smx, ndim); if (!status) { /* Create initial vertex. */ smx0 = init_vertex(stdin, outdev, lxd, tds, &fname, errc); fvec_copy(smx[0], smx0, ndim); free(smx0); if (*errc == 0) { emap_parameters = malloc(sizeof(struct emap_crit_parameters)); emap_parameters->dset = tds; emap_parameters->lxd = lxd; emap_parameters->kmin = kmin; emap_parameters->kmax = kmax; emap_parameters->outdev = outdev; emap_parameters->fdbg = fdbg; emap_parameters->vid = 0; emap_parameters->fname = emap_fname; emap_parameters->eval = FLT_MAX; fval[0] = emap_func(smx[0], 0, 0, emap_parameters, errc); if (*errc == 0) { /* Initialize the remaining ndim vertices of the simplex, using equation (10.4.1) from Numerical Recipes in C. */ lambda = 1.0; /* TBD: hardcoded for now. */ for (i = 1; (i < ndim+1) && (*errc == 0); i++) { for (j = 0; j < ndim; j++) smx[i][j] = smx[0][j]; smx[i][i-1] += lambda; emap_parameters->vid = i; fval[i] = emap_func(smx[i], 0, 0, emap_parameters, errc); } if (*errc == 0) { val = simplex(smx, ndim, fval, emap_func, (void *) emap_parameters, itmax, ftol, &iter, errc); mx_free((void **) smx, ndim+1); vx_free(fval); if (*errc == 0) { msg = malloc(1000); sprintf(msg, "Optimization completed in %7d iterations; minimum: %7.2f.", iter, val); print_line(outdev, msg, PCP_QLEN); free(msg); pwait(); } } } else *xname = strdup(emap_fname); vx_free((void *) emap_parameters); } else if (fname) *xname = strdup(fname); } else *errc = errno; } cursor_off(); } /* Compute EMAP transformation matrix for dataset 'dset' into 'dr'-dimensional space. 'kmin' and 'kmax' are parameters used for the Bayes error estimation. The function uses PCA (Principal Component Analysis) as initial point for the simplex optimization if number of input vectors is greater than dimensionality; otherwise it uses SVD (Singular Value Decomposition). The max. number of iterations is fixed to EMAP_MAX_ITER. In case of success, return the transformation matrix. In case of failure, return NULL and set 'errc'. The errors are EINVAL for bad arguments, memory allocation errors, */ float **emap(struct dataset *dset, int dr, int kmin, int kmax, int *errc) { int i; int j; int status; int iter; int ndim; int nv; int d; int idx; float val; float lambda; float ftol; float *fval; float *smx0; float **smx; float **mx; float **map; struct emap_crit_parameters *emap_parameters; map = (float **) 0; if (dset && errc && (kmin >= 1) && (kmax >= 1) && (kmax >= kmin)) { ftol = 1e-6; d = dset->d; nv = dset->nv; ndim = dr*d; status = smplx_alloc(&fval, &smx, ndim); if (!status) { /* Create initial vertex. */ if (nv <= d) mx = svd_transform(dset->x, nv, d, errc); else mx = pca(dset, errc); if (mx) { smx0 = fvec_mx(mx, dr, d, errc); if (smx0) { fvec_copy(smx[0], smx0, ndim); if (nv <= d) mx_free((void **) mx, nv); else mx_free((void **) mx, dr); free(smx0); if (*errc == 0) { emap_parameters = malloc(sizeof(struct emap_crit_parameters)); emap_parameters->dset = dset; emap_parameters->lxd = dr; emap_parameters->kmin = kmin; emap_parameters->kmax = kmax; emap_parameters->outdev = (FILE *) 0; emap_parameters->fdbg = (FILE *) 0; emap_parameters->vid = 0; emap_parameters->fname = (char *) 0; emap_parameters->eval = FLT_MAX; fval[0] = emap_func(smx[0], 0, 0, emap_parameters, errc); if (*errc == 0) { /* Initialize the remaining ndim vertices of the simplex, using equation (10.4.1) from Numerical Recipes in C. */ lambda = 1.0; /* TBD: hardcoded for now. */ for (i = 1; (i < ndim+1) && (*errc == 0); i++) { for (j = 0; j < ndim; j++) smx[i][j] = smx[0][j]; smx[i][i-1] += lambda; emap_parameters->vid = i; fval[i] = emap_func(smx[i], 0, 0, emap_parameters, errc); } if (*errc == 0) { val = simplex(smx, ndim, fval, emap_func, (void *) emap_parameters, EMAP_ITMAX, ftol, &iter, errc); idx = fvec_argmin(fval, ndim+1); map = fmx_fvec(smx[idx], ndim, dr, d); mx_free((void **) smx, ndim+1); } } } } } } else if (errc) *errc = EINVAL; } return map; }