/* File name: svd.c Created by: Ljubomir Buturovic Created: 02/22/2002 Purpose: Singular Value Decomposition (SVD) mapping for high-dimensional data. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* The principal function in this file is svd(). It implements linear transformation of the input data using Singular Value Decomposition (SVD) of a covariance matrix. This algorithm is a specialized version of Karhunen-Loeve transform (also known as principal component analysis) for high-dimensional data. It calculates transformation matrix 'u' corresponding to 'idr' biggest eigenvalues of the covariance matrix of the input matrix data matrix by performing the eigenanalysis in the 'nvec' dimensional space, instead of in the 'd'-dimensional space of the input vectors. The function should be used when the number of available vectors is lower than the dimension (length) of the vectors. In such cases, svd() is more efficient than the standard approach (of performing the analysis in the original 'd'-dimensional space). */ static char rcsid[] = "$Id: svd.c,v 1.52 2006/04/21 07:09:42 ljubomir Exp $"; #include #include #include #include #include #include "pcp.h" #include "lmat.h" #include "lau.h" #include "pau.h" #include "bagging.h" #include "adaboost.h" /* Log MX^T*MX (transposed matrix 'mx' times itself). Used to verify eigenvector matrices. */ static void log_mxt_mx(float **mx, int rows, int columns, FILE *fdbg) { int i; int j; int k; double s; float *amx; if (fdbg) { amx = malloc(columns*sizeof(float)); for (i = 0; i < columns; i++) { for (j = 0; j < columns; j++) { s = 0.0; for (k = 0; k < rows; k++) s += mx[k][i]*mx[k][j]; amx[j] = s; } fprintf(fdbg, "mxt*mx row %d\n", i); for (j = 0; j < columns; j++) fprintf(fdbg, "%12.5g ", amx[j]); fprintf(fdbg, "\n"); } free(amx); } } /* Calculate Singular Value Decomposition (SVD) mapping (a specialized version of Karhunen-Loeve transform) for 'nvec' 'd'-dimensional input vectors 'x'. Return 'nvec' by 'd' transformation matrix 'u', where 'nvec' is the number of input vectors in 'x'. Input vectors x[0][0..d-1], x[1][0..d-1], x[2][0..d-1], ..., x[nvec-1][0..d-1] are assumed to be such that d << nvec (otherwise use the regular KL transform). Return -1 in case of error and set 'errc'. Errors are malloc() errors and LERR_ITMAX in eigen(). The code follows the logic of the article Orly Alter, Patrick O. Brown, and David Botstein, 'Singular value decomposition for genome-wide expression data processing and modeling', Proceedings of the National Academy of Sciences, vol 97, no. 18, August 29, 2000, pp. 10101-10106, section 'Mathematical Framework: Singular Value Decomposition'. */ static int svd(float **xmx, int nvec, int d, float ***u, FILE *fdbg, int *errc) { int i; int j; int k; int status; int use_correlation = 0; float xfc; double s; float *xmean; float **sigma; float **umx; float *evl; float **evx; /* For use_correlation == 1, use correlation matrix to calculate u (as in the original reference); otherwise, use the covariance matrix, as is commonly done. */ xmean = mest(xmx, d, nvec); if (use_correlation == 1) { for (i = 0; i < d; i++) xmean[i] = 0.0; sigma = cest(xmx, d, nvec, CORRELATION_MATRIX); } else sigma = cest(xmx, d, nvec, COVARIANCE_MATRIX); if (sigma && xmean) { status = eigsn_column(sigma, d, &evl, &evx, errc); if (status == 0) { if (fdbg) { for (i = 0; i < d; i++) { fprintf(fdbg, "svd(); evl[%d]: %f\n", i, evl[i]); fprintf(fdbg, "svd(); "); for (j = 0; j < d; j++) fprintf(fdbg, "evx[%d][%d]: %f; ", j, i, evx[j][i]); fprintf(fdbg, "\n"); } fflush(fdbg); fprintf(fdbg, "eigenvectors ev^T * ev:\n"); log_mxt_mx(evx, d, d, fdbg); } /* Now calculate umx as amx*evx*sqrt(evl^(-1)). amx is (xmx-xmean)/sqrt(nvec-1) (see R&D, vol. III, 02/25/2002). First multiply each eigenvector with its' eigenvalue^(-1/2). */ for (j = 0; j < d; j++) { /* Ill-conditioned covariance matrices may generate negative eigenvalues. Set them to 0. TBD: examine this. */ if (evl[j] > 0) evl[j] = sqrt(1.0/evl[j]); else evl[j] = 0.0; } if (fdbg) { fprintf(fdbg, "sqrt(eps^(-1)):\n"); for (j = 0; j < d; j++) fprintf(fdbg, "%g ", evl[j]); fprintf(fdbg, "\n"); } for (i = 0; i < d; i++) /* eigenvector index */ for (j = 0; j < d; j++) evx[j][i] = evx[j][i]*evl[i]; if (fdbg) { fprintf(fdbg, "ev * eps^(-1):\n"); log_mx(evx, d, d, fdbg); } umx = fmx_alloc(nvec, d); if (umx) { /* Now multiply result with xmx-xmean. */ if (use_correlation == 1) xfc = 1.0; else { xfc = sqrt(nvec-1); xfc = 1.0/xfc; } for (i = 0; i < nvec; i++) for (j = 0; j < d; j++) { s = 0.0; for (k = 0; k < d; k++) s += (xmx[i][k]-xmean[k])*evx[k][j]; umx[i][j] = s*xfc; } if (fdbg) { fprintf(fdbg, "transformation matrix u:\n"); log_mx(umx, nvec, d, fdbg); fprintf(fdbg, "transformation matrix u^T * u:\n"); log_mxt_mx(umx, nvec, d, fdbg); } } else status = -1; if (fdbg) { correst(sigma, d); fprintf(fdbg, "matrix of linear correlation coefficients:\n"); log_mx(sigma, d, d, fdbg); } /* Free sigma and xmean. */ mx_free((void **) sigma, d); free(xmean); *u = umx; vx_free(evl); mx_free((void **) evx, d); } } else { status = -1; *errc = errno; } return status; } /* Interface to svd(), a linear transformation of high-dimensional vectors using Singular Value Decomposition (SVD) of a covariance matrix (a variant of Karhunen-Loeve transform). In case of error, set 'errc'. In case of file error, set the file name in 'xname'. Error codes: 1. this function assumes that the rows of 'input' are high-dimensional, that is typically nvec << d. If nvec >= d, set 'errc' to PERR_INC_DIM. */ void p_svd(int *errc, char **xname, int dbg) { int itd; int status; int mode; int nsamples; char *fname; float **xmx; float **umx; float **svdx; FILE *outdev; FILE *fdbg = (FILE *) 0; status = 0; mode = 0; /* 0: use training and test datasets to compute SVD */ /* 1: use training dataset to compute SVD */ if (tds) { if (teds) { clear_screen(); cursor_on(); mode = input_transform_mode(); } nsamples = tds->nv; if (mode == 0) nsamples += teds->nv; /* This transformation is executed only if number of samples is less or equal than dimension. */ if (nsamples > tds->d) *errc = PERR_INCONSISTENT_SVD; else { outdev = stdout; if (dbg > 0) fdbg = fopen(PCP_DBG, "a"); if (!teds) { clear_screen(); cursor_on(); } itd = input_d(stdin, outdev, tds->d, nsamples); fname = input_filename(PMSG_LIN_OUTPUT_FNAME, PCP_LIN, outdev); cursor_off(); print_line(outdev, PCP_UMSG_SVD, PCP_QLEN); if (mode == 1) svdx = tds->x; else { svdx = combine_x(tds->x, tds->nv, teds->x, teds->nv); if (!svdx) { status = -1; *errc = errno; } } if (!status) { xmx = fmx_transpose(svdx, nsamples, tds->d); if (mode == 0) vx_free(svdx); status = svd(xmx, tds->d, nsamples, &umx, fdbg, errc); /* After svd(), xmx is d by nsamples, umx is nsamples by d. */ mx_free((void **) xmx, tds->d); if (status == 0) { status = fmx_save(umx, tds->d, itd, fname, 1); mx_free((void **) umx, tds->d); if (status == -1) { *errc = errno; *xname = strdup(fname); } } } } } else *errc = PERR_UNDEFINED_TDS; } /* Feature extraction using SVD. For input 'nv' by 'd' data matrix 'x', returns linear transformation matrix 'd' by 'nv'. It is assumed that 'nv' is <= 'd'. If nv is > d, return NULL. In case of failure to iterate in eigen(), return NULL and set errc to LERR_ITMAX. In case of malloc() error, return NULL and set errc to the corresponding errno. */ float **svd_transform(float **x, int nv, int d, int *errc) { int status; float **svd_transformation; float **umx; float **xmx; svd_transformation = (float **) 0; if (nv <= d) { /* TBD: the two transposes are a kludge. Instead, svd() should take nv by d input. */ xmx = fmx_transpose(x, nv, d); if (xmx) { status = svd(xmx, d, nv, &umx, (FILE *) 0, errc); if (!status) { /* xmx and umx are d by nv. */ svd_transformation = fmx_transpose(umx, d, nv); /* svd_transformation is nv by d. */ if (!svd_transformation) *errc = errno; mx_free((void **) umx, d); } mx_free((void **) xmx, d); } else *errc = errno; } return svd_transformation; }