/* File name: fselect.c Created by: Ljubomir Buturovic Created: 01/10/2003 Purpose: feature selection functions. */ /* Copyright 2004-2005 Ljubomir J. Buturovic, Sasha Jaksic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include "pcp.h" #include "pau.h" #include "lau.h" #include "lmat.h" #include "heap.h" #include "estimate_error.h" #include "fselect.h" #include "xlearn.h" static char rcsid[] = "$Id: fselect.c,v 1.73 2005/11/22 06:17:00 ljubomir Exp $"; #define FSELECT_LINE_WIDTH 100 #define ODD(x) (x&1) /* Calculate mean of feature 'idx' in matrix 'x'. */ static float calc_mean(float **x, int nv, int idx) { int i; float mean; double dsum = 0.0; for (i = 0; i < nv; i++) dsum += x[i][idx]; mean = dsum/nv; return mean; } /* Calculate standard deviation of feature 'idx' in matrix 'x', given 'mean'. */ static float calc_sd(float **x, int nv, int idx, float mean) { int i; float sd; double diff; double dsum = 0.0; for (i = 0; i < nv; i++) { diff = x[i][idx]-mean; dsum += diff*diff; } sd = sqrt(dsum/(nv-1)); return sd; } /* Calculate Pearson correlation coefficient between 'ivec' and feature 'idx' in matrix 'x'. */ static double calc_pearson(int *ivec, float **x, int nv, int idx) { int i; double sxx; double syy; double sxy; double meanx; double meany; double pearson; double dsum1; double dsum2; dsum1 = 0.0; dsum2 = 0.0; for (i = 0; i < nv; i++) { dsum1 += ivec[i]*ivec[i]; dsum2 += ivec[i]; } meanx = dsum2/nv; sxx = dsum1-nv*meanx*meanx; dsum1 = 0.0; dsum2 = 0.0; for (i = 0; i < nv; i++) { dsum1 += x[i][idx]*x[i][idx]; dsum2 += x[i][idx]; } meany = dsum2/nv; syy = dsum1-nv*meany*meany; dsum1 = 0.0; for (i = 0; i < nv; i++) dsum1 += ivec[i]*x[i][idx]; sxy = dsum1-nv*meanx*meany; pearson = sxy/sqrt(sxx*syy); return pearson; } /* Return ranking of features in 'dset' according to criterion `fscrit'. The available feature evaluation criteria are defined in pcp.h. The returned array has sorted criterion values for all input features (best comes first; thus array[0] is the best feature, array[1] next best, etc.). Output array 'index' contains the original indexes of these features. In case of failure, return NULL and set errno. */ static float *rank_features(struct dataset *dset, int fscrit, int **index) { int i; int j; int status; int offset; int idx; int kmin; int kmax; int nn; int errc; float mean1; float mean2; float sd1; float sd2; double dsum1; double dsum2; int *isx; float *det; float **x; float *ranking; float **sig; float *bayes_l; unsigned int seed; status = 0; ranking = malloc(dset->d*sizeof(float)); if (ranking) { if (fscrit == PCP_FSEL_EUCLIDEAN) { for (i = 0; i < dset->d; i++) { dsum2 = 0.0; for (j = 0; j < dset->nv; j++) { dsum1 = dset->x[j][i]-dset->label[j]; dsum2 += dsum1*dsum1; } ranking[i] = sqrt(dsum2); } } else if (fscrit == PCP_FSEL_GOLUB) { /* NOTE: this method only works for two classes. It uses 'neighborhood analysis' described in Golub, T.R. et al., 'Molecular Classification of Cancer: Class Discovery and Class Prediction by Gene Expression Monitoring'. Science, vol. 286, 15 October 1999, pp. 531-537. */ for (i = 0; i < dset->d; i++) { offset = dset->nd[0]; mean1 = calc_mean(dset->x, dset->nd[0], i); mean2 = calc_mean(&dset->x[offset], dset->nd[1], i); sd1 = calc_sd(dset->x, dset->nd[0], i, mean1); sd2 = calc_sd(&dset->x[offset], dset->nd[1], i, mean2); ranking[i] = -fabs((mean1-mean2)/(sd1+sd2)); } } else if (fscrit == PCP_FSEL_PEARSON) /* This criterion computes Pearson correlation coefficient between a vector of feature values, for all input vectors, and the corresponding class labels. The class labels go from 0 to c-1. The criterion often provides very good performance. However, there are still some open questions regarding this criterion. Namely, the class labels are arbitrary integers. We happen to use [0..c-1], but that is arbitrary choice. The value of the criterion depends on this arbitrary assignment of the class labels. It is unclear how much using different class labels would affect classification performance of this criterion. This is an important issue which should be investigated. */ for (i = 0; i < dset->d; i++) ranking[i] = -fabs(calc_pearson(dset->label, dset->x, dset->nv, i)); else if (fscrit == PCP_FSEL_BAYES) for (i = 0; (i < dset->d) && !status; i++) { x = fmx_alloc(dset->nv, 1); for (j = 0; j < dset->nv; j++) x[j][0] = dset->x[j][i]; seed = 1; kmin = 1; kmax = ivec_min(dset->nd, dset->c)-1; nn = kmax-kmin+1; sig = calc_sig(x, dset->c, 1, dset->nd, &det, &errc); bayes_l = L_error(x, dset->nv, 1, dset->c, dset->nd, sig, det, kmin, kmax, (FILE *) 0, (FILE *) 0, &errc); if (bayes_l) ranking[i] = fvec_sum(bayes_l, nn)/nn; else { status = -1; errno = errc; } } if (!status) { status = fheap(ranking, dset->d, index); isx = malloc(dset->d*sizeof(int)); if (isx) { for (i = 0; i < dset->d; i++) { idx = (*index)[i]; isx[idx] = i; } ivec_copy(*index, isx, dset->d); free(isx); } else status = -1; if (status == -1) ranking = vx_free(ranking); else if ((fscrit == PCP_FSEL_GOLUB) || (fscrit == PCP_FSEL_PEARSON)) { /* Restore sign for similarity-like measures. */ for (i = 0; i < dset->d; i++) ranking[i] = -ranking[i]; } } } return ranking; } /* Select optimal feature subset in TDS. The function collects parameters from the user (output file name and criterion), computes the optimal feature subset, and saves it in the output file. */ void p_fselect(int *errc, char **xname, int dbg) { int i; int fscrit; int nfeat; int dr_method; double val_crit; int *index; char *fname; float *ranking; char **names; FILE *outdev; FILE *fptr; if (tds) { *errc = 0; clear_screen(); outdev = stdout; cursor_on(); dr_method = input_fsel(outdev); if (dr_method == PDR_RANKING) { fname = input_filename(PCP_UMSG_FNAME_RNK, PCP_RNK, outdev); fscrit = input_fscrit(outdev, dr_method, tds->c); if ((ranking = rank_features(tds, fscrit, &index)) == (float *) 0) *errc = errno; else { fptr = fopen(fname, "w"); if (fptr) { names = tds->alab; for (i = 0; i < tds->d; i++) { fprintf(fptr, "%d\t%f", index[i]+1, ranking[i]); if (names) fprintf(fptr, "\t%s", names[index[i]]); fprintf(fptr, "\n"); } if (fclose(fptr)) { *errc = errno; *xname = strdup(fname); } } else { *errc = errno; *xname = strdup(fname); } } } else { fname = input_filename(PCP_UMSG_FNAME_SEL, PCP_SET, outdev); fscrit = input_fscrit(outdev, dr_method, tds->c); nfeat = input_nfeat(stdin, stdout, tds->d, PCP_DFLT_NFSEL); if (dr_method == PDR_FORWARD) index = forward_select(tds, nfeat, fscrit, 1, &val_crit, errc); else if(dr_method == PDR_BACKWARD) index = backward_select(tds, nfeat, fscrit, 1, &val_crit, errc); else if(dr_method == PDR_FLOAT_FORWARD) { int real_featcount; index = float_fwd_select(tds, nfeat, fscrit, &val_crit, &real_featcount, errc); nfeat = real_featcount; } if (index) { fptr = fopen(fname, "w"); if (fptr) { names = tds->alab; for (i = 0; i < nfeat; i++) { fprintf(fptr, "%d\t%f", index[i]+1, val_crit); if (names) fprintf(fptr, "\t%s", names[index[i]]); fprintf(fptr, "\n"); } if (fclose(fptr)) { *errc = errno; *xname = strdup(fname); } } else { *errc = errno; *xname = strdup(fname); } } } } else *errc = PERR_UNDEFINED_TDS; } static void fsel_display(int *index, float *ranking, int d) { int i; int idx; char *msg; char *name; char **names; clear_screen(); inverse_video(); cursor_off(); printf(PCP_WLINE); printf(PCP_EMPTY_WLINE); printf("| S E L E C T E D F E A T U R E S |\n"); printf(PCP_EMPTY_WLINE); printf(PCP_WLINE); names = tds->alab; msg = malloc((PCP_WLEN+1)*sizeof(char)); for (i = 0; i < d; i++) { idx = index[i]; if (names) { name = strdup(names[idx]); if (strlen(name) > 74) name[74] = '\0'; sprintf(msg, "| %6d | %10.5f | %-74s |", index[i]+1, ranking[i], name); free(name); } else sprintf(msg, "| %6d | %10.5f | unnamed feature |", index[i], ranking[i]); print_line(stdout, msg, PCP_WLEN); } inverse_video(); printf(PCP_WLINE); free(msg); reset_video(); pwait(); } /* Extract top 'd' ranking features from the feature ranking file 'fname' into 'index'. 'frel' is the feature relevance (i.e., the quantity upon which the ranking is based upon). */ static int read_ranking(char *fname, int d, int fmax, int **index, float **frel, int *errc, char **xname) { int i; int idx; int imax; int dnx; int status; int fscrit; int *ifx; float *rx; char *line; char **tokens; FILE *fptr; fscrit = 0; ifx = (int *) 0; rx = (float *) 0; /* Read indexes. */ status = 0; idx = file_info(fname, &imax, &dnx, '\0'); if ((idx != -1) && (dnx >= 2)) { if (d == 0) d = idx; ifx = malloc(idx*sizeof(int)); if (frel) rx = malloc(idx*sizeof(float)); fptr = fopen(fname, "r"); if (fptr) { line = malloc(imax+2); i = 0; while (fgets(line, imax+2, fptr) && (i < idx) && !status) { tokens = str_tokenize(line, WHITESPACE); ifx[i] = atoi(tokens[0])-1; if (ifx[i]+1 > fmax) { status = -1; *errc = PERR_ILLEGAL_RNK_FILE; *xname = strdup(fname); } else { if (frel) rx[i] = atof(tokens[1]); i++; str_free(tokens); } } free(line); free(fname); if (!status) status = fclose(fptr); else fclose(fptr); } else { status = -1; *errc = errno; *xname = strdup(fname); } } else if (!idx) { status = -2; *errc = PERR_BAD_INPUT_FILE; *xname = strdup(fname); } else { status = -1; *errc = errno; *xname = strdup(fname); } if (!status) { *index = ifx; if (frel) *frel = rx; } return status; } /* Display the ranking of features in a user-defined file. */ void p_fdisp(int *errc, char **xname, int dbg) { int d; int status; int min_range; int imax; int *index; char *fname; float *frel; FILE *outdev; status = 0; *errc = 0; clear_screen(); outdev = stdout; cursor_on(); fname = input_filename(PCP_UMSG_FNAME_RNK, PCP_RNK, outdev); min_range = 1; imax = file_info(fname, (int *) 0, (int *) 0, '\0'); if (imax != -1) { d = input_nfeat(stdin, outdev, imax, PCP_DFLT_NFSEL); status = read_ranking(fname, d, tds->d, &index, &frel, errc, xname); if (!status) fsel_display(index, frel, d); } else { *errc = errno; *xname = strdup(fname); } } /* Extract and save a user-defined subset of features from current data sets. Optionally, replace the current data sets with the subset. */ void p_f_subset(int *errc, char **xname, int dbg) { int d; int min_range; int imax; int int_default; int status; int mode; int *index; char *msg; char *fname; float *frel; struct dataset *dset1 = (struct dataset *) 0; struct dataset *dset2 = (struct dataset *) 0; FILE *outdev; *errc = 0; if (tds) { status = 0; clear_screen(); outdev = stdout; cursor_on(); msg = malloc((PCP_QLEN+1)*sizeof(char)); fname = input_filename(PCP_UMSG_FNAME_RNK_2, PCP_RNK, outdev); mode = input_replace(&dset1, &dset2, "sel"); min_range = 1; int_default = 2; imax = tds->d; sprintf(msg, PCP_UMSG_NSEL, int_default); d = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &imax); free(msg); status = read_ranking(fname, d, imax, &index, &frel, errc, xname); if (!status) { /* Input file read. Now subselect dset1, dset2. */ if (dset1) status = dataset_inset(dset1, index, d); if (!status) { status = dataset_inset(dset2, index, d); if (!status) status = save_datasets(dset1, dset2, mode, errc, xname); vx_free(index); vx_free(frel); } else *errc = errno; } else if (status == -1) { *errc = errno; *xname = strdup(fname); } } else *errc = PERR_UNDEFINED_TDS; } /* subtracts vec from mat row and multiplies result with transposed self repeats for each row accumulating sum in result matrix. If multipliers is non-null multiplies result of each row iteration with corresponding multipliers element. Non-null multiplier must have rows elements. Result matrix needs to be pre-alocated as cols by cols and initialized if desired */ static int fmx_sum_sq_sub(float** mat, float* vec, int rows, int cols, float** result, float* multipliers) { float* tmp_vec = (float*) malloc(sizeof(float) * cols); int retval = -1; if(tmp_vec) { int i, j, k; for(i = 0; i != rows; i++) { for(j = 0; j != cols; j++) tmp_vec[j] = mat[i][j] - vec[j]; if(multipliers != NULL) /*pulled outside of loop to save time*/ for(j = 0; j != cols; j++) { result[j][j] += tmp_vec[j] * tmp_vec[j] * multipliers[i]; for(k = j+1; k != cols; k++) { float tres = tmp_vec[j] * tmp_vec[k] * multipliers[i]; result[j][k] += tres; result[k][j] += tres; } } else /*same thing but without multipliers*/ for(j = 0; j != cols; j++) { result[j][j] += tmp_vec[j] * tmp_vec[j]; for(k = j+1; k != cols; k++) { float tres = tmp_vec[j] * tmp_vec[k]; result[j][k] += tres; result[k][j] += tres; } } } free(tmp_vec); retval = 0; } return retval; } /* Compute the value of `select_crit' discrimination criterion for subset of `nfeat' features in `dset'. The subset is defined by `index'. All criteria implemented in this function have to satisfy: the larger the value of the criterion, the better. In case of success, set *errc to 0; otherwise, set error code in *errc. */ static double eval_crit(struct dataset *dset, int nfeat, int *index, int select_crit, int *errc) { int i; int j; int idx; int kmin; int kmax; int nn; int x_offset; int nxval; double crit; float *det; float **x; float **sig; float** scat_b; float** scat_w; float *bayes_l; float* cardinalities; float** inv_w = NULL; float** int_intra = NULL; struct dataset *xset; unsigned int seed; crit = 0.0; if (errc) *errc = 0; if (select_crit == PCP_FSUB_KNN) { xset = dataset_select(dset, index, nfeat); if (xset) { /* NOTE: nexp fixed to 1. nxval set to min(smallest class, 5). */ nxval = 5; i = ivec_min(xset->nd, xset->c); if (i < nxval) nxval = i; crit = xlearn_machine(xset, PALG_KNN, 1, nxval, errc); dataset_free(xset); crit = 100-crit; } else *errc = errno; } else { x = fmx_alloc(dset->nv, nfeat); if (x) { for (i = 0; i < nfeat; i++) { idx = index[i]; for (j = 0; j < dset->nv; j++) x[j][i] = dset->x[j][idx]; } if (select_crit == PCP_FSUB_BAYES) { seed = 1; kmin = 1; kmax = ivec_min(dset->nd, dset->c)-1; nn = kmax-kmin+1; sig = calc_sig(x, dset->c, nfeat, dset->nd, &det, errc); if (sig) { bayes_l = L_error(x, dset->nv, nfeat, dset->c, dset->nd, sig, det, kmin, kmax, (FILE *) 0, (FILE *) 0, errc); if (bayes_l) crit = 100-fvec_sum(bayes_l, nn)/nn; } } else if (select_crit == PCP_FSUB_IN_IN) { float** class_avgs = fmx_alloc(dset->c, nfeat); float* set_avg = (float*) calloc(nfeat, sizeof(float)); int curr_vec, i, j, k; if(class_avgs && set_avg) { fmx_set(class_avgs, dset->c, nfeat, 0.); curr_vec = 0; for(i = 0; i != dset->c; i++) { for(j = 0; j != (dset->nd)[i]; j++) { for(k = 0; k != nfeat; k++) { class_avgs[i][k] += x[curr_vec][k]; set_avg[k] += x[curr_vec][k]; } curr_vec++; } for(k = 0; k != nfeat; k++) class_avgs[i][k] /= (dset->nd)[i]; } for(k = 0; k != nfeat; k++) set_avg[k] /= curr_vec; scat_b = fmx_alloc(nfeat, nfeat); scat_w = NULL; if(scat_b) { fmx_set(scat_b, nfeat, nfeat, 0.); cardinalities = (float*) malloc(sizeof(float) * dset->c); for(i = 0; i != dset->c; i++) cardinalities[i] = (dset->nd)[i]; fmx_sum_sq_sub(class_avgs, set_avg, dset->c, nfeat, scat_b, cardinalities); free(cardinalities); scat_w = fmx_alloc(nfeat, nfeat); } if(scat_w) { fmx_set(scat_w, nfeat, nfeat, 0.); x_offset = 0; for(i = 0; i != dset->c; i++) { fmx_sum_sq_sub(x+x_offset, class_avgs[i], (dset->nd)[i], nfeat, scat_w, NULL); x_offset += (dset->nd)[i]; } for(i = 0; i != nfeat; i++) for(j = 0; j != nfeat; j++) { scat_b[i][j] /= (float)x_offset; scat_w[i][j] /= (float)x_offset; } inv_w = fmx_inv(scat_w, nfeat, (float*) NULL, (float*)NULL, errc); mx_free((void **) scat_w, nfeat); } int_intra = NULL; if (inv_w) { int_intra = fmx_mult(inv_w, nfeat, nfeat, scat_b, nfeat, 0); if(int_intra) { crit = 0; for(i = 0; i != nfeat; i++) crit += int_intra[i][i]; mx_free((void **) int_intra, nfeat); } else if(errc) *errc = errno; } mx_free((void **) inv_w, nfeat); mx_free((void **) scat_b, nfeat); free(set_avg); mx_free((void **) class_avgs, dset->c); } else if(errc) *errc = errno; } mx_free((void **) x, dset->nv); } else if (errc) *errc = errno; } return crit; } /* Allocates bit set of size bits */ static unsigned char* alloc_bitset(int size) { int char_size; unsigned char* ret; char_size = size / 8; if(size % 8 != 0) char_size++; ret = calloc(char_size, 8); return ret; } /* Test if bit at position is set. Returns 0 if not set, unspecified non-zero otherwise. */ static unsigned char test_bitset(unsigned char* bset, int position) { return bset[position / 8] & ((unsigned char)'\x80' >> (position % 8)); } /* Sets bit at position */ static void set_bitset(unsigned char* bset, int position) { bset[position / 8] |= ((unsigned char)'\x80' >> (position % 8)); } /* Resets bit at position */ static void unset_bitset(unsigned char* bset, int position) { bset[position / 8] &= ~((unsigned char)'\x80' >> (position % 8)); } /* Inverts bit at position */ static void flip_bitset(unsigned char* bset, int position) { bset[position / 8] ^= ((unsigned char)'\x80' >> (position % 8)); } /* Initialize combination enumerator to generate combinations of k elements from set of n elements. Generated combination is returned in array *a which represents combination as 0s and 1s. Successive combinations are obtained thry call to combination_next. In addition it to a it returns r and *w which are intended for use with combination_next. *w to be disposed by caller when done generating combinations as well as a*. Fails if n < k */ static void combination_init(int n, int k, int *r, unsigned char **a, unsigned char **w) { int i; *r = n-k; if(*r >= 0) { *a = alloc_bitset(n); *w = alloc_bitset(n+1); for (i=0; i < *r; i++) { unset_bitset(*a, i); set_bitset(*w, i); } for (i=*r; i < n; i++) { set_bitset(*a, i); set_bitset(*w, i); } set_bitset(*w, n); } } /* Generate next combination of k elements from set of n elements using r *a and *w initialized with combination_init. Next combination is found in *a, with r and *w being updated for use with next call. Returns 1 if next combination exists and output values were updated. Otherwise it returns 0 in which case output values are unchanged. Uses chase sequence algorithm created by Donald Knuth. This implementation based on C implementation by Glenn C. Rhoads. */ static int combination_next(int n, int k, int *r, unsigned char *a, unsigned char *w) { /* for (i = n-1; i >= 0; i--) printf( "%3d", a[i] ); printf( "\n" ); */ int i, ret; for (i=*r; !test_bitset(w, i); i++) set_bitset(w,i); if (i == n) ret = 0; else { unset_bitset(w, i); if (test_bitset(a, i)) { unset_bitset(a, i); if (ODD(i) || test_bitset(a, i-2)) { set_bitset(a, i-1); if (*r == i-1) *r = i; else if (*r == i && i > 1) *r = i-1; } else { set_bitset(a, i-2); if (*r == i-2) *r = i-1; else if (*r == i) { if (i > 3) *r = i-2; else *r = 1; } } } else { set_bitset(a, i); if (ODD(i) && !test_bitset(a, i-1)) { unset_bitset(a, i-2); if (*r == i-2) *r = i; else if (*r == i-1) *r = i-2; } else { unset_bitset(a, i-1); if (*r == i-1) *r = i; else if (*r == i && i > 1) *r = i-1; } } ret = 1; } return ret; } /*----*/ static void fill_index_set(unsigned char* included, int* index, int size) { int i; int j = 0; for(i = 0; i != size; i++) if(test_bitset(included, i)) index[j++] = i; } /* finds optimal combination of features to add to dataset using specified criteria. As side effect index array is sorted. dset - (in) full dataset i - (in) number of currently included features add_attrs - (in) number of features to remove index - (in) pre allocated array to temporary store feature indexes size dset->d included - (in) pre populated bitset with all features currently included same size as index tmp_index - (in) preallocated array of feature indices size sub_attrs max_index - (in, out) preallocated array to hold optimal combination of features to remove on return max_crit (in, out) value of criteria if features from max_index are added All combinations producing criteria below initial value are disregarded. select_crit - (in) number of criteria to apply errc - (out) error code if any return - number of features in optimal combination == sub_attrs or 0 if error */ static int fwd_step(struct dataset *dset, int i, int add_attrs, int* index, unsigned char *included, int* tmp_index, int* max_index, double* max_crit, int select_crit, int* errc) { int max_feat = 0; double curr_crit; int remaining_attrs = dset->d - i; unsigned char* to_include; unsigned char * util_set; int util_comb; combination_init(remaining_attrs, add_attrs, &util_comb, &to_include, &util_set); do { int addset_index = 0; int currset_index = 0; int attrs_added = 0; int tmp_count; fill_index_set(included, index, dset->d); while((currset_index != dset->d) && (attrs_added != add_attrs) && (addset_index != remaining_attrs)) { if(!test_bitset(included, currset_index)) { if(test_bitset(to_include, addset_index++)) { tmp_index[attrs_added++] = currset_index; set_bitset(included, currset_index); } } currset_index++; } fill_index_set(included, index, dset->d); curr_crit = eval_crit(dset, i + add_attrs, index, select_crit, errc); for(tmp_count = 0; tmp_count != attrs_added; tmp_count++) unset_bitset(included, tmp_index[tmp_count]); if(*errc != 0) break; if(curr_crit > *max_crit) { *max_crit = curr_crit; max_feat = 1; for(tmp_count = 0; tmp_count != add_attrs; tmp_count++) max_index[tmp_count] = tmp_index[tmp_count]; } /* if(!(++comb_tried % 1000)) fprintf(stderr, "Tried %d combinations, max criteria = %f\n", comb_tried, max_crit); */ } while(combination_next(remaining_attrs, add_attrs, &util_comb, to_include, util_set)); fill_index_set(included, index, dset->d); free(to_include); free(util_set); if(!max_feat) add_attrs = 0; /* fprintf(stderr, "Features = %d. crit = %f\n", i+add_attrs, max_crit);*/ return add_attrs; } /* Applies forward feature selection algorithm on the dataset dset - datset to select features from nfeat - number of features to select select_crit - ID of selection criteria l - number of features to simultaneously insert - complexity of this function is exponentialy dependent on this value so supplying anything more than 1 should not be considered lightly val_crit - value of criteria function for resulting feature set errc - error code from criteria function if any returns - internaly allocated array of 0-based feature indices or NULL in case of failure. Caller is responsible for deallocation. */ int *forward_select(struct dataset *dset, int nfeat, int select_crit, int l, double *val_crit, int *errc) { int* index = calloc(nfeat, sizeof(int)); int* tmp_index = calloc(l, sizeof(int)); int* max_index = calloc(l, sizeof(int)); unsigned char* included = alloc_bitset(dset->d); int i, j; int add_attrs = l; int attrs_added = 0; double max_crit; *errc = 0; for(i = 0; i != nfeat; i+= add_attrs) { if(i + add_attrs > nfeat) add_attrs = nfeat - i; max_crit = 0; attrs_added = fwd_step(dset, i, add_attrs, index, included, tmp_index, max_index, &max_crit, select_crit, errc); for(j = 0; j != attrs_added; j++) { viprint_line(6, 1, "Added feature %7d (%4d/%4d); criterion value: %f", max_index[j]+1, i+j+1, nfeat, max_crit); set_bitset(included, max_index[j]); } } fill_index_set(included, index, dset->d); free(included); free(max_index); free(tmp_index); if(*errc != 0) { fprintf(stderr, "Error %d in forward select\n", *errc); free(index); index = NULL; } else if (val_crit) *val_crit = max_crit; return index; } /* finds optimal combination of features to remove from dataset using specified criteria. As side effect index array is sorted. dset - (in) full dataset i - (in) number of currently included features sub_attrs - (in) number of features to remove index - (in) pre allocated array to temporary store feature indexes size dset->d included - (in) pre populated bitset with all features currently incluses same size as index tmp_index - (in) preallocated array of feature indices size sub_attrs max_index - (in, out) preallocated array to hold optimal combination of features to remove on return max_crit (in, out) value of criteria if features from max_index are removed. All combinations producing criteria below initial value are disregarded. select_crit - (in) number of criteria to apply errc - (out) error code if any return - number of features in optimal combination == sub_attrs or 0 if error */ static int backward_step(struct dataset *dset, int i, int sub_attrs, int* index, unsigned char *included, int* tmp_index, int* max_index, double* max_crit, int select_crit, int* errc) { int max_feat = 0; double curr_crit; unsigned char* to_exclude; unsigned char* util_set; int util_comb; combination_init(i, sub_attrs, &util_comb, &to_exclude, &util_set); do { int subset_index = 0; int currset_index = 0; int attrs_subed = 0; int tmp_count; fill_index_set(included, index, dset->d); while((currset_index != dset->d) && (attrs_subed != sub_attrs) && (subset_index != i)) { if(test_bitset(included, currset_index)) { if(test_bitset(to_exclude, subset_index++)) { unset_bitset(included, currset_index); tmp_index[attrs_subed++] = currset_index; } } currset_index++; } fill_index_set(included, index, dset->d); curr_crit = eval_crit(dset, i - attrs_subed, index, select_crit, errc); for(tmp_count = 0; tmp_count != attrs_subed; tmp_count++) { set_bitset(included, tmp_index[tmp_count]); } if(*errc != 0) break; if(curr_crit > *max_crit) { *max_crit = curr_crit; max_feat = 1; for(tmp_count = 0; tmp_count != attrs_subed; tmp_count++) max_index[tmp_count] = tmp_index[tmp_count]; } } while(combination_next(i, sub_attrs, &util_comb, to_exclude, util_set)); fill_index_set(included, index, dset->d); free(to_exclude); free(util_set); if(!max_feat) sub_attrs = 0; return sub_attrs; } /* Applies backward feature selection algorithm on the dataset dset - datset to select features from nfeat - number of features to select select_crit - ID of selection criteria l - number of features to simultaneously remove - complexity of this function is exponentialy dependent on this value so supplying anything more than 1 should not be considered lightly val_crit - value of criteria function for resulting feature set errc - error code from criteria function if any returns - internaly allocated array of 0-based feature indices or NULL in case of failure. Caller is responsible for deallocation. */ int *backward_select(struct dataset *dset, int nfeat, int select_crit, int r, double *val_crit, int *errc) { int* index = calloc(dset->d, sizeof(int)); int* tmp_index = calloc(r, sizeof(int)); int* max_index = calloc(r, sizeof(int)); unsigned char* included = alloc_bitset(dset->d); int inc_size = dset->d / 8; int i, j; double max_crit; int sub_attrs = r; if((dset->d % 8) != 0) inc_size++; for(i = 0; i != inc_size; i++) included[i] = ~(included[i]); *errc = 0; for(i = dset->d; i != nfeat; i-= sub_attrs) { int attrs_subed; max_crit = 0.; if(i - sub_attrs < nfeat) sub_attrs = i - nfeat; attrs_subed = backward_step(dset, i, sub_attrs, index, included, tmp_index, max_index, &max_crit, select_crit, errc); if(*errc != 0) break; for(j = 0; j != attrs_subed; j++) { viprint_line(6, 1, "Removed feature %d (%d/%d); criterion value: %f", max_index[j], dset->d - i + 1, dset->d - nfeat, max_crit); unset_bitset(included, max_index[j]); } } if(*errc != 0) { free(index); index = NULL; fprintf(stderr, "Error %d in backward select\n", *errc); } else fill_index_set(included, index, dset->d); free(included); free(max_index); free(tmp_index); if (val_crit) *val_crit = max_crit; return index; } /* Applies Pudil's floating forward feature selection algorithm on the dataset dset - datset to select features from nfeat - number of features to select select_crit - ID of selection criteria val_crit (out) - value of criteria function for resulting feature set current_attrs (out) - actual number of attributes in result set errc (out) - error code from criteria function if any returns - internaly allocated array of 0-based feature indices or NULL in case of failure. Caller is responsible for deallocation. */ int *float_fwd_select(struct dataset *dset, int nfeat, int select_crit, double *val_crit, int* current_attrs, int *errc) { int* index = calloc(dset->d, sizeof(int)); int tmp_index; int add_index; int sub_index; unsigned char* included = alloc_bitset(dset->d); double max_crit = 0.; double* crit_vals = calloc(nfeat + 1, sizeof(double)); int progress = 1; *errc = 0; *current_attrs = 0; while(*current_attrs != nfeat && progress) { int attrs_added = 0; int attrs_subed = 0; progress = 0; max_crit = crit_vals[*current_attrs]; attrs_added = fwd_step(dset, *current_attrs, 1, index, included, &tmp_index, &add_index, &max_crit, select_crit, errc); if(*errc) break; if(attrs_added) { set_bitset(included, add_index); fill_index_set(included, index, dset->d); crit_vals[++(*current_attrs)] = max_crit; #if defined(_FSELECT_DEBUG) fprintf(stderr, "Added feature %d\n", add_index); #endif progress = 1; } while(*current_attrs > 2) { max_crit = crit_vals[*current_attrs - 1]; attrs_subed = backward_step(dset, *current_attrs, 1, index, included, &tmp_index, &sub_index, &max_crit, select_crit, errc); if(*errc) break; if(attrs_subed && (max_crit > crit_vals[*current_attrs - 1])) { unset_bitset(included, sub_index); (*current_attrs)--; crit_vals[*current_attrs] = max_crit; #if defined(_FSELECT_DEBUG) fprintf(stderr, "Removed feature %d\n", sub_index); #endif progress = 1; } else break; } } if(*errc) { free(index); index = NULL; } else { *val_crit = crit_vals[*current_attrs - 1]; fill_index_set(included, index, dset->d); } free(crit_vals); return index; } /* Select optimal `nfeat' features of `dset', using `select_crit' criterion and plus_l_minus_r method with parameters `l' and `r'. The function returns the indices of the optimal subset. The optimal value of the criterion is set in `val_crit'. */ static int *plus_l_minus_r(struct dataset *dset, int nfeat, int select_crit, int l, int r, double *val_crit) { int *index; index = (int *) 0; return index; } /* Compute optimal subset of `d' features in `dset' using feature selection method `fsel_method' and feature subset evaluation criterion `fscrit'. If `dsflag' is 1, return the optimal-feature subset of `dset'. If `index' is not NULL, return 0-based indices of selected features in 'index'. In case of error, return NULL and set errc. However, if `dsflag' is 0, the function returns NULL, so you have to check errc for errors. */ struct dataset *select_subset(struct dataset *dset, int d, int fsel_method, int fscrit, int **index, int dsflag, int *errc) { int l = 1; int r = 1; double val_crit; int *l_index; float *ranking; struct dataset *subset; if (errc) *errc = 0; subset = (struct dataset *) 0; if ((d > 0) && (d <= dset->d)) { if (fsel_method == PDR_RANKING) { ranking = rank_features(dset, fscrit, &l_index); if (ranking) { subset = dataset_select(dset, l_index, d); free(ranking); if (index) *index = l_index; else free(l_index); } } else if ((fsel_method == PDR_FORWARD) || (fsel_method == PDR_BACKWARD) || (fsel_method == PDR_PLUS_L_MINUS_R) || (fsel_method == PDR_FLOAT_FORWARD) || (fsel_method == PDR_BB)) { if (fsel_method == PDR_FORWARD) { l_index = forward_select(dset, d, fscrit, l, &val_crit, errc); } else if (fsel_method == PDR_BACKWARD) { l_index = backward_select(dset, d, fscrit, r, &val_crit, errc); } else if (fsel_method == PDR_FLOAT_FORWARD) { int really_selected; l_index = float_fwd_select(dset, d, fscrit, &val_crit, &really_selected, errc); d = really_selected; } else if (fsel_method == PDR_BB) { ; } else if (fsel_method == PDR_PLUS_L_MINUS_R) { l = PCP_FSEL_LPAR; r = PCP_FSEL_RPAR; l_index = plus_l_minus_r(dset, d, fscrit, l, r, &val_crit); } if (l_index) { if (dsflag == 1) subset = dataset_select(dset, l_index, d); if (index) *index = l_index; else free(l_index); } } } return subset; }