/* Module name: pau.c Created by: Ljubomir Buturovic Created: 02/18/2002 Purpose: assorted utilities for PCP. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include "lerr.h" #include "lau.h" #include "lmat.h" #include "lin.h" #include "pau.h" #include "ddset.h" #include "mlp.h" #include "pcp.h" #include "hash.h" #include "hash_util.h" /* Log matrix. */ void log_mx(float **mx, int rows, int columns, FILE *fdbg) { int i; int j; if (fdbg != (FILE *) 0) for (i = 0; i < rows; i++) { fprintf(fdbg, "row %5d: ", i); for (j = 0; j < columns; j++) fprintf(fdbg, "%12.5g ", mx[i][j]); fprintf(fdbg, "\n"); } } /* Log columns of 'mx'. */ void log_mxt(float **mx, int rows, int columns, FILE *fdbg) { int i; int j; if (fdbg != (FILE *) 0) for (i = 0; i < columns; i++) { fprintf(fdbg, "column %5d: ", i); for (j = 0; j < rows; j++) fprintf(fdbg, "%12.5g ", mx[j][i]); fprintf(fdbg, "\n"); } } /* Calculate and display covariance matrix and matrix of linear correlation coefficients for data in 'xmx'. */ void covar(float **xmx, int d, int nv, int *errc, int dbg) { int i; int status; int max_row; int max_column; float fmax; float **sigma; FILE *outdev; FILE *fdbg = (FILE *) 0; outdev = fopen("/dev/stdout", "w"); status = 0; if (dbg > 0) fdbg = fopen(PCP_DBG, "a"); sigma = cest(xmx, d, nv, COVARIANCE_MATRIX); inverse_video(); printf("Covariance matrix:\n"); log_mx(sigma, d, d, stdout); fmx_max(sigma, d, d, &max_row, &max_column, &fmax); printf("Maximal element (%5d)(%5d): %12.5g\n", max_row+1, max_column+1, fmax); correst(sigma, d); printf("\nMatrix of linear correlations:\n"); log_mx(sigma, d, d, stdout); if (d > 1) { /* To find max. element in correlation matrix, first set diagonal to large negative values (because by definition, the diagonal elements are already 1, and therefore max.) */ for (i = 0; i < d; i++) sigma[i][i] = -FLT_MAX; fmx_max(sigma, d, d, &max_row, &max_column, &fmax); printf("Maximal off-diagonal element (%5d)(%5d): %12.5g\n", max_row+1, max_column+1, fmax); } reset_video(); mx_free((void **) sigma, d); } /* Terminate 'string' after the first token. */ static void token_string(char *string) { char *str; str = string+strspn(string, WHITESPACE); str = str+strcspn(string, WHITESPACE); *str = '\0'; } /* Print question 'msg' on 'outdev', read integer response from 'indev'. The function loops until the answer is in ['min_range', 'max_range']. 'length' is total length of question line. If 'dflt' is not NULL, the function returns that value when the user enters empty line. In case of error return EOF. The errors are bad arguments ('msg', 'indev' or 'outdev' NULL) or malloc() error. In the latter case set 'errno'. */ int input_integer(FILE *indev, FILE *outdev, char *msg, int length, int *dflt, int *min_range, int *max_range) { int idx; int done = 0; char *string; char *emsg; errno = 0; idx = EOF; if (msg && indev && outdev) { emsg = malloc((length+1)*sizeof(char)); string = malloc((P_MAX_LINE_LEN+2)*sizeof(char)); if (emsg && string) while (!done) { print_line(outdev, msg, length); fgets(string, P_MAX_LINE_LEN+1, indev); token_string(string); idx = atoi(string); if (!idx && dflt) { idx = *dflt; done = 1; } else if (!min_range && !max_range) done = 1; else if (min_range && !max_range) { if (idx >= *min_range) done = 1; else { sprintf(emsg, "The value must be greater than or equal to %5d.", *min_range); print_line(outdev, emsg, length); fprintf(outdev, "\n"); } } else if (!min_range && max_range) { if (idx <= *max_range) done = 1; else { sprintf(emsg, "The value must be less or equal than %5d.", *max_range); print_line(outdev, emsg, length); fprintf(outdev, "\n"); } } else if (min_range && max_range) { if ((idx >= *min_range) && (idx <= *max_range)) done = 1; else { sprintf(emsg, "The value must be between %5d and %5d.", *min_range, *max_range); print_line(outdev, emsg, length); fprintf(outdev, "\n"); } } } free(emsg); free(string); } return idx; } /* Print question 'msg' on 'outdev', read integer response from 'indev'. The function repeats the question until the response is one of the values in vector 'choices' of length 'len'. 'length' is total length of question line. If 'dflt' is not NULL, the function returns that value when the user enters empty line. In case of error return EOF. The errors are bad arguments ('msg', 'indev' or 'outdev' NULL) or malloc() error. In the latter case set 'errno'. */ int input_choice(FILE *indev, FILE *outdev, char *msg, int *dflt, int *choices, int len) { int i; int idx; int done; char *string; errno = 0; idx = EOF; done = 0; if (msg && indev && outdev && choices && (len > 0)) { string = malloc((P_MAX_LINE_LEN+2)*sizeof(char)); if (string) while (done == 0) { viprint_line(2, 1, msg); fgets(string, P_MAX_LINE_LEN+1, indev); token_string(string); idx = atoi(string); if ((idx == 0) && dflt) { idx = *dflt; done = 1; } else { for (i = 0; (i < len) && !done; i++) if (idx == choices[i]) done = 1; } if (!done) { viprint_line(2, 1, PCP_UMSG_CHOICE); printf("\n"); } } free(string); } return idx; } static int decode_format(int if1, int if2) { int format; format = DATASET_FF_RAW; if ((if1 == 0) && (if2 == 0)) format = DATASET_FF_RAW; else if ((if1 == 1) && (if2 == 0)) format = DATASET_FF_COL; else if ((if1 == 0) && (if2 == 1)) format = DATASET_FF_VEC; else if ((if1 == 1) && (if2 == 1)) format = DATASET_FF_COLVEC; return format; } /* Read dataset file format from 'indev'. */ int input_format(FILE *indev, FILE *outdev) { int dflt; int min_range; int max_range; int if1; int if2; int format; dflt = 0; min_range = 0; max_range = 1; if1 = input_integer(indev, outdev, PCP_UMSG_HFORMAT, PCP_QLEN, &dflt, &min_range, &max_range); if2 = input_integer(indev, outdev, PCP_UMSG_RFORMAT, PCP_QLEN, &dflt, &min_range, &max_range); format = decode_format(if1, if2); return format; } /* Float version of input_integer(). The input value must be in [min_range, max_range]. */ float input_float(FILE *indev, FILE *outdev, char *msg, int length, float *dflt, float *min_range, float *max_range) { float fdx; int done = 0; int len; char *string; char *emsg; errno = 0; fdx = 0.0; if (msg && indev && outdev) { emsg = malloc((length+1)*sizeof(char)); string = malloc((P_MAX_LINE_LEN+2)*sizeof(char)); if (emsg && string) while (done == 0) { print_line(outdev, msg, length); fgets(string, P_MAX_LINE_LEN+1, indev); token_string(string); len = strspn(string, DIGITS); string[len] = '\0'; fdx = (float) atof(string); if ((fdx == 0) && (dflt != (float *) 0)) { fdx = *dflt; done = 1; } else if (!min_range && !max_range) done = 1; else if (min_range && !max_range) { if (fdx >= *min_range) done = 1; else { sprintf(emsg, "The value must be greater than or equal to %5f.", *min_range); print_line(outdev, emsg, length); fprintf(outdev, "\n"); } } else if (!min_range && !max_range) { if (fdx <= *max_range) done = 1; else { sprintf(emsg, "The value must be less or equal %5f.", *max_range); print_line(outdev, emsg, length); fprintf(outdev, "\n"); } } else if (min_range && max_range) { if ((fdx >= *min_range) && (fdx <= *max_range)) done = 1; else { sprintf(emsg, "The value must be between %5f and %5f.", *min_range, *max_range); print_line(outdev, emsg, length); fprintf(outdev, "\n"); } } } free(emsg); free(string); } return fdx; } /* Print question 'msg' on 'outdev', read response from 'indev'. Total length of question line is 'length'. The function returns the string response. The max. name of the response is limited to the maximum length of the file name in the current filesystem, as determined by statfs(). It is the caller's responsibility to free() the returned string. If user enters empty string, returns empty string. In case of illegal arguments, returns (char *) 0 and sets errno to 0. In case of error, returns (char *) 0 and sets errno. */ static char *input_fname(FILE *indev, FILE *outdev, char *msg, int length) { int maxlen; char *string; char *xtr; string = (char *) 0; errno = 0; if (msg && indev && outdev && (length > 0)) { maxlen = get_namelen(); if (maxlen != -1) { string = malloc((maxlen+1)*sizeof(char)); if (string) { print_line(outdev, msg, length); fgets(string, maxlen+1, indev); xtr = strrchr(string, '\n'); if (xtr) *xtr = '\0'; /* Remove trailing CR, if present. This may happen when reading filenames on Windows. */ xtr = strrchr(string, '\r'); if (xtr) *xtr = '\0'; } } } return string; } /* Get filename from stdin. 'dflt' is the default filename. The query message is 'txt[dflt]:' or 'txt', however at most PCP_QLEN of the message is printed. */ char *input_filename(char *txt, char *dflt, FILE *outdev) { char *message; char *fname; message = malloc((PCP_QLEN+1)*sizeof(char)); if (dflt) snprintf(message, PCP_QLEN+1, "%s[%s]:", txt, dflt); else snprintf(message, PCP_QLEN+1, "%s", txt); fname = input_fname(stdin, outdev, message, PCP_QLEN); free(message); if ((!fname || !strlen(fname)) && dflt) fname = strdup(dflt); return fname; } /* Read input data: number of features, number of classes, number of vectors per class, and input files. Return the populated 'dataset' structure. In case of error, return (struct dataset *) 0 and set 'errc' to error code. Error codes are PERR_BAD_INPUT_FILE, LERR_FILE_FORMAT, or error codes. In case of file-related errors, 'fname' has name of file which caused the error. */ struct dataset *pcp_input(FILE *indev, FILE *outdev, int *errc, char **xname) { int i; int status; int line_count; int min_range; int errno_save; int ntok; int fmt; int c; int d; char *msg = (char *) 0; int *nd; char **fnames; struct dataset *dset; *errc = 0; d = 0; status = 0; dset = (struct dataset *) 0; fmt = input_format(indev, outdev); min_range = 1; c = input_integer(indev, outdev, PCP_UMSG_NCLASSES, PCP_QLEN, (int *) 0, &min_range, (int *) 0); if (c != EOF) { nd = malloc(c*sizeof(int)); if (nd) { msg = malloc((PCP_QLEN+1)*sizeof(char)); if (msg) { fnames = calloc(c, sizeof(char *)); if (fnames) { for (i = 0; (i < c) && (status == 0); i++) { /* Read the file names and check the format. */ sprintf(msg, "%s %5d:", PCP_UMSG_CLASS_FNAME, i+1); fnames[i] = input_fname(indev, outdev, msg, PCP_QLEN); if (fnames[i] == (char *) 0) status = -1; else { line_count = file_info(fnames[i], (int *) 0, &ntok, '\0'); if (i == 0) { if ((fmt == DATASET_FF_RAW) || (fmt == DATASET_FF_COL)) d = ntok; else if ((fmt == DATASET_FF_VEC) || (fmt == DATASET_FF_COLVEC)) d = ntok-1; } if ((line_count != -1) && (d > 0)) { if ((fmt == DATASET_FF_RAW) || (fmt == DATASET_FF_VEC)) nd[i] = line_count; else if ((fmt == DATASET_FF_COL) || (fmt == DATASET_FF_COLVEC)) nd[i] = line_count-1; } else { status = -1; *xname = strdup(fnames[i]); if (d == -1) *errc = PERR_BAD_INPUT_FILE; if (ntok == -1) *errc = LERR_VAR_NCOL; mx_free((void **) fnames, c); } } } if (!status) { dset = load_dataset(d, c, nd, fnames, fmt, errc, xname); if (!dset) status = -1; } } } else status = -1; } else status = -1; } else status = -1; if (status == -1) { errno_save = errno; dset = dataset_free(dset); if (!*errc) *errc = errno_save; } free(msg); return dset; } /* Print 'msg' to 'outdev', centered within a string of 'length' characters. If 'msg' is longer than 'length', it is truncated. */ void center_line(FILE *outdev, char *msg, int length) { int len; int l1; int l2; char *prefix; char *suffix; char *str; len = strlen(msg); if (len >= length) { str = strdup(msg); str[length] = '\0'; printf("%s", str); } else { l1 = (length-len)/2; l2 = length-len-l1; prefix = malloc(l1+1); suffix = malloc(l2+1); memset(prefix, ' ', l1); prefix[l1] = '\0'; memset(suffix, ' ', l2); suffix[l2] = '\0'; fprintf(outdev, "%s%s%s", prefix, msg, suffix); } } /* Print 'msg' to 'outdev'. Fill remainder up to 'length' with space characters. */ void print_line(FILE *outdev, char *msg, int length) { int len; char blank[PCP_QLEN+1]; if (outdev) { inverse_video(); if (msg && *msg) len = length-strlen(msg); else len = length; if (len > 0) { memset(blank, ' ', len); blank[len] = '\0'; fprintf(outdev, "%s%s\n", msg, blank); } else { msg[length] = '\0'; fprintf(outdev, "%s\n", msg); } reset_video(); } } /* Similar to print_line(), but doesn't set video, and doesn't print newline. */ void print_ln(FILE *outdev, char *msg, int length) { int len; char *ptr; char blank[PCP_QLEN+1]; if (msg && *msg) { if ((ptr = strrchr(msg, '\n'))) *ptr = '\0'; len = length-strlen(msg); } else len = length; memset(blank, ' ', len); blank[len] = '\0'; fprintf(outdev, "%s%s", msg, blank); } static void vpr_line(va_list VariableArgsPtr) { int i; int len; int msglen; int nlines; int nrem; char *format; char *msg; char *blank; msg = malloc((PCP_LONG_LEN+1)*sizeof(char)); format = va_arg(VariableArgsPtr, char *); vsnprintf(msg, PCP_LONG_LEN+1, format, VariableArgsPtr); msglen = strlen(msg); nlines = msglen/PCP_QLEN; /* number of complete lines */ nrem = msglen-nlines*PCP_QLEN; /* number of characters in the incomplete line */ if (nrem > 0) { len = PCP_QLEN-nrem; /* number of blank characters to fill the incomplete line */ blank = malloc((len+1)*sizeof(char)); memset(blank, ' ', len); blank[len] = '\0'; msglen += len; } else blank = strdup(""); strcat(msg, blank); for (i = 0; i < msglen; i++) { if ((i > 0) && ((i%PCP_QLEN) == 0)) putchar('\n'); putchar(msg[i]); } putchar('\n'); va_end(VariableArgsPtr); free(msg); free(blank); } /* Print fixed-length message, with variable number of arguments. 'nargs' is the number of arguments following 'nargs'. The arguments are the same as for printf() and friends. TBD: this function doesn't seem to work... and is unused anyway. Remove? ljb, 11/26/2005. */ void vprint_line(int nargs, ...) { va_list VariableArgsPtr; va_start(VariableArgsPtr, nargs); vpr_line(VariableArgsPtr); va_end(VariableArgsPtr); } /* Like vprint_line(), with inverse video. First argument after `nargs' is an integer. If it equals one, display message in inverse video, otherwise display in normal video. */ void viprint_line(int nargs, ...) { int video; va_list VariableArgsPtr; va_start(VariableArgsPtr, nargs); video = va_arg(VariableArgsPtr, int); if (video) inverse_video(); vpr_line(VariableArgsPtr); va_end(VariableArgsPtr); if (video) reset_video(); } /* Return pseudo-random number in [0, 1]. */ float float_rand(void) { float frand; frand = rand()/(RAND_MAX+1.0); return frand; } /* Display control functions. */ #define STR_RESET "[0m" void cursor_on(void) { char escape; escape = ESCAPE_CHAR; printf("%c[?25h", escape); fflush(stdout); } void cursor_off(void) { char escape; escape = ESCAPE_CHAR; printf("%c[?25l", escape); fflush(stdout); } void reset_video(void) { int fd; int count; char escape; char buf[10]; fd = fileno(stdout); escape = ESCAPE_CHAR; buf[0] = escape; strcpy(&buf[1], STR_RESET); count = strlen(STR_RESET); write(fd, buf, count+1); } void clear_screen(void) { char escape; escape = ESCAPE_CHAR; printf("%c[2J", escape); cpos(0, 0); } /* Position cursor at (x, y). x is column, y is row. (0, 0) is upper left corner. */ void cpos(int x, int y) { char escape; char xdigit1; char xdigit2; char ydigit1; char ydigit2; escape = ESCAPE_CHAR; xdigit1 = '0'+(x+1)/10; xdigit2 = '0'+(x+1)%10; ydigit1 = '0'+(y+1)/10; ydigit2 = '0'+(y+1)%10; printf("%c[%c%c;%c%cH", escape, ydigit1, ydigit2, xdigit1, xdigit2); } void inverse_video(void) { char escape; char color[3]; escape = ESCAPE_CHAR; memcpy(color, PCP_VT100_COLOR, strlen(PCP_VT100_COLOR)); color[2] = '\0'; printf("%c[0;7;1;%sm", escape, color); } unsigned int input_seed(FILE *indev, FILE *outdev) { int min_range; unsigned int seed; char *msg; min_range = 1; msg = malloc((PCP_QLEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_SEED, min_range); seed = input_integer(stdin, outdev, msg, PCP_QLEN, &min_range, &min_range, (int *) 0); free(msg); return seed; } int input_nmodels(FILE *indev, FILE *outdev) { int min_range; int int_default; int nmodels; char *msg; min_range = 1; msg = malloc((PCP_QLEN+1)*sizeof(char)); int_default = 10; sprintf(msg, PCP_UMSG_NCOMB, int_default); nmodels = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); free(msg); return nmodels; } /* Enter dimension of transformed space for linear dimension reduction. The arguments have obvious meanings. */ int input_d(FILE *indev, FILE *outdev, int max_value, int default_value) { int min_range; int max_range; int int_default; int d; char *msg; min_range = 1; max_range = max_value; int_default = default_value; msg = malloc((PCP_QLEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_NDR, max_range, int_default); d = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &max_range); free(msg); return d; } /* Enter number of features to select for feature subset selection. */ int input_nfeat(FILE *indev, FILE *outdev, int max_value, int default_value) { int min_range; int max_range; int int_default; int d; char *msg; min_range = 1; max_range = max_value; int_default = default_value; msg = malloc((PCP_LONG_LEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_NFEAT, max_range, int_default); d = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &max_range); free(msg); return d; } int input_nexp(char *msg) { int dflt; int idx; int nexp; dflt = PCP_DFLT_NEXP; snprintf(msg, PCP_QLEN+1, PCP_UMSG_NEXP, dflt); idx = 1; nexp = input_integer(stdin, stdout, msg, PCP_QLEN, &dflt, &idx, (int *) 0); return nexp; } int input_nxval(FILE *indev, FILE *outdev, int max_nxval) { int min_range; int int_default; int max_range; int nxval; char *msg; min_range = 2; msg = malloc((PCP_QLEN+1)*sizeof(char)); if (max_nxval >= 10) int_default = 10; else int_default = max_nxval; max_range = max_nxval; sprintf(msg, PCP_UMSG_NXVAL, min_range, max_range, int_default); nxval = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &max_range); free(msg); return nxval; } /* Read MLP parameters from stdin. */ int input_mlp(FILE *outdev, int inputs, int outputs, int nvec, int *mlp_continue, int *nlayers, int **npl, int *itmax, float *range, int *opt_method, float *eta, float *mu, int *nxval, int max_nxval, int *nmlp, unsigned int *seed, char **fname) { int i; int status = 0; int ixd = 0; int min_range; int int_default; int max_range; int new_mlp_flag; int nlay; float minf_range; float maxf_range; float float_default; int *xpl; char *msg; FILE *fptr; fptr = (FILE *) 0; new_mlp_flag = 0; clear_screen(); cursor_on(); msg = malloc((PCP_QLEN+1)*sizeof(char)); if (mlp_continue) { min_range = 0; max_range = 1; ixd = input_integer(stdin, outdev, "Begin learning (0), or continue (1) [0]:", PCP_QLEN, &min_range, &min_range, &max_range); *mlp_continue = ixd; } if (!mlp_continue || (*mlp_continue == 0)) new_mlp_flag = 1; if (fname) { if (nxval) *fname = input_filename(PCP_UMSG_XMP_FNAME, PCP_XMP, outdev); else *fname = input_filename(PCP_UMSG_FNAME_MLP, PCP_MLP, outdev); if (new_mlp_flag == 1) fptr = fopen(*fname, "w"); else fptr = fopen(*fname, "r"); } if (fptr || !fname) { if (fname) fclose(fptr); if (new_mlp_flag == 1) { if (nlayers) { min_range = 1; ixd = input_integer(stdin, outdev, PCP_UMSG_NHIDDEN, PCP_QLEN, &min_range, &min_range, (int *) 0); } else ixd = 1; } if (ixd != EOF) { if (new_mlp_flag == 1) { nlay = ixd+1; if (nlayers) { *nlayers = nlay; sprintf(msg, "Number of inputs is %5d.", inputs); print_line(outdev, msg, PCP_QLEN); sprintf(msg, "Number of output nodes is %5d.", outputs); print_line(outdev, msg, PCP_QLEN); printf("\n"); xpl = malloc(nlay*sizeof(int)); for (i = 0; i < nlay-1; i++) { min_range = 1; sprintf(msg, PCP_UMSG_NNODES, i+1); ixd = input_integer(stdin, outdev, msg, PCP_QLEN, (int *) 0, &min_range, (int *) 0); xpl[i] = ixd; } xpl[nlay-1] = outputs; *npl = xpl; } } if (new_mlp_flag == 1) { if (seed) *seed = input_seed(stdin, outdev); minf_range = 0.01; float_default = 1.0; sprintf(msg, PCP_UMSG_INIT_WEIGHTS, float_default); *range = input_float(stdin, outdev, msg, PCP_QLEN, &float_default, &minf_range, (float *) 0); } min_range = 1; max_range = 2; int_default = MLP_OPT_CGPLUS; sprintf(msg, PCP_UMSG_OPT_METHOD, MLP_OPT_CGPLUS, MLP_OPT_GRADIENT_DESCENT, int_default); *opt_method = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); if (*opt_method == MLP_OPT_GRADIENT_DESCENT) { minf_range = FLT_MIN; float_default = 0.01; sprintf(msg, PCP_UMSG_LRATE, float_default); *eta = input_float(stdin, outdev, msg, PCP_QLEN, &float_default, &minf_range, (float *) 0); minf_range = 0.0; maxf_range = 1.0-FLT_MIN; float_default = 0.0; sprintf(msg, PCP_UMSG_MOMENTUM, float_default); *mu = input_float(stdin, outdev, msg, PCP_QLEN, &float_default, &minf_range, &maxf_range); } if (itmax) { min_range = -1; int_default = 100; sprintf(msg, PCP_UMSG_NMAXIT, int_default); *itmax = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); } if (new_mlp_flag == 1) if (nxval) *nxval = input_nxval(stdin, outdev, max_nxval); if (nmlp) { int_default = 10; min_range = 1; sprintf(msg, PCP_UMSG_NMLP, int_default); *nmlp = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); } } else status = -1; } else status = -1; free(msg); return status; } /* Input k-NN distance code. */ int input_knn_dist(FILE *outdev) { int dist; int *dch; int int_default; char *msg; msg = malloc((PCP_QLEN+1)*sizeof(char)); int_default = EUCLIDEAN_DIST; dch = malloc(PCP_N_DIST*sizeof(int)); dch[0] = EUCLIDEAN_DIST; dch[1] = CITY_BLOCK_DIST; dch[2] = MAHALANOBIS_DIST; snprintf(msg, PCP_QLEN+1, PCP_UMSG_KDIST, dch[0], dch[1], dch[2], int_default); dist = input_choice(stdin, outdev, msg, &int_default, dch, PCP_N_DIST); free(dch); free(msg); return dist; } /* Get ensemble variant of the original method (bagging or boosting). */ int input_ensemble_method(int method) { int mtd; int dtd; int dch[2]; int int_default; char *msg; msg = malloc((PCP_QLEN+1)*sizeof(char)); dch[0] = 0; dch[1] = 1; int_default = 0; snprintf(msg, PCP_QLEN+1, PCP_UMSG_ENSEMBLE, dch[0], dch[1], int_default); dtd = input_choice(stdin, stdout, msg, &int_default, dch, PCP_N_DIST); if (method == PALG_SVM) { if (dtd == 0) mtd = PALG_BAG_SVM; else mtd = PALG_ADABOOST_SVM; } free(msg); return mtd; } /* Get transformation (SVD, scaling) mode. 0: use training and test sets to compute the transformation 1: use only the training set */ int input_transform_mode() { int mode; int dch[2]; int int_default; char *msg; msg = malloc((PCP_QLEN+1)*sizeof(char)); dch[0] = 0; dch[1] = 1; int_default = 0; snprintf(msg, PCP_QLEN+1, PCP_UMSG_SVD_MODE, dch[1], dch[0], int_default); mode = input_choice(stdin, stdout, msg, &int_default, dch, 2); free(msg); return mode; } /* Read k-NN parameters from stdin. In case of failure, return -1 and set errno. Possible errors are fopen() errors. */ int input_knn(FILE *outdev, int *nxval, int max_nxval, int *nmlp, int *k, int maxk, unsigned int *seed, char **fname, int *dist) { int status = 0; int min_range; int max_range; int int_default; int new_mlp_flag; char *msg; FILE *fptr; new_mlp_flag = 0; clear_screen(); cursor_on(); msg = malloc((PCP_QLEN+1)*sizeof(char)); int_default = 1; min_range = 1; if (maxk > 0) { sprintf(msg, PCP_UMSG_NN, maxk, int_default); max_range = maxk; *k = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &max_range); } else { sprintf(msg, PCP_UMSG_NN1, int_default); *k = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); } *dist = input_knn_dist(outdev); if (fname) { if (nxval) *fname = input_filename(PCP_UMSG_XNN, PCP_XNN, outdev); else *fname = input_filename(PCP_UMSG_ONAME, PCP_KNN, outdev); /* currently unused */ /* Check if we can write to the file. */ fptr = fopen(*fname, "w"); if (fptr) fclose(fptr); else status = -1; } if (!status) { if (nxval) *nxval = input_nxval(stdin, outdev, max_nxval); if (nmlp) { int_default = 10; min_range = 1; sprintf(msg, PCP_UMSG_KSUB, int_default); *nmlp = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, (int *) 0); } if (seed) *seed = input_seed(stdin, outdev); } vx_free(msg); return status; } /* Read feature selection criterion. Used to evaluate feature subsets. Note that PCP_FSEL_GOLUB is only defined for two classes. */ int input_fscrit(FILE *outdev, int dr_method, int c) { int fscrit; int int_default; int len; int *fch; char *msg; msg = malloc((PCP_LONG_LEN+1)*sizeof(char)); len = 0; fch = (int *) 0; if (dr_method == PDR_RANKING) { int_default = PCP_FSEL_PEARSON; fch = malloc(PCP_N_FSCRIT*sizeof(int)); if (c == 2) { fch[len++] = PCP_FSEL_EUCLIDEAN; fch[len++] = PCP_FSEL_PEARSON; fch[len++] = PCP_FSEL_GOLUB; fch[len++] = PCP_FSEL_KNN; fch[len++] = PCP_FSEL_BAYES; sprintf(msg, PCP_UMSG_FSEL_1, fch[0], fch[1], fch[2], fch[3], fch[4], int_default); } else { fch[len++] = PCP_FSEL_EUCLIDEAN; fch[len++] = PCP_FSEL_PEARSON; fch[len++] = PCP_FSEL_KNN; fch[len++] = PCP_FSEL_BAYES; sprintf(msg, PCP_UMSG_FSEL_2, fch[0], fch[1], fch[2], fch[3], int_default); } } else if ((dr_method == PDR_FORWARD) || (dr_method == PDR_BACKWARD) || (dr_method == PDR_PLUS_L_MINUS_R) || (dr_method == PDR_BB) || (dr_method == PDR_FLOAT_FORWARD)) { fch = malloc(PCP_N_SUB_FSCRIT*sizeof(int)); int_default = PCP_FSUB_KNN; fch[len++] = PCP_FSUB_KNN; fch[len++] = PCP_FSUB_IN_IN; fch[len++] = PCP_FSUB_BAYES; sprintf(msg, PCP_UMSG_FSEL_3, fch[0], fch[1], fch[2], int_default); } fscrit = input_choice(stdin, outdev, msg, &int_default, fch, len); free(fch); free(msg); return fscrit; } /* Read dimension reduction parameters: method, dimension of transformed space, and feature subset evaluation criterion, for feature selection methods. The function returns the chosen method, and sets the dimension of transformed space in `idr' and the feature subset evaluation criterion in `fscrit'. The methods offered are based on number of vectors 'nv' and and input data dimensionality 'd'. The logic is: available dimensionality reduction methods nv <= d none, individual, SVD, EMAP nv > d none, individual, PCA, FLD, EMAP The 'none' option is the default. */ int input_dr(FILE *outdev, int nv, int d, int c, int *idr, int *fscrit) { int int_default; int min_range; int max_range; int dr_method; int len; int choice[50]; /* we don't expect more than 50 dimension reduction methods, ever */ char *msg; int_default = PDR_NONE; *idr = d; dr_method = PDR_NONE; choice[0] = PDR_NONE; len = 1; msg = malloc((PCP_LONG_LEN+1)*sizeof(char)); /* NOTE: the function will display at most PCP_QLEN characters. */ if (nv <= d) { sprintf(msg, PCP_UMSG_DR_SM, PDR_NONE, PDR_SVD, PDR_EMAP, PDR_RANKING, PDR_FORWARD, PDR_BACKWARD, int_default); choice[len++] = PDR_RANKING; choice[len++] = PDR_SVD; choice[len++] = PDR_EMAP; choice[len++] = PDR_FORWARD; choice[len++] = PDR_BACKWARD; #if 0 choice[len++] = PDR_PLUS_L_MINUS_R; choice[len++] = PDR_BB; #endif } else if (nv > d) { sprintf(msg, PCP_UMSG_DR_LG, PDR_NONE, PDR_FISHER, PDR_PCA, PDR_EMAP, PDR_RANKING, PDR_FORWARD, PDR_BACKWARD, int_default); choice[len++] = PDR_RANKING; choice[len++] = PDR_PCA; choice[len++] = PDR_FISHER; choice[len++] = PDR_EMAP; choice[len++] = PDR_FORWARD; choice[len++] = PDR_BACKWARD; #if 0 choice[len++] = PDR_PLUS_L_MINUS_R; choice[len++] = PDR_BB; #endif } dr_method = input_choice(stdin, outdev, msg, &int_default, choice, len); if (!dr_method) dr_method = PDR_NONE; else { /* In case the chosen dimensionality reduction method is a feature selection method, choose the feature subset evaluation criterion. */ if (fscrit) { if ((dr_method == PDR_RANKING) || (dr_method == PDR_FORWARD) || (dr_method == PDR_BACKWARD) || (dr_method == PDR_PLUS_L_MINUS_R) || (dr_method == PDR_BB) || (dr_method == PDR_FLOAT_FORWARD)) *fscrit = input_fscrit(outdev, dr_method, c); } /* Get dimension. */ min_range = 1; if (dr_method == PDR_SVD) max_range = nv; else if (dr_method == PDR_FISHER) max_range = int_min(d, c-1); else max_range = d; int_default = max_range; sprintf(msg, PCP_UMSG_NDIM, min_range, max_range, int_default); *idr = input_integer(stdin, outdev, msg, PCP_QLEN, &int_default, &min_range, &max_range); } free(msg); return dr_method; } /* Read feature selection method. */ int input_fsel(FILE *outdev) { int int_default; int dr_method; int len; int choice[50]; /* we don't expect more than 50 feature selection methods, ever */ char *msg; int_default = PDR_RANKING; dr_method = PDR_NONE; choice[0] = PDR_NONE; len = 1; msg = malloc((PCP_LONG_LEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_FEATSEL, PDR_RANKING, PDR_FORWARD, PDR_BACKWARD, PDR_RANKING); choice[len++] = PDR_RANKING; choice[len++] = PDR_FORWARD; choice[len++] = PDR_BACKWARD; #if 0 choice[len++] = PDR_PLUS_L_MINUS_R; choice[len++] = PDR_BB; #endif dr_method = input_choice(stdin, outdev, msg, &int_default, choice, len); if (!dr_method) dr_method = PDR_NONE; free(msg); return dr_method; } #define PAU_DISPNAME_LEN 16 /* Return index of next non-empty class. */ static int next_class(int tcl, int *nd, int c) { int itcl; itcl = tcl; if (itcl < c-1) { itcl++; /* next class */ /* If this test class is empty (which is perfectly legal), we have to skip to the next non-empty one. */ while ((itcl <= c-1) && (nd[itcl] == 0)) itcl++; } return itcl; } /* Write prediction for vector `idx' in `dset' to 'fptr' in PCP_RCL format. TBD: `tset' is the training dataset. It is used to extract the actual class name for the input vector. This is brain-dead; it is a consequence of the current one-class-per-file input format. A better solution is to use actual class labels (dataset.label). */ void write_rcl(FILE *fptr, int idx, struct dataset *dset, struct dataset *tset) { int i; int predicted_class; int actual_class; int fmt; char *name; char *tname; if (fptr) { predicted_class = dset->prediction[idx]; name = bname(tset->fnames[predicted_class]); actual_class = dataset_class(idx, dset->c, dset->nd); /* actual class for vector idx */ tname = bname(dset->fnames[actual_class]); fmt = dset->format; if ((fmt == DATASET_FF_VEC) || (fmt == DATASET_FF_COLVEC)) fprintf(fptr, "%s\t%s\t%s\t", dset->xlab[idx], tname, name); else fprintf(fptr, "%d\t%s\t%s\t", idx+1, tname, name); /* The last column is `1' for correct prediction, `0' for incorrect. */ if (predicted_class == actual_class) fprintf(fptr, "1"); else fprintf(fptr, "0"); if (dset->c == 2) { /* Additional four columns for two classes. The columns are 1 if the prediction is TP, FN, FP, TN, respectively, and 0 otherwise. Obviously, exactly one column will have a 1 :-) And yes, it could be all be encoded with one column, but the is more consistent with the other file formats in PCP. */ fprintf(fptr, "\t"); if (actual_class == 0) { if (actual_class == predicted_class) fprintf(fptr, "1\t0\t0\t0"); /* TP */ else fprintf(fptr, "0\t1\t0\t0"); /* FN */ } else { if (actual_class == predicted_class) fprintf(fptr, "0\t0\t0\t1"); /* TN */ else fprintf(fptr, "0\t0\t1\t0"); /* FP */ } } /* Optionally store class prediction strengths. */ if (dset->ps) { fprintf(fptr, "\t"); for (i = 0; i < dset->c; i++) fprintf(fptr, "%g\t", dset->ps[idx][i]); } fprintf(fptr, "\n"); free(tname); free(name); } } /* Display classification results for 'test_set'. The prediction were obtained using 'algorithm'. The function employs the following logic to evaluate and display the predictions. If test_set->c > 1, we assume that the test_set class labels are the correct classifications of the test samples, and compute error rates correspondingly. The test_set file names are used as class labels. If test_set->c == 1, we interpret that to mean that the actual class labels for the test_set are not known, and therefore error rates cannot be calculated; hence the function just displays the (integer) class IDs. If actual class labels can be established using the above logic, and if 'verbose' is 1, the function displays class prediction for all vectors. If 'verbose' is 0, only misclassified vectors are displayed. If actual class labels are unknown (i.e., test_set->c == 1), 'verbose' is ignored, and the function displays prediction for all vectors. */ void predict_disp(struct dataset *test_set, int verbose, int algorithm) { int i; int tcl; int icx; int coffset; int tmc; int n_P; int T_P; int F_P; int n_N; int T_N; int F_N; float e_P; float e_N; float mcf; float ppv; float npv; int *mmc; int *pmc; char *tname; char *disp_name; char *name; inverse_video(); printf(PCP_LINE); e_P = -1.0; e_N = -1.0; tmc = 0; name = (char *) 0; if (test_set->c > 1) { printf("| Class | Actual/predicted card. | Error rate |\n"); printf(PCP_LINE); tcl = 0; coffset = test_set->nd[0]; mmc = calloc(test_set->c, sizeof(int)); /* number of misclassified samples in class i */ pmc = calloc(test_set->c, sizeof(int)); /* predicted cardinality of class i */ for (i = 0; i < test_set->nv; i++) { if (i == coffset) { tcl = next_class(tcl, test_set->nd, test_set->c); coffset += test_set->nd[tcl]; } icx = test_set->prediction[i]; if (icx != tcl) /* tcl is actual class of test vector i */ mmc[tcl]++; pmc[icx]++; } tmc = ivec_sum(mmc, test_set->c); /* total number of misclassified samples */ icx = ivec_sum(test_set->nd, test_set->c); i = 0; mcf = tmc; mcf = 100.0*mcf/icx; printf("| | %6d/%-6d | %6.2f%% (%6d/%6d) |\n", test_set->nv, test_set->nv, mcf, tmc, icx); for (i = 0; i < test_set->c; i++) { mcf = mmc[i]; if (test_set->nd[i] > 0) mcf = mcf/test_set->nd[i]; else mcf = 0.0; if (test_set->c == 2) { if (i == 0) e_P = mcf; else e_N = mcf; } mcf = 100.0*mcf; tname = bname(test_set->fnames[i]); disp_name = lau_str_lim(tname, PAU_DISPNAME_LEN); printf("|%4d/%s | %6d/%-6d | %6.2f%% (%6d/%6d) |\n", i+1, disp_name, test_set->nd[i], pmc[i], mcf, mmc[i], test_set->nd[i]); free(disp_name); free(tname); } /* Display additional performance measures if number of classes is 2. */ if (test_set->c == 2) { n_P = test_set->nd[0]; T_P = n_P-mmc[0]; F_P = mmc[1]; n_N = test_set->nd[1]; T_N = n_N-mmc[1]; F_N = mmc[0]; if (T_P+F_P > 0) ppv = 100.0*T_P/(T_P+F_P); else ppv = 0.0; if (T_N+F_N > 0) npv = 100.0*T_N/(T_N+F_N); else npv = 0.0; printf(PCP_LINE); printf("| Sensitivity | %6.2f%% (%6d/%6d) |\n", 100.0*(1.0-e_P), T_P, n_P); printf("| Specificity | %6.2f%% (%6d/%6d) |\n", 100.0*(1.0-e_N), T_N, n_N); printf("| Positive Predictive Value | %6.2f%% (%6d/%6d) |\n", ppv, T_P, T_P+F_P); printf("| Negative Predictive Value | %6.2f%% (%6d/%6d) |\n", npv, T_N, T_N+F_N); } printf(PCP_LINE); } if ((test_set->c == 1) || (tmc > 0) || verbose) { if (test_set->xlab) { if (test_set->c > 1) { if ((algorithm == PALG_MLP) || (algorithm == PALG_BAG_MLP)) printf("| Vector | Actual class | MLP prediction |\n"); else if ((algorithm == PALG_SVM) || (algorithm == PALG_BAG_SVM)) printf("| Vector | Actual class | SVM prediction |\n"); else if ((algorithm == PALG_LIN) || (algorithm == PALG_BAG_LIN)) printf("| Vector | Actual class | Lin. discriminant pred. |\n"); else if ((algorithm == PALG_KNN) || (algorithm == PALG_BAG_KNN)) printf("| Vector | Actual class | k-NN prediction |\n"); else if ((algorithm == PALG_PLC) || (algorithm == PALG_BAG_PLC)) printf("| Vector | Actual class | Parametric lin. pred. |\n"); else if ((algorithm == PALG_PQC) || (algorithm == PALG_BAG_PQC)) printf("| Vector | Actual class | Parametric quadr. pred. |\n"); } else { if ((algorithm == PALG_MLP) || (algorithm == PALG_BAG_MLP)) printf("| Vector | MLP prediction |\n"); else if ((algorithm == PALG_SVM) || (algorithm == PALG_BAG_SVM)) printf("| Vector | SVM prediction |\n"); else if ((algorithm == PALG_LIN) || (algorithm == PALG_BAG_LIN)) printf("| Vector | Linear discriminant prediction |\n"); else if ((algorithm == PALG_KNN) || (algorithm == PALG_BAG_KNN)) printf("| Vector | k-NN prediction |\n"); else if ((algorithm == PALG_PLC) || (algorithm == PALG_BAG_PLC)) printf("| Vector | Parametric linear prediction |\n"); else if ((algorithm == PALG_PQC) || (algorithm == PALG_BAG_PQC)) printf("| Vector | Parametric quadratic prediction |\n"); } } else { if (test_set->c > 1) { if ((algorithm == PALG_MLP) || (algorithm == PALG_BAG_MLP)) printf("| Vector | Actual class | MLP prediction |\n"); else if ((algorithm == PALG_SVM) || (algorithm == PALG_BAG_SVM)) printf("| Vector | Actual class | SVM prediction |\n"); else if ((algorithm == PALG_LIN) || (algorithm == PALG_BAG_LIN)) printf("| Vector | Actual class | Linear discriminant prediction |\n"); else if ((algorithm == PALG_KNN) || (algorithm == PALG_BAG_KNN)) printf("| Vector | Actual class | k-NN prediction |\n"); else if ((algorithm == PALG_PLC) || (algorithm == PALG_BAG_PLC)) printf("| Vector | Actual class | Parametric linear prediction |\n"); else if ((algorithm == PALG_PQC) || (algorithm == PALG_BAG_PQC)) printf("| Vector | Actual class | Parametric quadratic prediction |\n"); } else { if ((algorithm == PALG_MLP) || (algorithm == PALG_BAG_MLP)) printf("| Vector | File name | MLP prediction |\n"); else if ((algorithm == PALG_SVM) || (algorithm == PALG_BAG_SVM)) printf("| Vector | File name | SVM prediction |\n"); else if ((algorithm == PALG_LIN) || (algorithm == PALG_BAG_LIN)) printf("| Vector | File name | Linear discriminant prediction |\n"); else if ((algorithm == PALG_KNN) || (algorithm == PALG_BAG_KNN)) printf("| Vector | File name | k-NN prediction |\n"); else if ((algorithm == PALG_PLC) || (algorithm == PALG_BAG_PLC)) printf("| Vector | File name | Parametric linear prediction |\n"); else if ((algorithm == PALG_PQC) || (algorithm == PALG_BAG_PQC)) printf("| Vector | File name | Parametric quadratic prediction |\n"); } } printf(PCP_LINE); tcl = 0; coffset = test_set->nd[0]; for (i = 0; i < test_set->nv; i++) { if ((test_set->c > 1) && (i == coffset)) { tcl = next_class(tcl, test_set->nd, test_set->c); coffset += test_set->nd[tcl]; } tname = bname(test_set->fnames[tcl]); icx = test_set->prediction[i]; if (icx == PCP_UNASSIGNED) name = strdup(PCP_UNASSIGNED_NAME); else if (test_set->c > 1) name = bname(test_set->fnames[icx]); if (test_set->xlab) { if (test_set->c == 1) printf("| %-30s | %41d |\n", test_set->xlab[i], icx+1); else if ((verbose == 1) || (icx != tcl)) printf("| %-20s | %-23s | %-23s |\n", test_set->xlab[i], tname, name); } else { if (test_set->c == 1) printf("| %6d | %-30s | %30d |\n", i+1, tname, icx+1); else if ((verbose == 1) || (icx != tcl)) printf("| %6d | %-30s | %-30s |\n", i+1, tname, name); } free(tname); free(name); } printf(PCP_LINE); } reset_video(); cursor_off(); } /* Extract 'subset_id'-th 'training_set' and 'test_set' from 'x'. 'test_set' is obtained by extracting vectors listed in 'sx' from 'x' (see xss() for structure of 'sx'). 'training_set' has the remaining vectors in 'x'. */ int extract_sets(float **x, int subset_id, int c, int *nd, int d, int **sx, int **lx, float ***training_set, float ***test_set, FILE *fdbg) { int status; int i; int j; int itx; int ntraining; int ntest; int nvec; int idx; int ivx; int ixc; int cx; int isx; int trpos; int tepos; int lxd; float **trset; float **teset; hash_t *xhash; status = 0; ntest = 0; teset = (float **) 0; for (i = 0; i < c; i++) ntest += lx[i][subset_id]; nvec = ivec_sum(nd, c); ntraining = nvec-ntest; trset = fmx_alloc(ntraining, d); if (test_set) teset = fmx_alloc(ntest, d); ixc = 0; /* current position within 'x' array */ cx = 0; /* cumulative cardinality of classes 0..i-1. */ tepos = 0; /* current position within test set being extracted from 'x' */ trpos = 0; /* current position within training set being extracted from 'x' */ for (i = 0; i < c; i++) { /* Store all 'subset_id' vector indices into 'xhash'. The indices are positions within array 'x'. */ xhash = hashCreate(); isx = ivec_sum(lx[i], subset_id); /* position within 'sx' array */ for (j = 0; j < lx[i][subset_id]; j++) { idx = sx[i][isx++]; hashPutInt(xhash, idx, idx); } for (j = cx; j < cx+nd[i]; j++) { ivx = hashGetInt(xhash, j); if (fdbg != (FILE *) 0) { if (ivx == j) fprintf(fdbg, "extract_sets(); class %d; moving %d into teset[%d]; ", i, ixc, tepos); else fprintf(fdbg, "extract_sets(); class %d; moving %d into trset[%d]; ", i, ixc, trpos); } if (ivx == j) { if (fdbg) { fprintf(fdbg, "test set vector:\t"); lxd = d; if (d > P_MAX_LOGD) lxd = P_MAX_LOGD; for (itx = 0; itx < lxd; itx++) fprintf(fdbg, "%f ", x[ixc][itx]); fprintf(fdbg, "\n"); } if (test_set) fvec_copy(teset[tepos++], x[ixc++], d); } else { if (fdbg) { lxd = d; if (d > P_MAX_LOGD) lxd = P_MAX_LOGD; fprintf(fdbg, "training set vector:\t"); for (itx = 0; itx < lxd; itx++) fprintf(fdbg, "%f ", x[ixc][itx]); fprintf(fdbg, "\n"); } fvec_copy(trset[trpos++], x[ixc++], d); } } hashDelete(xhash); cx += nd[i]; } *training_set = trset; if (test_set) *test_set = teset; return status; } static void log_tset(FILE *fdbg, char *msg, int c, int d, int *nd, float **tset) { int lxd; int jx; int itx; int nvec; int ctx; int offset; offset = 0; lxd = d; if (d > P_MAX_LOGD) lxd = P_MAX_LOGD; nvec = ivec_sum(nd, c); if (nvec > 0) { if (msg != (char *) 0) fprintf(fdbg, "%s", msg); if (nd) offset = nd[0]; ctx = 0; for (jx = 0; jx < nvec; jx++) { if (jx == offset) offset += nd[++ctx]; fprintf(fdbg, "%d\t", ctx+1); for (itx = 0; itx < lxd; itx++) fprintf(fdbg, "%f\t", tset[jx][itx]); fprintf(fdbg, "\n"); } fprintf(fdbg, "\n"); fflush(fdbg); } } void log_tt(FILE *fdbg, char *msg, int c, int d, int *training_nd, int *test_nd, float **training_set, float **test_set) { char *tmsg = (char *) 0; if (fdbg != (FILE *) 0) { if (msg != (char *) 0) { tmsg = malloc(strlen(msg)+100); sprintf(tmsg, "%s training set\n", msg); } log_tset(fdbg, tmsg, c, d, training_nd, training_set); if (msg != (char *) 0) sprintf(tmsg, "%s test set\n", msg); log_tset(fdbg, tmsg, c, d, test_nd, test_set); free(tmsg); } } /* Log cross-validation subset 'idx'. */ void log_ites(FILE *fdbg, int idx, int c, int nv, int **lxc, int **sxc) { int jx; int ix; int mx; int sflg; int offset; int coffset; if (fdbg) { fprintf(fdbg, "nxval:\t%d\t", idx+1); coffset = 0; for (jx = 0; jx < c; jx++) { offset = ivec_sum(lxc[jx], idx); for (mx = coffset; mx < coffset+tds->nd[jx]; mx++) { sflg = 0; for (ix = 0; (ix < lxc[jx][idx]) && !sflg; ix++) if (mx == sxc[jx][offset+ix]) sflg = 1; fprintf(fdbg, "%d\t", sflg); } coffset += tds->nd[jx]; } fprintf(fdbg, "\n"); fprintf(fdbg, "nxval:\t%d\t", idx+1); for (jx = 0; jx < c; jx++) { offset = ivec_sum(lxc[jx], idx); for (ix = 0; ix < lxc[jx][idx]; ix++) fprintf(fdbg, "%d\t", sxc[jx][offset+ix]); } fprintf(fdbg, "\n"); } } /* Calculate and display eigenvalues and condition number of covariance matrices of training data set. */ void p_eigen(int *errc) { int i; int status; int offset; float **evlc; float **sigma; offset = 0; evlc = malloc(tds->c*sizeof(float *)); for (i = 0; i < tds->c; i++) { sigma = cest(&tds->x[offset], tds->d, tds->nd[i], COVARIANCE_MATRIX); offset += tds->nd[i]; /* Calculate eigenvalues. */ status = eigsn(sigma, tds->d, &evlc[i], (float ***) 0, errc); } if (!*errc) { ddevl(tds->c, tds->d, evlc); pwait(); } } /* Calculate inverse covariance matrices and determinants for TDS. In case of success, set 'errc' to 0, otherwise set to error code. Possible errors are codes for malloc()/calloc() functions, and LERR_SINGCOV (singular covariance matrix). */ void p_sigma(int *errc) { *errc = 0; dataset_sigma(tds, errc); } /* Log error message in error file. */ static void log_error(int exit_code, char *msg) { time_t log_time; char time_buf[26]; char *str; FILE *fptr; log_time = time(NULL); fptr = fopen(PCP_ERR, "a"); ctime_r(&log_time, time_buf); str = strchr(time_buf, '\n'); *str = '\0'; if (fptr) { fprintf(fptr, "%s %4d %s\n", time_buf, exit_code, msg); fclose(fptr); } } static void print_msg(int maxlen, char *msg, int log, int exit_code) { int i; int ipad; ipad = maxlen-strlen(msg); for (i = 0; i < ipad/2; i++) putc(' ', stdout); printf("%s", msg); for (i = 0; i < ipad/2; i++) putc(' ', stdout); if (log) log_error(exit_code, msg); } struct pcp_stat *hashPutStat(hash_t *hashtable, int key, struct pcp_stat *value) { int status; char *keyptr; struct pcp_stat *valptr; struct pcp_stat *retval = (struct pcp_stat *) 0; keyptr = malloc(MAX_INT_LENGTH); sprintf(keyptr, "%d", key); if (hashtable && value) if (!hash_lookup(hashtable, keyptr)) { valptr = malloc(sizeof(struct pcp_stat)); if (valptr) { memcpy(valptr, value, sizeof(struct pcp_stat)); status = hash_alloc_insert(hashtable, keyptr, valptr); if (status == 1) retval = value; } } return retval; } struct pcp_stat *hashGetStat(hash_t *hashtable, int key) { char *keyptr; struct pcp_stat *value = (struct pcp_stat *) 0; hnode_t *hn; keyptr = malloc(MAX_INT_LENGTH); sprintf(keyptr, "%d", key); hn = hash_lookup(hashtable, keyptr); if (hn) value = (struct pcp_stat *) hnode_get(hn); free(keyptr); return value; } void pcp_stat_set(struct pcp_stat *pcp_status, int internal_code, int exit_code, char *msg) { pcp_status->internal_code = internal_code; pcp_status->exit_code = exit_code; if (msg) pcp_status->msg = strdup(msg); else pcp_status->msg = (char *) 0; } /* Create hash table of PCP status and error codes. Each element in the table is a struct pcp_status, which has internal code, exit code, and msg corresponding to the status. The exit code is a translation of internal code into legal exit status values, which have to be in the range [1..255]. This is because UNIX programs return 'status & 0377' - i.e., the lower byte of the status code passed to exit() - to the parent process; see exit(3) for more details. Note that this means we only have 127 codes available for PCP status (since [0..128] seem to be taken, at least in Linux). If we need more, we are doomed. Also, this function needs to be updated every time a new error code is introduced. */ hash_t *create_status_table(void) { int code; struct pcp_stat *pcp_status; hash_t *table; table = hashCreate(); /* Do not translate native UNIX (errno) error codes, which we believe are in the range 1..127. */ pcp_status = calloc(1, sizeof(struct pcp_stat)); for (code = 0; code <= PMAX_ERRNO; code++) { pcp_stat_set(pcp_status, code, code, (char *) 0); hashPutStat(table, code, pcp_status); } pcp_stat_set(pcp_status, PERR_UNDEFINED_TDS, code++, PMSG_UNDEFINED_TDS); hashPutStat(table, PERR_UNDEFINED_TDS, pcp_status); pcp_stat_set(pcp_status, PERR_UNDEFINED_TEDS, code++, PMSG_UNDEFINED_TEDS); hashPutStat(table, PERR_UNDEFINED_TEDS, pcp_status); pcp_stat_set(pcp_status, PERR_UNDEFINED, code++, PMSG_UNDEFINED); hashPutStat(table, PERR_UNDEFINED, pcp_status); pcp_stat_set(pcp_status, PERR_TWO_CLASS, code++, PMSG_TWO_CLASS); hashPutStat(table, PERR_TWO_CLASS, pcp_status); pcp_stat_set(pcp_status, PERR_ONE_CLASS, code++, PMSG_ONE_CLASS); hashPutStat(table, PERR_ONE_CLASS, pcp_status); pcp_stat_set(pcp_status, PERR_ONE_SAMPLE, code++, PMSG_ONE_SAMPLE); hashPutStat(table, PERR_ONE_SAMPLE, pcp_status); pcp_stat_set(pcp_status, PERR_BAD_INPUT, code++, (char *) 0); hashPutStat(table, PERR_BAD_INPUT, pcp_status); pcp_stat_set(pcp_status, PERR_BAD_INPUT_FILE, code++, (char *) 0); hashPutStat(table, PERR_BAD_INPUT_FILE, pcp_status); pcp_stat_set(pcp_status, PERR_INC_DIM, code++, (char *) 0); hashPutStat(table, PERR_INC_DIM, pcp_status); pcp_stat_set(pcp_status, PERR_INCONSISTENT_MLP, code++, PMSG_INCONSISTENT_MLP); hashPutStat(table, PERR_INCONSISTENT_MLP, pcp_status); pcp_stat_set(pcp_status, PERR_UNRECOGNIZED_MLP, code++, PMSG_UNRECOGNIZED_MLP); hashPutStat(table, PERR_UNRECOGNIZED_MLP, pcp_status); pcp_stat_set(pcp_status, PERR_UNRECOGNIZED_SVM, code++, PMSG_UNRECOGNIZED_SVM); hashPutStat(table, PERR_UNRECOGNIZED_SVM, pcp_status); pcp_stat_set(pcp_status, PERR_INCONSISTENT_SVD, code++, PMSG_INCONSISTENT_SVD); hashPutStat(table, PERR_INCONSISTENT_SVD, pcp_status); pcp_stat_set(pcp_status, PERR_INCONSISTENT_FE, code++, PMSG_INCONSISTENT_FE); hashPutStat(table, PERR_INCONSISTENT_FE, pcp_status); pcp_stat_set(pcp_status, PERR_INCONSISTENT_MAP, code++, PMSG_INCONSISTENT_MAP); hashPutStat(table, PERR_INCONSISTENT_MAP, pcp_status); pcp_stat_set(pcp_status, PERR_INCONSISTENT_FILE, code++, PMSG_INCONSISTENT_FILE); hashPutStat(table, PERR_INCONSISTENT_FILE, pcp_status); pcp_stat_set(pcp_status, LERR_FILE_FORMAT, code++, LMSG_FILE_FORMAT); hashPutStat(table, LERR_FILE_FORMAT, pcp_status); pcp_stat_set(pcp_status, LERR_DESCENT, code++, LMSG_DESCENT); hashPutStat(table, LERR_DESCENT, pcp_status); pcp_stat_set(pcp_status, LERR_INTERNAL, code++, LMSG_INTERNAL); hashPutStat(table, LERR_INTERNAL, pcp_status); pcp_stat_set(pcp_status, LERR_INCONSISTENT_MODEL, code++, LMSG_INCONSISTENT_MODEL); hashPutStat(table, LERR_INCONSISTENT_MODEL, pcp_status); pcp_stat_set(pcp_status, LERR_INCONSISTENT_LIN, code++, LMSG_INCONSISTENT_LIN); hashPutStat(table, LERR_INCONSISTENT_LIN, pcp_status); pcp_stat_set(pcp_status, LERR_SINGCOV, code++, LMSG_SINGCOV); hashPutStat(table, LERR_SINGCOV, pcp_status); pcp_stat_set(pcp_status, LERR_SINGULAR, code++, LMSG_SINGULAR); hashPutStat(table, LERR_SINGULAR, pcp_status); pcp_stat_set(pcp_status, LERR_ITMAX, code++, LMSG_ITMAX); hashPutStat(table, LERR_ITMAX, pcp_status); pcp_stat_set(pcp_status, LERR_VAR_NCOL, code++, LMSG_VAR_NCOL); hashPutStat(table, LERR_VAR_NCOL, pcp_status); return table; } /* Print message in the designated area. The function has variable number of arguments. 'nargs' is the number of arguments following 'nargs'. The following arguments are int (message code) and, if nargs is larger than 1, (char *). The second argument is name (usually of a file causing error), if message code is non-zero. If message code is zero, the second argument is printed as is. TBD: change name. The name is bad since not all messages are necessarily error messages. */ void errmsg(int nargs, ...) { int i; int code; int ipad; int maxlen; int len; int exit_code; char *str; char *pstr; char *estr; char *name; char *tname; hash_t *status_table; struct pcp_stat *pcp_status; va_list VariableArgsPtr; code = 0; exit_code = 0; str = (char *) 0; tname = (char *) 0; va_start(VariableArgsPtr, nargs); for (i = 0; i < nargs; i++) { if (i == 0) code = va_arg(VariableArgsPtr, int); else if (i == 1) { name = va_arg(VariableArgsPtr, char *); if (name && *name) { tname = strdup(name); str_trim(tname); } } } /* 'code' is PCP internal error code, 'exit_code' is the user-visible exit code. We need to store the exit_code in the error file. */ if (code != 0) { status_table = create_status_table(); pcp_status = hashGetStat(status_table, code); if (pcp_status) exit_code = pcp_status->exit_code; else exit_code = code; hashDelete(status_table); } maxlen = 73; cpos(3, 20); inverse_video(); /* These error codes require special handling because they come with a file name. */ if ((code == EACCES) || (code == ENOENT) || (code == PERR_UNRECOGNIZED_MLP) || (code == PERR_UNRECOGNIZED_SVM) || (code == PERR_INCONSISTENT_MLP) || (code == PERR_BAD_INPUT_FILE) || (code == PERR_INCONSISTENT_FILE) || (code == PERR_ILLEGAL_RNK_FILE)) { if (code == EACCES) { if (tname) str = PMSG_EACCES; else str = PMSG_FILE_ACCESS; } else if (code == ENOENT) str = PMSG_ENOENT; else if (code == PERR_UNRECOGNIZED_MLP) str = PMSG_UNRECOGNIZED_MLP; else if (code == PERR_UNRECOGNIZED_SVM) str = PMSG_UNRECOGNIZED_SVM; else if (code == PERR_INCONSISTENT_MLP) str = PMSG_INCONSISTENT_MLP; else if (code == PERR_BAD_INPUT_FILE) str = PMSG_BAD_INPUT_FILE; else if (code == PERR_INCONSISTENT_FILE) str = PMSG_INCONSISTENT_FILE; else if (code == PERR_ILLEGAL_RNK_FILE) str = PMSG_ILLEGAL_RNK_FILE; ipad = maxlen-strlen(str)-1; if (tname) ipad -= strlen(tname); for (i = 0; i < ipad/2; i++) putc(' ', stdout); len = strlen(str)+5; if (tname) len += strlen(tname); pstr = malloc(len*sizeof(char)); if (tname) sprintf(pstr, "%s %s.", str, tname); else sprintf(pstr, "%s.", str); printf("%s", pstr); log_error(exit_code, pstr); free(pstr); #if 0 for (i = 0; i < 5; i++) putc(' ', stdout); #endif } else if (code == PERR_UNDEFINED_TDS) print_msg(maxlen, PMSG_UNDEFINED_TDS, 1, exit_code); else if (code == PERR_UNDEFINED_TEDS) print_msg(maxlen, PMSG_UNDEFINED_TEDS, 1, exit_code); else if (code == PERR_UNDEFINED) print_msg(maxlen, PMSG_UNDEFINED, 1, exit_code); else if (code == PERR_INCONSISTENT_SVD) print_msg(maxlen, PMSG_INCONSISTENT_SVD, 1, exit_code); else if (code == PERR_INCONSISTENT_FE) print_msg(maxlen, PMSG_INCONSISTENT_FE, 1, exit_code); else if (code == PERR_INCONSISTENT_MAP) print_msg(maxlen, PMSG_INCONSISTENT_MAP, 1, exit_code); else if (code == PERR_TWO_CLASS) print_msg(maxlen, PMSG_TWO_CLASS, 1, exit_code); else if (code == PERR_ONE_CLASS) print_msg(maxlen, PMSG_ONE_CLASS, 1, exit_code); else if (code == PERR_ONE_SAMPLE) print_msg(maxlen, PMSG_ONE_SAMPLE, 1, exit_code); else if (code == LERR_FILE_FORMAT) print_msg(maxlen, LMSG_FILE_FORMAT, 1, exit_code); else if (code == LERR_DESCENT) print_msg(maxlen, LMSG_DESCENT, 1, exit_code); else if (code == LERR_LNSEARCH) print_msg(maxlen, LMSG_LNSEARCH, 1, exit_code); else if (code == LERR_SINGCOV) print_msg(maxlen, LMSG_SINGCOV, 1, exit_code); else if (code == LERR_SINGULAR) print_msg(maxlen, LMSG_SINGULAR, 1, exit_code); else if (code == LERR_INCONSISTENT_LIN) print_msg(maxlen, LMSG_INCONSISTENT_LIN, 1, exit_code); else if (code == LERR_INCONSISTENT_MODEL) print_msg(maxlen, LMSG_INCONSISTENT_MODEL, 1, exit_code); else if (code == LERR_VAR_NCOL) print_msg(maxlen, LMSG_VAR_NCOL, 1, exit_code); else if (code == PSTS_DONE) print_msg(maxlen, PMSG_DONE, 0, exit_code); else if (code == PSTS_INVCOV) print_msg(maxlen, PMSG_INVCOV, 0, exit_code); else if (code == PERR_BAD_INPUT) print_msg(maxlen, PMSG_BAD_INPUT, 1, exit_code); else if (code != 0) { if ((code > 0) && (code <= PMAX_ERRNO)) { estr = strerror(code); pstr = strdup(estr); ipad = maxlen-strlen(pstr); } else { ipad = maxlen-strlen(PMSG_UNRECOGNIZED_ERROR)-7; pstr = malloc((strlen(PMSG_UNRECOGNIZED_ERROR)+10)*sizeof(char)); sprintf(pstr, "%s%6d.", PMSG_UNRECOGNIZED_ERROR, code); } for (i = 0; i < ipad/2; i++) putc(' ', stdout); printf("%s", pstr); log_error(code, pstr); free(pstr); } else if ((code == 0) && (nargs >= 2) && tname) { /* If error code is 0, and there is a message argument, simply display it. */ ipad = maxlen-strlen(tname); for (i = 0; i < ipad/2; i++) putc(' ', stdout); printf("%s", tname); for (i = 0; i < ipad-ipad/2; i++) putc(' ', stdout); } fflush(stdout); reset_video(); free(tname); } /* Classify test data set using linear classifier 'method' in a file. 'method' can be PALG_LIN/PALG_BAG_LIN for linear discriminant classifier, or PALG_PLC/PALG_BAG_PLC for parametric linear classifier. The function obtains input parameters from the user, calls lin_predict(), and displays classification results. In case of success, return 0. In case of failure, return error code. In case of file access error, return the relevant file name in 'xname'. */ int p_lin_predict(int method, char **xname) { int i; int j; int type; int errc; int verbose; int min_range; int max_range; int predicted_class; int nmodels; int rows; int columns; float *predictions; char *fname; char *output_fname; float *weights; float **model; float ***models; FILE *outdev; FILE *fptr; FILE *output_fptr; clear_screen(); outdev = stdout; errc = 0; cursor_on(); if ((method == PALG_LIN) || (method == PALG_BAG_LIN)) fname = input_filename(PCP_UMSG_LD_FNAME, PCP_LIN, outdev); else fname = input_filename(PCP_UMSG_PLC_FNAME, PCP_PLC, outdev); fptr = fopen(fname, "r"); if (fptr) { fclose(fptr); output_fname = strdup(PCP_RCL); output_fptr = fopen(output_fname, "w"); if (output_fptr) { models = lin_load_models(fname, &type, &nmodels, &weights, &rows, &columns, &errc); if (models) { /* Check consistency between the models and test data set. */ if (((teds->c > 1) && (teds->c != rows)) || (teds->d+1 != columns)) errc = LERR_INCONSISTENT_LIN; if (errc == 0) { predictions = calloc(rows, sizeof(float)); teds->prediction = malloc(teds->nv*sizeof(int)); for (i = 0; i < teds->nv; i++) { fvec_set(predictions, rows, 0.0); for (j = 0; j < nmodels; j++) { model = models[j]; predicted_class = lin_predict(model, rows, teds->d+1, teds->x[i]); predictions[predicted_class] += weights[j]; } predicted_class = fvec_argmax(predictions, rows); teds->prediction[i] = predicted_class; write_rcl(output_fptr, i, teds, tds); } free(predictions); #if 0 for (j = 0; j < nmodels; j++) svm_destroy_model(models[j]); #endif free(models); fclose(output_fptr); min_range = 0; max_range = 1; verbose = input_integer(stdin, outdev, OUTPUT_MSG, PCP_QLEN, &min_range, &min_range, &max_range); /* Display results. */ predict_disp(teds, verbose, method); pwait(); } } else { *xname = fname; fclose(output_fptr); unlink(output_fname); } } else { errc = errno; *xname = output_fname; } } else { errc = errno; *xname = fname; } reset_video(); return errc; } /* Create a temporary file in the current directory and return the name. The file is created with permissions -rw-r--r--. Return (char *) 0 in case of error and set errno. The errors are malloc(), mkstemp() and fchmod() errors. */ char *tempfile(void) { int status = 0; int fd; char *template; template = malloc(20*sizeof(char)); if (template != (char *) 0) { sprintf(template, "pcp_XXXXXX"); fd = mkstemp(template); if (fd != -1) { status = fchmod(fd, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); if (status == 0) status = close(fd); } else status = -1; if (status == -1) { free(template); template = (char *) 0; } } return template; } /* Insert underscore 'iffix' into 'fnames'. For example, 'iris1.dat' becomes 'iris1_nrm.dat'. */ static void rename_files(char **fnames, int length, char *iffix) { int j; int pos; char *xtr; char *fname; if (fnames && (length > 0) && iffix) { for (j = 0; j < length; j++) { xtr = base_name(fnames[j]); if (xtr) { pos = strcspn(xtr, "."); fname = str_insert(4, xtr, pos, "_", iffix); free(xtr); fnames[j] = fname; } } } } /* Set 'dset1' and 'dset2' to the deep or shallow copies of TEDS and TDS, respectively, depending on user input. If the copies are deep, the current data sets will get replaced by the result of the subsequent operations; it the copies are shallow, new data sets will be created and saved to disk. 'iffix' is used to define the file names for the saved data sets. Return the mode (replace data sets -> 1, do not replace -> 0). */ int input_replace(struct dataset **dset1, struct dataset **dset2, char *iffix) { int do_not_replace; int replace; int mode; char *msg; do_not_replace = P_DO_NOT_REPLACE; replace = P_REPLACE; msg = malloc((PCP_QLEN+1)*sizeof(char)); sprintf(msg, PCP_UMSG_REPLACE, replace, do_not_replace, do_not_replace); mode = input_integer(stdin, stdout, msg, PCP_QLEN, &do_not_replace, &do_not_replace, &replace); free(msg); if (mode == P_REPLACE) *dset1 = teds; else *dset1 = dataset_clone(teds); if (mode == P_REPLACE) *dset2 = tds; else *dset2 = dataset_clone(tds); if (dset1 && *dset1) rename_files((*dset1)->fnames, (*dset1)->c, iffix); rename_files((*dset2)->fnames, (*dset2)->c, iffix); return mode; } /* Utility function for feature extraction and selection. Saves 'dset1', 'dset2'. If 'mode' is P_REPLACE, updates the status file. In case of error, return -1 and set 'errc'. In case of file error, set file name in 'xname'. */ int save_datasets(struct dataset *dset1, struct dataset *dset2, int mode, int *errc, char **xname) { int status; *errc = 0; status = dataset_write(dset1, xname); if (!status) status = dataset_write(dset2, xname); if ((mode == P_REPLACE) && !status) { status = save_sts(PCP_STS, teds, tds); if (status == -1) { *errc = errno; *xname = strdup(PCP_STS); remove_datasets(); } } if ((status == -1) && (*errc == 0)) *errc = errno; return status; } /* Save the clusters defined by 'ncl' (number of clusters), 'ccard' (cluster cardinalities), 'clist' (list of vectors in the clusters), using file 'format'. If 'format' is Named Vector format (DATASET_FF_VECTOR), and TDS is in the Raw format (DATASET_FF_RAW) , we store vector IDs instead of the (non-existent) vector names. In case of error, set 'errc' and set file name in 'xname' (the only possible errors are file errors). */ void save_clusters(int ncl, int *ccard, int *clist, int format, int *errc, char **xname) { int i; int j; int k; int status; int idx; char *fname; FILE *fptr; status = 0; fname = (char *) 0; for (i = 0; (i < ncl) && !status; i++) { fname = malloc((strlen(PCP_PCP)+strlen(PCP_CLX)+MAX_INT_DIGITS+1)*sizeof(char)); sprintf(fname, "%s%d%s", PCP_PCP, i+1, PCP_CLX); fptr = fopen(fname, "w"); if (fptr) { for (j = 0; j < ccard[i]; j++) { idx = clist[j]; if (format == DATASET_FF_VEC) { if (tds->format == DATASET_FF_VEC) status = fprintf(fptr, "%s\t", tds->xlab[idx]); else if (tds->format == DATASET_FF_RAW) status = fprintf(fptr, "%d\t", idx); } for (k = 0; k < tds->d; k++) status = fprintf(fptr, "%g\t", tds->x[idx][k]); fprintf(fptr, "\n"); } free(fname); status = fclose(fptr); } else status = -1; } if (status == -1) { *errc = errno; *xname = strdup(fname); } } /* A convenient memory allocator for simplex() optimization. Initializes the criterion vector `fval' and simplex `smx'. */ int smplx_alloc(float **fval, float ***smx, int n) { int i; int status; float **mx; status = 0; mx = malloc((n+1)*sizeof(float *)); if (mx) { for (i = 0; (i < n+1) && !status; i++) { mx[i] = malloc(n*sizeof(float)); if (!mx[i]) status = -1; } if (!status) { *fval = malloc((n+1)*sizeof(float)); if (!*fval) status = -1; else *smx = mx; } } else status = -1; return status; } /* Accept input parameters and pass them to the corresponding ensemble (bagging or boosting) learning function. In case of error, set 'errc'. If error is file access error, set 'xname'. */ void pcp_ensemble(int method, int *errc, char **xname, int dbg) { int status; int opt_method; int ensemble_method; /* the ensemble variant of `method'; for example, */ /* for PALG_SVM, the possible values for ensemble_method */ /* are PALG_BAG_SVM and PALG_ADABOOST_SVM */ float **t; int mlp_nlayers; int *mlp_npl; int mlp_itmax; float mlp_range; float eta; float mu; unsigned int seed; char *fname; int nmodels; int bag_size; float *weights; void **models; struct mlp_options *mlp_optional; struct svm_parameter *parameters; void *options = (void **) 0; void **problems; FILE *outdev; FILE *fdbg = (FILE *) 0; fname = (char *) 0; models = (void **) 0; if (tds) { status = 0; /* TBD: decided to fix bag_size to 'nvec' in November 2002. Perhaps revisit. */ bag_size = tds->nv; outdev = stdout; fflush(outdev); if (dbg > 0) fdbg = fopen(PCP_DBG, "a"); clear_screen(); cursor_on(); if (method == PALG_BAG_MLP) { t = mlp_target(tds->c, tds->nd); status = input_mlp(outdev, tds->d, tds->c, tds->nv, (int *) 0, &mlp_nlayers, &mlp_npl, &mlp_itmax, &mlp_range, &opt_method, &eta, &mu, (int *) 0, 0, (int *) 0, &seed, &fname); if (status == 0) { mlp_optional = calloc(1, sizeof(struct mlp_options)); mlp_optional->nlayers = mlp_nlayers; mlp_optional->npl = mlp_npl; mlp_optional->itmax = mlp_itmax; mlp_optional->range = mlp_range; mlp_optional->opt_method = opt_method; mlp_optional->eta = eta; options = mlp_optional; } } else if (method == PALG_SVM) { parameters = calloc(1, sizeof(struct svm_parameter)); options = parameters; status = input_svm(outdev, tds->c, tds->nv, (struct svm_problem *) 0, parameters, &fname, (int *) 0, 0, 1, PCP_SVM_K_NONE, &ensemble_method); seed = input_seed(stdin, outdev); } else if ((method == PALG_BAG_LIN) || (method == PALG_BAG_PLC)) { clear_screen(); cursor_on(); if (method == PALG_BAG_LIN) fname = input_filename(LD_MSG, PCP_LIN, outdev); else fname = input_filename(CLASSIFIER_MSG, PCP_PLC, outdev); seed = input_seed(stdin, outdev); } else if (method == PALG_BAG_PQC) { clear_screen(); cursor_on(); fname = input_filename(CLASSIFIER_MSG, PCP_PQC, outdev); seed = input_seed(stdin, outdev); } if (status == 0) { nmodels = input_nmodels(stdin, outdev); inverse_video(); srand(seed); if (method == PALG_SVM) { if (ensemble_method == PALG_BAG_SVM) models = (void **) bagging(tds, outdev, ensemble_method, fname, seed, nmodels, bag_size, options, &problems, errc, fdbg); else if (ensemble_method == PALG_ADABOOST_SVM) models = adaboost(tds, ensemble_method, &nmodels, &weights, fname, seed, options, errc, fdbg); } else models = (void **) bagging(tds, outdev, method, fname, seed, nmodels, bag_size, options, (void ***) 0, errc, fdbg); reset_video(); } if (!models) *xname = strdup(fname); } else *errc = PERR_UNDEFINED_TDS; } /* Combine matrices `x1' and `x2' with `n1' and `n2' rows, respectively. The combination is an array of pointers pointing to rows of x1 followed by rows of x2. In case of malloc() failure, return (float **) 0 and set errno. */ float **combine_x(float **x1, int n1, float **x2, int n2) { int i; int idx; int nsamples; float **cx; nsamples = n1+n2; cx = malloc(nsamples*sizeof(float *)); if (cx) { for (i = 0; i < n1; i++) cx[i] = x1[i]; idx = n1; for (i = 0; i < n2; i++) cx[idx++] = x2[i]; } return cx; }