/* File name: mlp.c Created by: Ljubomir Buturovic Created: 03/11/2001 Purpose: multi-layer perceptron (MLP) learning and prediction. */ /* Copyright 2004 Ljubomir J. Buturovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* The principal function in this file is mlp_learn(). It trains a MLP network using various optimization methods, including conjugate gradient and gradient descent. p_mlp_learn() serves as an interface between the menu system and mlp_learn(). The MLP algorithms and variable names in mlp_learn() largely follow notation and theory given in: Christopher M. Bishop, Neural Networks for Pattern Recognition, Chapter 4. Oxford University Press, Oxford, 1995. */ static char rcsid[] = "$Id: mlp.c,v 1.201 2005/09/03 09:03:04 ljubomir Exp $"; #include #include #include #include #include #include "mlp.h" #include "pau.h" #include "lau.h" #include "lmat.h" #include "conjg.h" #include "simplex.h" #include "lerr.h" #include "bagging.h" /* The structure used to pass parameters to mlp_function() through wn_conj_gradient_method(). */ struct mlp_parameters { /* 'int iteration' has to be the first member of the structure. MLP optimization function sets it to the current iteration. */ int iteration; float **t; int errc; struct mlp *perceptron; struct dataset *dset; FILE *outdev; FILE *fdbg; }; static float sigmoid(float a) { float g; g = 1.0/(1.0+exp(-a)); return g; } /* Calculate node inputs 'a' and activations 'z' for inputs 'x' and network given in 'perceptron'. */ static void calculate_network(struct mlp *perceptron, float *x) { int nodes; int i; int j; int d; int c; int layer; int nlayers; int cix; int cl; int iw; int *npl; float *w; float *a; float *z; float *delta; w = perceptron->w; nodes = perceptron->nodes; nlayers = perceptron->nlayers; npl = perceptron->npl; d = perceptron->d; c = perceptron->npl[nlayers-1]; a = perceptron->a; z = perceptron->z; delta = perceptron->delta; /* 'cix' is index of first node in previous layer. Current node takes activations from nodes cix, cix+1, ... cix+npl[layer-1]. */ cix = 0; iw = 0; layer = 0; cl = npl[0]; /* cumulative number of nodes up to the current layer */ for (j = 0; j < nodes; j++) { if (j == cl) { layer++; cl += npl[layer]; if (layer >= 2) cix += npl[layer-2]; } a[j] = w[iw++]; /* bias term */ if (layer == 0) for (i = 0; i < d; i++) a[j] += w[iw++]*x[i]; else { for (i = 0; i < npl[layer-1]; i++) a[j] += w[iw++]*z[cix+i]; } z[j] = sigmoid(a[j]); } } /* Compute sample error, defined as sum of squares of differences between output and target values for each of the 'c' outputs. */ static double output_error(float *output, float *target, int c) { int i; double deviation; double dev; dev = 0.0; for (i = 0; i < c; i++) { deviation = output[i]-target[i]; dev += deviation*deviation; } return dev; } /* Test functions. */ /* Test function. It calculates the value of error function for the perceptron described by 'perceptron' for a single sample 'x' and target value 't'. */ static double mlp_check_criterion(struct mlp *perceptron, float *x, float *t) { int cix; int c; int nlayers; double dev; int *npl; float *z; nlayers = perceptron->nlayers; npl = perceptron->npl; c = perceptron->npl[nlayers-1]; z = perceptron->z; cix = ivec_sum(npl, nlayers-1); /* cix is index of first output node */ /* Calculate network outputs for sample 'x'. */ calculate_network(perceptron, x); /* Calculate output error. */ dev = output_error(&z[cix], t, c); dev = 0.5*dev; return dev; } /* Test function. It calculates gradient 'gw' numerically at 'pw'. The function has signature and semantics identical to mlp_gradient(). */ static void mlp_check_gradient(double *gw, double *pw, void *parameters) { int i; int n; int wlen; int nsamples; float epsilon; float depsilon; float error_plus; float error_minus; float factor; float wx; float w; float **t; float **x; struct mlp_parameters *mlp_params; struct mlp *perceptron; FILE *fdbg; mlp_params = (struct mlp_parameters *) parameters; perceptron = mlp_params->perceptron; fdbg = mlp_params->fdbg; nsamples = mlp_params->dset->nv; t = mlp_params->t; x = mlp_params->dset->x; wlen = perceptron->wlen; for (i = 0; i < wlen; i++) { gw[i] = 0.0; perceptron->w[i] = pw[i]; } factor = 0.001; epsilon = factor; /* Calculate approximate gradient by perturbing each weight in turn. Use formula (4.47) from Bishop. */ for (n = 0; n < nsamples; n++) for (i = 0; i < wlen; i++) { wx = perceptron->w[i]; /*epsilon = wx*factor;*/ depsilon = epsilon+epsilon; w = wx+epsilon; perceptron->w[i] = w; error_plus = mlp_check_criterion(perceptron, x[n], t[n]); w -= depsilon; perceptron->w[i] = w; error_minus = mlp_check_criterion(perceptron, x[n], t[n]); gw[i] += (error_plus-error_minus)/depsilon; perceptron->w[i] = wx; } if (fdbg) for (i = 0; i < wlen; i++) fprintf(fdbg, "mlp_check_gradient(); leaving; perceptron->w[%d]: %f; " "pw[%d]: %f; gw[%d]: %f\n", i, perceptron->w[i], i, pw[i], i, gw[i]); } /* Free mlp struct. Returns (struct mlp *) 0. */ struct mlp *mlp_free(struct mlp *perceptron) { if (perceptron) { free(perceptron->npl); free(perceptron->w); free(perceptron->a); free(perceptron->z); free(perceptron->delta); free(perceptron->fname); free(perceptron); } return (struct mlp *) 0; } /* Return deep copy of 'perceptron'. */ struct mlp *mlp_clone(struct mlp *perceptron) { int i; struct mlp *clone = (struct mlp *) 0; clone = calloc(1, sizeof(struct mlp)); if (clone != (struct mlp *) 0) { clone->d = perceptron->d; clone->nlayers = perceptron->nlayers; clone->npl = malloc(perceptron->nlayers*sizeof(int)); for (i = 0; i < perceptron->nlayers; i++) clone->npl[i] = perceptron->npl[i]; clone->wlen = perceptron->wlen; clone->w = malloc(perceptron->wlen*sizeof(float)); fvec_copy(clone->w, perceptron->w, perceptron->wlen); clone->nodes = perceptron->nodes; clone->a = malloc(perceptron->nodes*sizeof(float)); fvec_copy(clone->a, perceptron->a, perceptron->nodes); clone->z = malloc(perceptron->nodes*sizeof(float)); fvec_copy(clone->z, perceptron->z, perceptron->nodes); clone->delta = malloc(perceptron->nodes*sizeof(float)); fvec_copy(clone->delta, perceptron->delta, perceptron->nodes); clone->noff = perceptron->noff; clone->seed = perceptron->seed; clone->range = perceptron->range; clone->iterations = perceptron->iterations; clone->p_iter = perceptron->p_iter; clone->itmax = perceptron->itmax; clone->error = perceptron->error; clone->mce = perceptron->mce; if (perceptron->fname) clone->fname = strdup(perceptron->fname); } return clone; } /* Calculate derivative of sigmoid function, _given value of sigmoid_ 'g' (as opposed to being given input to sigmoid). This is obviously faster. */ static float dsigmoid(float g) { float gprime; gprime = g*(1.0-g); return gprime; } /* Generate target outputs for MLP, given input categorized in 'nc' classes with 'nd[i]' samples in class i. In case of malloc() failure, return NULL and set errno. */ float **mlp_target(int nc, int *nd) { int idx; int jdx; int nsamples; int cthd; int current_class; float **t; nsamples = ivec_sum(nd, nc); t = fmx_alloc(nsamples, nc); if (t) { cthd = nd[0]; current_class = 0; for (idx = 0; idx < nsamples; idx++) { if (idx >= cthd) cthd += nd[++current_class]; for (jdx = 0; jdx < nc; jdx++) { if (jdx == current_class) t[idx][jdx] = MLP_TARGET_HIGH; else t[idx][jdx] = MLP_TARGET_LOW; } } } return t; } /* Determine index of first (left-most) node within 'layer'. */ static int get_layer_index(int layer, int *npl) { int i; int idx; idx = 0; for (i = 0; i < layer; i++) idx += npl[i]; return idx; } /* Determine index (within 'w' array) of weight from node 'j', which is in 'layer', to first (left-most) node in next layer. 'd' is number of inputs. */ static int get_weight_index(int j, int d, int layer, int *npl) { int i; int iw; /* First, there are (d+1)*npl[0] weights before layer 0. */ iw = (d+1)*npl[0]; /* Then, previous hidden layers. */ for (i = 0; i < layer; i++) iw += (npl[i]+1)*npl[i+1]; /* Current layer. */ iw += j-get_layer_index(layer, npl)+1; return iw; } /* Writer function for mlp_save(), mlp_write(). */ static void mlp_fprintf(FILE *outfile, struct mlp *perceptron, int mode, int index, float weight) { int i; int j; int l; int iw; int inode; int idx = 1; float wt = 1.0; float *w; if (mode == MLP_MODE_APPEND) { idx = index; wt = weight; } w = perceptron->w; fprintf(outfile, "combined model / combined weight: %d / %f\n", idx, wt); fprintf(outfile, "iterations: %d\n", perceptron->p_iter+perceptron->iterations); if (perceptron->method == MLP_OPT_CGPLUS) fprintf(outfile, "optimization method: %s\n", MLP_STR_CGPLUS); else if (perceptron->method == MLP_OPT_GRADIENT_DESCENT) fprintf(outfile, "optimization method: %s\n", MLP_STR_GRADIENT_DESCENT); else if (perceptron->method == MLP_OPT_WNLIB) fprintf(outfile, "optimization method: %s\n", MLP_STR_WNLIB); else fprintf(outfile, "optimization method: %s\n", MLP_STR_UNSPECIFIED); fprintf(outfile, "seed: %d\n", perceptron->seed); fprintf(outfile, "range of initial weights: %f\n", perceptron->range); fprintf(outfile, "misclassified samples: %d\n", perceptron->mce); fprintf(outfile, "avg. error per node and sample (MSE): %f\n", perceptron->error); fprintf(outfile, "number of layers: %d\n", perceptron->nlayers); for (i = 0; i < perceptron->nlayers; i++) fprintf(outfile, "nodes in layer %d: %d\n", i, perceptron->npl[i]); fprintf(outfile, "number of inputs: %d\n", perceptron->d); fprintf(outfile, "number of weights: %d\n", perceptron->wlen); iw = 0; for (i = 0; i < perceptron->npl[0]; i++) { fprintf(outfile, "weights from inputs to node %3d: ", i); for (j = 0; j <= perceptron->d; j++) { fprintf(outfile, "%f ", w[iw]); iw++; } fprintf(outfile, "\n"); } inode = 0; for (i = 0; i < perceptron->nlayers-1; i++) { inode += perceptron->npl[i]; for (j = 0; j < perceptron->npl[i+1]; j++) { fprintf(outfile, "weights from layer %1d to node %3d: ", i, j+inode); for (l = 0; l <= perceptron->npl[i]; l++) { fprintf(outfile, "%f ", w[iw]); iw++; } fprintf(outfile, "\n"); } } /* One final newline needed to make bagging MLP file more readable. */ fprintf(outfile, "\n"); } /* Save multi-layer perceptron model described by 'perceptron' in perceptron->fname. The model is written (mode == MLP_MODE_WRITE) or appended (mode == MLP_MODE_APPEND) to the perceptron->fname file. If mode == MLP_MODE_APPEND, prepend 'index'/'weight' to the model saved, otherwise prepend the default values (1/1.0). In case of success, return 0. In case of failure, return -1 and set errno. */ int mlp_save(struct mlp *perceptron, int mode, int index, float weight) { int status; FILE *outfile = (FILE *) 0; status = 0; if (perceptron->fname) { if (mode == MLP_MODE_WRITE) outfile = fopen(perceptron->fname, "w"); else outfile = fopen(perceptron->fname, "a"); if (outfile) { mlp_fprintf(outfile, perceptron, mode, index, weight); fclose(outfile); } else status = -1; } return status; } /* File-pointer-based version of mlp_save(). In case of success, return 0. In case of failure, return -1 and set errno. */ int mlp_write(FILE *fptr, struct mlp *perceptron, int mode, int index, float weight) { int status; status = 0; if (fptr) { mlp_fprintf(fptr, perceptron, mode, index, weight); fflush(fptr); } else status = -1; return status; } /* The function calculates gradient 'gw' of the criterion function for MLP defined by 'parameters->perceptron'. The gradient is calculated at point 'pw'. Internally, the function copies 'pw' into perceptron->w. perceptron->w is then used in calculations. */ static void mlp_gradient(double *gw, double *pw, void *parameters) { int i; int j; int k; int n; int cix; int hix; int wlen; int layer; int nlayers; int c; int d; int idx; int kdx; int shift; int iw; int *npl; float sum; float zk; float *z; float *delta; double *dwt; /* instance (single vector) derivatives */ float *wx; float **t; float **x; struct mlp_parameters *mlp_params; struct mlp *perceptron; struct dataset *dset; FILE *fdbg; mlp_params = (struct mlp_parameters *) parameters; perceptron = mlp_params->perceptron; t = mlp_params->t; dset = mlp_params->dset; x = dset->x; fdbg = mlp_params->fdbg; fvec_dcopy(perceptron->w, pw, perceptron->wlen); wx = perceptron->w; d = perceptron->d; nlayers = perceptron->nlayers; c = perceptron->npl[nlayers-1]; npl = perceptron->npl; wlen = perceptron->wlen; delta = perceptron->delta; z = perceptron->z; dvec_set(gw, wlen, 0.0); dwt = malloc(wlen*sizeof(double)); if (fdbg) for (i = 0; i < wlen; i++) fprintf(fdbg, "mlp_gradient(); perceptron->w[%d]: %12.5g\n", i, perceptron->w[i]); cix = ivec_sum(npl, nlayers-1); for (n = 0; n < dset->nv; n++) { /* Calculate network for sample 'n'. */ calculate_network(perceptron, dset->x[n]); /* Calculate deltas. Output nodes: */ for (k = cix; k < cix+c; k++) { zk = z[k]; delta[k] = zk*(1.0-zk)*(zk-t[n][k-cix]); if (fdbg) fprintf(fdbg, "mlp_gradient(); vector %d; output z[%d]: %12.5g; delta[%d]: %12.5g\n", n, k, zk, k, delta[k]); } /* Hidden nodes: */ idx = cix; /* in the following loop, idx is index of first (left-most) node in the current layer */ for (layer = nlayers-2; layer >= 0; layer--) { kdx = idx; /* index of first node in layer+1 */ idx -= npl[layer]; for (j = idx; j < idx+npl[layer]; j++) { sum = 0.0; shift = npl[layer]+1; /* iw is index of weight from node j to first node in next layer */ iw = get_weight_index(j, d, layer, npl); for (k = kdx; k < kdx+npl[layer+1]; k++) { sum += wx[iw]*delta[k]; iw += shift; } delta[j] = dsigmoid(z[j])*sum; } } /* All deltas done. Calculate derivatives for sample 'n'. First calculate derivatives with respect to weights before layer 0 (i.e., weights connecting network inputs to layer 0). */ j = 0; i = 0; for (iw = 0; iw < (d+1)*npl[0]; iw++) { if ((iw%(d+1)) == 0) /* bias weight */ dwt[iw] = delta[j]; else dwt[iw] = delta[j]*x[n][i++]; /* non-bias weight */ if (i == d) { i = 0; j++; } } /* Hidden layer derivatives. */ layer = 0; /* 'from' layer */ j = 0; /* current 'from' node */ k = npl[0]; /* current 'to' node */ hix = 0; /* left-most node in 'layer' */ idx = 0; /* total number of weights in current layer */ for (iw = (d+1)*npl[0]; iw < wlen; iw++) { if ((idx%(npl[layer]+1)) == 0) /* bias weight */ dwt[iw] = delta[k]; else dwt[iw] = z[j++]*delta[k]; /* non-bias weight */ if (j == hix+npl[layer]) { j = hix; k++; } idx++; if (idx == (npl[layer]+1)*npl[layer+1]) /* next layer? */ { hix += npl[layer]; j = hix; layer++; idx = 0; } } /* Update total derivatives. */ for (iw = 0; iw < wlen; iw++) gw[iw] += dwt[iw]; } #ifdef TESTING /* If the file is compiled with -DTESTING, true and approximate gradient values are printed. The values should be very close; examples (gw: actual gradient; dwt: approximate gradient): gw[0]: -0.127655; dwt[0]: -0.127599 gw[1]: -0.939811; dwt[1]: -0.939935 gw[2]: -0.218290; dwt[2]: -0.218108 gw[3]: -1.097668; dwt[3]: -1.097396 */ mlp_check_gradient(dwt, pw, parameters); fprintf(stdout, "perceptron->w: weights; pw: point; gw: gradient; dwt: app. gradient\n"); for (iw = 0; iw < wlen; iw++) fprintf(stdout, "perceptron->w[%d]: %f; pw[%d]: %f; gw[%d]: %f; dwt[%d]: %f\n", iw, perceptron->w[iw], iw, pw[iw], iw, gw[iw], iw, dwt[iw]); pwait(); #endif if (fdbg) for (iw = 0; iw < wlen; iw++) fprintf(fdbg, "mlp_gradient(); leaving; perceptron->w[%d]: %f; pw[%d]: %f; " "gw[%d]: %f; dwt[%d]: %f\n", iw, perceptron->w[iw], iw, pw[iw], iw, gw[iw], iw, dwt[iw]); free(dwt); } static void mlp_dgradient(double *pw, int n, double *gw, void *parameters, int *errc) { mlp_gradient(gw, pw, parameters); } /* Calculate outputs for MLP defined in 'perceptron' for a given input 'x'. The function allocates space for the returned vector - it is the caller's responsiblity to free() it. */ float *mlp_output(struct mlp *perceptron, float *x) { int i; int nlayers; int *npl; int c; int cix; int iz; float *z; float *output; nlayers = perceptron->nlayers; npl = perceptron->npl; c = perceptron->npl[nlayers-1]; output = malloc(c*sizeof(float)); z = perceptron->z; cix = ivec_sum(npl, nlayers-1); /* cix is index of first output node */ calculate_network(perceptron, x); iz = cix; for (i = 0; i < c; i++) output[i] = z[iz++]; return output; } /* Testing function. Calculates derivatives dw[1..wlen] numerically. The function has signature and semantics identical to mlp_derivative(). */ static void mlp_check_derivative(float *wt, float *dw, struct mlp *perceptron, float **x, int nsamples, float **t, FILE *outdev, FILE *fdbg) { int i; int n; int wlen; float epsilon; float depsilon; float error_plus; float error_minus; float factor; float wx; float w; factor = 0.001; wlen = perceptron->wlen; for (i = 0; i < wlen; i++) perceptron->w[i] = wt[i+1]; /* This loop used for gdb debugging only. error_plus = 0; for (n = 0; n < nsamples; n++) error_plus += mlp_check_criterion(perceptron, x[n], t[n]); */ /* Calculate approximate derivatives by perturbing each weight in turn. Use formula (4.47) from Bishop. Note that Numerical Recipes function frprmn() requires that the derivatives be placed in dw[1..wlen]. */ for (i = 0; i < wlen; i++) dw[i+1] = 0.0; for (n = 0; n < nsamples; n++) for (i = 0; i < wlen; i++) { wx = perceptron->w[i]; /*epsilon = wx*factor;*/ epsilon = factor; depsilon = epsilon+epsilon; w = wx+epsilon; perceptron->w[i] = w; error_plus = mlp_check_criterion(perceptron, x[n], t[n]); w -= depsilon; perceptron->w[i] = w; error_minus = mlp_check_criterion(perceptron, x[n], t[n]); dw[i+1] += (error_plus-error_minus)/depsilon; perceptron->w[i] = wx; } } /* Return the number of MLPs in 'fptr'. */ static int get_nmlp(FILE *fptr, char *line, int llen) { int nmlp = 0; while (fgets(line, llen+2, fptr)) if (strncmp(line, MLP_SIGNATURE_STRING, strlen(MLP_SIGNATURE_STRING)) == 0) nmlp++; rewind(fptr); return nmlp; } /* Convert mnemonic optimization algorithm name 'name' into numeric code. */ static int convert_opt_method(char *name) { int code; code = MLP_OPT_CGPLUS; if (name) { if (strstr(name, MLP_STR_CGPLUS)) code = MLP_OPT_CGPLUS; else if (strstr(name, MLP_STR_GRADIENT_DESCENT)) code = MLP_OPT_GRADIENT_DESCENT; else if (strstr(name, MLP_STR_WNLIB)) code = MLP_OPT_WNLIB; } return code; } /* Load one or more multi-layer perceptrons described in 'fname' into (struct mlp **) array. The last element in the array is NULL. In case of success, returns 0. In case of failure, returns -1 and sets errno. */ struct mlp **mlp_load(char *fname) { int i; int j; int llen; int nl; int status; int idx; int imlp; int nmlp; int index; float model_weight; char *xtr; char *str; char **str_weights; FILE *fptr; struct mlp *perceptron = (struct mlp *) 0; struct mlp **perceptrons = (struct mlp **) 0; status = file_info(fname, &llen, (int *) 0, '\0'); if (status != -1) { fptr = fopen(fname, "r"); if (fptr != 0) { xtr = malloc(llen+2); str = malloc(llen+2); nmlp = get_nmlp(fptr, str, llen); if (nmlp > 0) { status = 0; perceptrons = calloc(nmlp+1, sizeof(struct mlp *)); if (!perceptrons) status = -1; for (imlp = 0; (imlp < nmlp) && (status == 0); imlp++) { perceptron = calloc(1, sizeof(struct mlp)); fscanf(fptr, "%40c%d/%f\n", xtr, &index, &model_weight); fscanf(fptr, "%40c%d\n", xtr, &perceptron->p_iter); fscanf(fptr, "%40c%s\n", xtr, str); perceptron->method = convert_opt_method(str); fscanf(fptr, "%40c%u\n", xtr, &perceptron->seed); fscanf(fptr, "%40c%f\n", xtr, &perceptron->range); fscanf(fptr, "%40c%d\n", xtr, &perceptron->mce); fscanf(fptr, "%40c%f\n", xtr, &perceptron->error); fscanf(fptr, "%40c%d\n", xtr, &perceptron->nlayers); nl = perceptron->nlayers; perceptron->npl = malloc(nl*sizeof(int)); perceptron->nodes = 0; for (i = 0; i < nl; i++) { fscanf(fptr, "%40c%d\n", xtr, &perceptron->npl[i]); perceptron->nodes += perceptron->npl[i]; } perceptron->noff = perceptron->nodes-perceptron->npl[nl-1]; perceptron->a = malloc(perceptron->nodes*sizeof(float)); perceptron->z = malloc(perceptron->nodes*sizeof(float)); perceptron->delta = malloc(perceptron->nodes*sizeof(float)); fscanf(fptr, "%40c%d\n", xtr, &perceptron->d); fscanf(fptr, "%40c%d\n", xtr, &perceptron->wlen); perceptron->w = malloc(perceptron->wlen*sizeof(float)); idx = 0; for (i = 0; i < perceptron->nodes; i++) { fgets(str, llen+2, fptr); str_weights = str_tokenize(str+PCP_HFT, WHITESPACE); j = 0; if (str_weights) { while (str_weights[j]) { perceptron->w[idx] = atof(str_weights[j]); idx++; j++; } str_free(str_weights); } } perceptrons[imlp]= perceptron; } } else { status = -1; errno = PERR_UNRECOGNIZED_MLP; } vx_free(xtr); vx_free(str); if (!status) status = fclose(fptr); else fptr_close(fptr); if (status) perceptrons = (struct mlp **) 0; } } return perceptrons; } /* The function calculates the value of error function for the perceptron described by 'perceptron' at weight vector 'w'. */ static double mlp_function(double *w, void *parameters) { int i; int status; int cix; int n; int c; int nlayers; int iter; int mce; int class; int mlp_class; int nclass; double dev; int *npl; float *z; float **t; struct mlp_parameters *mlp_params; struct mlp *perceptron; struct dataset *dset; mlp_params = (struct mlp_parameters *) parameters; mlp_params->errc = 0; perceptron = mlp_params->perceptron; dset = mlp_params->dset; t = mlp_params->t; /* Copy weight vector 'w' to 'perceptron'. perceptron->w is then used in calculations. */ for (i = 0; i < perceptron->wlen; i++) perceptron->w[i] = w[i]; nlayers = perceptron->nlayers; npl = perceptron->npl; c = perceptron->npl[nlayers-1]; z = perceptron->z; cix = ivec_sum(npl, nlayers-1); /* cix is index of first output node */ mce = 0; class = 0; /* actual class of sample 'n' */ nclass = dset->nd[0]; /* number of samples in categories 0..class */ dev = 0.0; for (n = 0; n < dset->nv; n++) { /* Calculate network outputs for sample 'n'. */ calculate_network(perceptron, dset->x[n]); /* Determine predicted sample class (i.e., output node with maximum output). */ mlp_class = fvec_argmax(&z[cix], c); /* Update cumulative error, defined as sum of squares of differences between output and target errors across all outputs and for all vectors. TBD: using double precision 'dev' variable to calculate the total deviation has a dramatic positive effect on the learning performance. Strange, perhaps to be further investigated. */ dev += output_error(&z[cix], t[n], c); /* Update misclassification count. */ if (class != mlp_class) mce++; if ((n == nclass-1)) { class++; if (class < c) nclass += dset->nd[class]; } } perceptron->error = dev/dset->nv; perceptron->mce = mce; iter = mlp_params->iteration; if ((iter%10 == 0) || (perceptron->itmax == iter)) { if (mlp_params->outdev) fprintf(mlp_params->outdev, "Epoch: %8d; avg. error per output node: %10.7f; error rate: %6.2f \n", perceptron->p_iter+iter, perceptron->error/c, 100.0*perceptron->mce/dset->nv); /*dev = dev/2.0;*/ /* The Great Bug - fixed 09/02/2004 */ status = mlp_save(perceptron, MLP_MODE_WRITE, 0, 0.0); if (status == -1) mlp_params->errc = errno; } perceptron->iterations = iter; dev = 0.5*dev; /* The Great Bug fix */ return dev; } /* Converter between the mlp_func typedef and mlp_function(). */ static double mlp_dfunction(double *w, int n, int iter, void *parameters, int *errc) { struct mlp_parameters *mlp_params; mlp_params = (struct mlp_parameters *) parameters; mlp_params->iteration = iter; return mlp_function(w, parameters); } /* TBD: horrible kludge. Merge it with mlp_function() if it starts working. ljb, 07/24/2004 */ static float mlp_simplex_function(float *w, int n, int iteration, void *parameters, int *errc) { int i; double *wd; struct mlp_parameters *mlp_params; mlp_params = (struct mlp_parameters *) parameters; mlp_params->iteration = iteration; wd = malloc(n*sizeof(double)); for (i = 0; i < n; i++) wd[i] = w[i]; return mlp_function(wd, parameters); free(wd); } static void set_parameters(struct mlp_parameters *parameters, struct dataset *dset, float **t, struct mlp *mlp_perceptron, FILE *outdev, FILE *fdbg) { parameters->iteration = mlp_perceptron->iterations; parameters->dset = dset; parameters->t = t; parameters->perceptron = mlp_perceptron; parameters->outdev = outdev; parameters->fdbg = fdbg; } /* Optimize weights of an MLP neural network using modified Polak-Ribiere conjugate gradient method. 't' are 'dset->nv' target output vectors. The function calculates the set of weights which optimize the square error and stores them in mlp->w, and saves them in mlp->fname. It periodically reports results on 'outdev'. In case of success, return 0, otherwise return -1 and set error code in 'errc'. */ int mlp_optimize(struct dataset *dset, float **t, struct mlp *perceptron, FILE *outdev, int *errc, FILE *fdbg) { int iflag; int iter; int status; int irest; int method; double eps; double *vect; struct mlp_parameters *parameters; status = 0; parameters = calloc(1, sizeof(struct mlp_parameters)); set_parameters(parameters, dset, t, perceptron, outdev, fdbg); vect = double_clone(perceptron->w, perceptron->wlen); eps = 1.0e-5; method = 2; /* 1: FR; 2: PR; 3: PR+ */ irest = 1; cgfam(perceptron->wlen, vect, mlp_dfunction, mlp_dgradient, irest, eps, method, (void *) parameters, perceptron->itmax, &iter, &iflag, errc); if (errc) { if (iflag == -3) *errc = EINVAL; else if (iflag == -2) *errc = LERR_DESCENT; else if (iflag == -1) *errc = LERR_LNSEARCH; else *errc = 0; } if (errc && (*errc != 0)) status = -1; free(vect); free(parameters); return status; } /* Memory allocator for mlp_optimize_gradient_descent(). Return -1 in case of failure. */ static int mlp_gradient_descent_alloc(struct mlp_parameters **parameters, int length, double **gw, double **pw, double **deltaw) { int status; status = -1; *parameters = calloc(1, sizeof(struct mlp_parameters)); if (*parameters) { *gw = malloc(length*sizeof(double)); if (*gw) { *pw = calloc(length, sizeof(double)); if (*pw) { *deltaw = calloc(length, sizeof(double)); if (*deltaw) status = 0; } } } return status; } /* Optimize weights of an MLP neural network using gradient descent algorithm defined by step size 'eta' and using momentum term defined by 'mu'. 't' are 'dset->nv' target output vectors. The function calculates the set of weights which optimize the square error and stores them in mlp->w, and saves them in mlp->fname. It periodically reports results on 'outdev'. In case of success, return 0, otherwise return -1 and set error code in 'errc'. */ int mlp_optimize_gradient_descent(struct dataset *dset, float **t, struct mlp *perceptron, float eta, float mu, FILE *outdev, int *errc, FILE *fdbg) { int i; int c; int nlayers; int status; int iter; double *gw; double *pw; double *deltaw; struct mlp_parameters *parameters; status = 0; parameters = (struct mlp_parameters *) 0; gw = (double *) 0; pw = (double *) 0; deltaw = (double *) 0; if (dset && perceptron) { nlayers = perceptron->nlayers; c = perceptron->npl[nlayers-1]; status = mlp_gradient_descent_alloc(¶meters, perceptron->wlen, &gw, &pw, &deltaw); if (!status) { set_parameters(parameters, dset, t, perceptron, outdev, fdbg); dvec_fcopy(pw, perceptron->w, perceptron->wlen); iter = 0; while (iter <= perceptron->itmax) { mlp_gradient(gw, pw, parameters); if (mu != 0.0) for (i = 0; i < perceptron->wlen; i++) perceptron->w[i] += mu*deltaw[i]; for (i = 0; i < perceptron->wlen; i++) perceptron->w[i] -= eta*gw[i]; if (mu != 0.0) for (i = 0; i < perceptron->wlen; i++) deltaw[i] = perceptron->w[i]-pw[i]; dvec_fcopy(pw, perceptron->w, perceptron->wlen); mlp_function(pw, parameters); iter++; parameters->iteration = iter; } status = mlp_save(perceptron, MLP_MODE_WRITE, 0, 0.0); } else if (errc) *errc = errno; } else if (errc) *errc = errno; free(parameters); free(gw); free(pw); free(deltaw); return status; } static int mlp_simplex_alloc(float **fval, float ***smx, int n) { int i; int status; float **mx; status = 0; mx = malloc((n+1)*sizeof(float *)); if (mx) { for (i = 0; (i < n+1) && !status; i++) { mx[i] = malloc(n*sizeof(float)); if (!mx[i]) status = -1; } if (!status) { *fval = malloc((n+1)*sizeof(float)); if (!*fval) status = -1; else *smx = mx; } } else status = -1; return status; } /* Optimize weights of an MLP neural network using Simplex algorithm of Nelder and Mead. 't' are 'dset->nv' target output vectors. The function calculates the set of weights which optimize the square error and stores them in mlp->w, and saves them in mlp->fname. It periodically reports results on 'outdev'. In case of success, return 0, otherwise return -1 and set error code in 'errc'. */ static int mlp_optimize_simplex(struct dataset *dset, float **t, struct mlp *perceptron, FILE *outdev, int *errc, FILE *fdbg) { int i; int j; int iter; int status; int ndim; float ftol; float lambda; float *fval; float **smx; struct mlp_parameters *parameters; status = 0; ndim = perceptron->wlen; ftol = 1e-6; status = 0; parameters = calloc(1, sizeof(struct mlp_parameters)); set_parameters(parameters, dset, t, perceptron, outdev, fdbg); status = mlp_simplex_alloc(&fval, &smx, ndim); if (!status) { for (i = 0; i < ndim; i++) smx[0][i] = (2*perceptron->range*rand()/(RAND_MAX+1.0))-perceptron->range; fval[0] = mlp_simplex_function(smx[0], ndim, 0, parameters, errc); if (*errc == 0) { /* Initialize the remaining ndim vertices of the simplex, using equation (10.4.1) from Numerical Recipes in C. */ lambda = 1.0; /* TBD: hardcoded for now. */ for (i = 1; (i < ndim+1) && (*errc == 0); i++) { for (j = 0; j < ndim; j++) smx[i][j] = smx[0][j]; smx[i][i-1] += lambda; fval[i] = mlp_simplex_function(smx[i], ndim, 0, parameters, errc); } simplex(smx, ndim, fval, mlp_simplex_function, (void *) parameters, perceptron->itmax, ftol, &iter, errc); if (errc && *errc) status = -1; } else status = -1; } free(parameters); return status; } /* Optimize weights of an MLP neural network using up to 'itmax' iterations of 'opt_method' optimization method. 'x' are 'nsamples' input vectors of length 'd' with 'nd[i]' samples per class 'i'. 't' are 'nsamples' target output vectors. 'eta' and 'mu' are the step size and momentum parameters for gradient descent optimization (therefore used only if 'opt_method' is MLP_OPT_GRADIENT_DESCENT). 'mlp_continue' is 0 to begin learning, 1 to continue starting with network in 'fname'. The function calculates the set of weights which optimize the square error and stores them in 'fname'. It periodically reports results on 'outdev'. The possible values for opt_method are defined in mlp.h. In case of success, the function returns 'mlp' structure which contains the trained neural network and sets 'errc' to 0. In case of error, return NULL and set 'errc', except for LERR_LNSEARCH (line search failure in optimization routine) and LERR_DESCENT (optimization procedure unable to find descent direction) errors. In those cases, the function returns the trained network, but sets 'errc' to LERR_LNSEARCH or LERR_DESCENT. This is done because optimization function often reports these errors, and yet finds a decent minimum. The caller may choose to ignore these two error codes. */ struct mlp *mlp_learn(int opt_method, float **x, int nsamples, int *nd, int d, float **t, int nlayers, int *npl, int itmax, float range, float eta, float mu, FILE *outdev, int mlp_continue, char *fname, unsigned int seed, int *errc, FILE *fdbg) { int status; int i; int wlen; int nodes; int c; float *w; float *dw; struct dataset *dset; struct mlp *perceptron = (struct mlp *) 0; struct mlp_parameters *parameters; struct mlp **perceptrons; c = 0; status = 0; if (errc) *errc = 0; /* If begin learning, initialize 'perceptron' pseudo-randomly, otherwise read it from 'fname'. */ if (mlp_continue == 1) { perceptrons = mlp_load(fname); if (perceptrons) { perceptron = perceptrons[0]; perceptron->fname = strdup(fname); perceptron->itmax = itmax; wlen = perceptron->wlen; w = malloc((wlen+1)*sizeof(float)); for (i = 0; i < wlen; i++) w[i] = perceptron->w[i]; } } else { c = npl[nlayers-1]; perceptron = calloc(1, sizeof(struct mlp)); perceptron->itmax = itmax; perceptron->method = opt_method; if (fname && *fname) perceptron->fname = strdup(fname); wlen = (d+1)*npl[0]; for (i = 0; i < nlayers-1; i++) wlen += (npl[i]+1)*npl[i+1]; /* Allocate space for weights and derivatives vectors. Use 'wlen+1' because 'w' and 'dw' are 1-based vectors, as required by frprmn() API. */ w = malloc((wlen+1)*sizeof(float)); dw = malloc((wlen+1)*sizeof(float)); perceptron->d = d; perceptron->nlayers = nlayers; nodes = 0; for (i = 0; i < nlayers; i++) nodes += npl[i]; perceptron->nodes = nodes; perceptron->npl = malloc(nlayers*sizeof(int)); for (i = 0; i < nlayers; i++) perceptron->npl[i] = npl[i]; perceptron->a = malloc(nodes*sizeof(float)); perceptron->z = malloc(nodes*sizeof(float)); perceptron->delta = malloc(nodes*sizeof(float)); perceptron->range = range; perceptron->seed = seed; perceptron->wlen = wlen; perceptron->w = malloc(wlen*sizeof(float)); for (i = 0; i < wlen; i++) w[i] = (2*perceptron->range*rand()/(RAND_MAX+1.0))-perceptron->range; for (i = 0; i < wlen; i++) perceptron->w[i] = w[i]; if (fdbg) for (i = 0; i < wlen; i++) fprintf(fdbg, "mlp_learn(); perceptron->w[%d]: %12.5g\n", i, perceptron->w[i]); } if (outdev) fprintf(outdev, "\n"); parameters = calloc(1, sizeof(struct mlp_parameters)); dset = dataset_lt(d, c, nd, nsamples, (char **) 0, x); set_parameters(parameters, dset, t, perceptron, outdev, fdbg); if (*errc != 0) { perceptron = mlp_free(perceptron); status = -1; } else if (itmax > 0) { perceptron->method = opt_method; if (opt_method == MLP_OPT_SIMPLEX) status = mlp_optimize_simplex(dset, t, perceptron, outdev, errc, fdbg); else if (opt_method == MLP_OPT_CGPLUS) status = mlp_optimize(dset, t, perceptron, outdev, errc, fdbg); else if (opt_method == MLP_OPT_GRADIENT_DESCENT) status = mlp_optimize_gradient_descent(dset, t, perceptron, eta, mu, outdev, errc, fdbg); if (status == -1) { if (errc && (*errc != LERR_LNSEARCH) && (*errc != LERR_DESCENT)) perceptron = mlp_free(perceptron); } } return perceptron; } /* Accept input parameters from the user and pass them to MLP learning function mlp_learn(). In case of error, set 'errc'. If error is file access error, set 'xname'. */ void p_mlp_learn(int *errc, char **xname, int *dbg) { int status; int opt_method; int mlp_continue; int mlp_nlayers; /* mlp_nlayers */ int mlp_itmax; /* mlp_itmax */ float mlp_range; /* mlp_range */ float eta; float mu; int *mlp_npl; /* mlp_npl */ char *mlp_fname; /* mlp_fname */ float **t; unsigned int mlp_seed; struct mlp *perceptron; FILE *outdev; FILE *fdbg = (FILE *) 0; *errc = 0; if (tds) { outdev = stdout; fflush(outdev); if (*dbg > 0) fdbg = fopen(PCP_DBG, "a"); t = mlp_target(tds->c, tds->nd); mlp_continue = 0; status = input_mlp(outdev, tds->d, tds->c, tds->nv, &mlp_continue, &mlp_nlayers, &mlp_npl, &mlp_itmax, &mlp_range, &opt_method, &eta, &mu, (int *) 0, 0, (int *) 0, &mlp_seed, &mlp_fname); if (status == 0) { inverse_video(); srand(mlp_seed); perceptron = mlp_learn(opt_method, tds->x, tds->nv, tds->nd, tds->d, t, mlp_nlayers, mlp_npl, mlp_itmax, mlp_range, eta, mu, outdev, mlp_continue, mlp_fname, mlp_seed, errc, fdbg); reset_video(); if (!perceptron) status = -1; else if (mlp_itmax != -1) { printf("\n"); print_line(stdout, "Done.", PCP_QLEN); pwait(); } } if (status == -1) { *errc = errno; *xname = strdup(mlp_fname); } } else *errc = PERR_UNDEFINED_TDS; } /* Classify dataset 'dset' using MLP defined in 'fname', for dataset 'dset'. The MLP outputs are placed in the returned struct mlp, and the classification results are placed in dset->cx. The function assigns each vector to the class corresponding to the node with the largest output, unless 'thd' is > 0. In that case, the assignment is performed only if the largest node output exceeds 'thd'. In case of error, return NULL and place error code in 'errc'. */ static struct mlp *recall(struct dataset *dset, float thd, char *fname, int *errc) { struct mlp **perceptrons; struct mlp *perceptron; *errc = 0; perceptron = (struct mlp *) 0; /* Read MLP from 'fname' into mlp structure. */ perceptrons = mlp_load(fname); if (perceptrons) perceptron = mlp_predict(dset, perceptrons, thd, errc); else *errc = errno; return perceptron; } /* Classify dataset 'dset' using MLP defined in 'perceptrons'. The MLP outputs are placed in the returned struct mlp, and the classification results are placed in dset->cx. The function assigns each vector to the class corresponding to the node with the largest output, unless 'thd' is > 0. In that case, the assignment is performed only if the largest node output exceeds 'thd'. 'perceptrons' is an array of 'struct mlp' objects. It can be created, for example, from a file using calling mlp_load(). In case of error, return NULL and place error code in 'errc'. */ struct mlp *mlp_predict(struct dataset *dset, struct mlp **perceptrons, float thd, int *errc) { int i; int j; int nl; int noff; int imax; int icx; int nmlp; int imlp; int nout; float cmax; float *mlp_output; struct mlp *perceptron; perceptron = (struct mlp *) 0; if (errc && perceptrons) { *errc = 0; nmlp = 0; while (perceptrons[nmlp]) nmlp++; perceptron = perceptrons[0]; nl = perceptron->nlayers-1; /* Check that the number of inputs equals number of features in 'dset', and, in case the number of classes in dset is greater than 1, that it equals the number of outputs in the network. Otherwise the network and the dataset are incompatible. If the number of classes in 'dset' is 1, it means that the dataset is unlabeled, which is OK. */ if (((dset->c == 1) || ((dset->c > 1) && (perceptron->npl[nl] == dset->c))) && (perceptron->d == dset->d)) { /* For each MLP, calculate outputs and store results in dset->prediction. */ dset->prediction = malloc(dset->nv*sizeof(int)); if (dset->prediction) { noff = perceptron->noff; nl = perceptron->nlayers-1; nout = perceptron->npl[nl]; mlp_output = malloc(nout*sizeof(float)); for (i = 0; i < dset->nv; i++) { fvec_set(mlp_output, nout, 0.0); for (imlp = 0; imlp < nmlp; imlp++) { calculate_network(perceptrons[imlp], dset->x[i]); for (j = 0; j < nout; j++) mlp_output[j] += perceptron->z[noff+j]; } /* For nmlp > 1 (MLP bagging classifier), voting is done by finding average outputs from all nmlp classifiers, then assigning the class corresponding to the largest output. This is consistent with the rule recommended in: Amanda J. C. Sharkey (Ed.), Combining Artificial Neural Nets, Chapter 3, Section 3.2.1. Springer, London, 1999. */ for (j = 0; j < nout; j++) mlp_output[j] /= nmlp; cmax = mlp_output[0]; imax = 0; for (j = 1; j < nout; j++) { if (cmax < mlp_output[j]) { imax = j; cmax = mlp_output[j]; } } dset->prediction[i] = imax; icx = dataset_class(i, teds->c, teds->nd); } free(mlp_output); } else *errc = errno; } else *errc = PERR_INCONSISTENT_MLP; if (*errc) { for (i = 0; i < nmlp; i++) mlp_free(perceptrons[i]); free(perceptrons); perceptron = (struct mlp *) 0; } } else if (errc) *errc = EINVAL; return perceptron; } void p_mlp_predict(int *errc, char **xname) { int verbose; int min_range; int max_range; int i; float thd; char *mlp_fname; char *output_fname; FILE *outdev; FILE *fptr; struct mlp *perceptron; clear_screen(); outdev = stdout; cursor_on(); mlp_fname = input_filename(PCP_UMSG_FNAME_MLP, PCP_MLP, outdev); fptr = fopen(mlp_fname, "r"); if (fptr) { fclose(fptr); output_fname = strdup(PCP_RCL); fptr = fopen(output_fname, "w"); if (fptr) { thd = 0.0; thd = input_float(stdin, outdev, PCP_UMSG_DTHD, PCP_QLEN, &thd, (float *) 0, (float *) 0); perceptron = recall(teds, thd, mlp_fname, errc); if (perceptron) { min_range = 0; max_range = 1; verbose = input_integer(stdin, outdev, OUTPUT_MSG, PCP_QLEN, &min_range, &min_range, &max_range); /* Store predictions in PCP_RCL. */ for (i = 0; i < teds->nv; i++) write_rcl(fptr, i, teds, tds); if (!fclose(fptr)) { /* Display results. */ predict_disp(teds, verbose, PALG_MLP); pwait(); } else { *errc = errno; *xname = strdup(output_fname); } } else *xname = strdup(mlp_fname); } else { *errc = errno; *xname = strdup(output_fname); } } else { *errc = errno; *xname = strdup(mlp_fname); } }