/***************************************************************************** FILE : $Source: /projects/higgs1/SNNS/CVS/SNNS/kernel/sources/scaled_conj_grad.c,v $ SHORTNAME : scaled_conj_grad.c SNNS VERSION : 4.2 PURPOSE : SNNS-Kernel Learning Functions NOTES : with following learning functions: - SCG AUTHOR : Bruno ORSIER DATE : 09.95 VERSION : 1.01 CHANGES :1.01 -> 1.02 - bug fix: in case the first computed gradient is null, norm_of_p_2 = |p|^2 is equal to zero and then a division by zero occurs in the computation of sigma. The 4th learning parameter, tolerance, is now also used to check whether |p| is almost equal to zero. If the answer is yes, SCG stops. SCG also now stops if the norm of the gradient is smaller than tolerance. 1.0 -> 1.01 new termination criterion, based on the book 'Numerical Recipes'. Takes into account the floating-point precision of the machine. (new #define TOLERANCE in .ph) new learning parameters with defaults Validation-function by Thomas Gern ******************************************************************************/ #include #include #include #include #ifdef HAVE_VALUES_H #include #endif #include "kr_typ.h" /* Kernel Types and Constants */ #include "kr_const.h" /* Constant Declarators for SNNS-Kernel */ #include "kernel.h" /* kernel function prototypes */ #include "kr_mac.h" /* Kernel Macros */ #include "glob_typ.h" #include "learn_f.h" #include "scaled_conj_grad.ph" #include "kr_newpattern.h" #ifdef SCG_DEBUG #define TRACE(s) printf s #else #define TRACE(s) #endif /***************************************************************************** FUNCTION : LEARN_SCG PURPOSE : main function for SCG learning algorithm RETURNS : kernel error code NOTES : UPDATE : ******************************************************************************/ krui_err LEARN_SCG(int start_pattern, int end_pattern, float *parameterInArray, int NoOfInParams, float **parameterOutArray, int *NoOfOutParams) { static float OutParameter[1]; /* OutParameter[0] stores the * learning error. */ static int k, restart_scg = FALSE, stop_scg = FALSE, success; static int count_under_tol ; /* scg stops after 3 consecutive under tolerance cases. This is the counter. */ static float delta, norm_of_p_2, lambda, lambda_bar, current_error, old_error, norm_of_rk ; static FlintType* *weights ; static FlintType *old_gradient, *p, *r, *old_weights, *step ; register FlagWord flags; register struct Link *link_ptr; register struct Unit *unit_ptr; register struct Site *site_ptr; int i, ret_code ; int start_scg, make_allocations, under_tolerance ; float sigma, mu, alpha, grand_delta, beta ; float sigma_1, lambda_1, tolerance ; /* learning parameters */ if (NoOfInParams < 4) return (KRERR_PARAMETERS); /* Not enough input parameters */ /* DEFAULTS: */ if (( sigma_1 = LEARN_PARAM1( parameterInArray )) == 0.0) sigma_1 = SCG_FIRST_SIGMA ; if (( lambda_1 = LEARN_PARAM2( parameterInArray )) == 0.0) lambda_1 = SCG_FIRST_LAMBDA ; /* param 3 is DELTA_max */ if (( tolerance = LEARN_PARAM4( parameterInArray )) == 0.0) tolerance = SCG_TOLERANCE ; *NoOfOutParams = 1; /* One return value is available (the * learning error) */ *parameterOutArray = OutParameter; /* set the output parameter reference */ ret_code = KRERR_NO_ERROR; /* reset return code */ start_scg = NetModified || NetInitialize || LearnFuncHasChanged ; make_allocations = NetModified || LearnFuncHasChanged ; if (NetModified) { /* Net has been modified */ /* count the no. of I/O units and check the patterns */ ret_code = kr_IOCheck(); if (ret_code < KRERR_NO_ERROR) return (ret_code); /* sort units by topology and by topologic type */ ret_code = kr_topoSort(TOPOLOGICAL_FF); if ((ret_code != KRERR_NO_ERROR) && (ret_code != KRERR_DEAD_UNITS)) return (ret_code); NetModified = FALSE; } if (make_allocations) { int ts = 0, malloc_size ; /* store pointers to weigths and biases into a table */ /* 1) determines the size of the table */ scg_space_size = 0 ; FOR_ALL_UNITS(unit_ptr) if (!IS_INPUT_UNIT(unit_ptr) && !IS_SPECIAL_UNIT(unit_ptr)) { flags = unit_ptr->flags; scg_space_size++ ; /* bias */ if ((flags & UFLAG_IN_USE) == UFLAG_IN_USE) { /* unit is in use */ if (flags & UFLAG_SITES) { /* unit has sites */ FOR_ALL_SITES_AND_LINKS(unit_ptr, site_ptr, link_ptr) scg_space_size++; } else { /* unit has no sites */ if (flags & UFLAG_DLINKS) { /* unit has direct links */ FOR_ALL_LINKS(unit_ptr, link_ptr) scg_space_size++; } } } } TRACE(("SCG - weights+biases space size : %d\n",scg_space_size)); /* allocate space for all vectors of size scg_space_size */ malloc_size = scg_space_size*(sizeof(FlintType *)) ; if (weights != NULL) free(weights) ; weights = (FlintType **)malloc(malloc_size) ; if (scg_gradient != NULL) free(scg_gradient) ; scg_gradient = (FlintType **)malloc(malloc_size) ; if (step != NULL) free(step) ; step = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ; if (p != NULL) free(p) ; p = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ; if (r != NULL) free(r) ; r = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ; if (old_weights != NULL) free(old_weights) ; old_weights = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ; if (old_gradient != NULL) free(old_gradient) ; old_gradient = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ; if ((step == NULL)||(scg_gradient == NULL)||(weights == NULL) ||(p==NULL)||(r==NULL) ||(old_weights==NULL) ||(old_gradient ==NULL)) { printf("SCG : malloc problem\n") ; return(KRERR_CRITICAL_MALLOC); } else TRACE(("SCG - malloc's done\n")); /* 2) store the adresses */ FOR_ALL_UNITS(unit_ptr) if (!IS_INPUT_UNIT(unit_ptr) && !IS_SPECIAL_UNIT(unit_ptr)) { flags = unit_ptr->flags; scg_gradient[ts] = &unit_ptr->value_a ; weights[ts++] = &unit_ptr->bias; if ((flags & UFLAG_IN_USE) == UFLAG_IN_USE) { /* unit is in use */ if (flags & UFLAG_SITES) { /* unit has sites */ FOR_ALL_SITES_AND_LINKS(unit_ptr, site_ptr, link_ptr) { scg_gradient[ts] = &link_ptr->value_a ; weights[ts++] = &link_ptr->weight; } } else { /* unit has no sites */ if (flags & UFLAG_DLINKS) { /* unit has direct links */ FOR_ALL_LINKS(unit_ptr, link_ptr) { scg_gradient[ts] = &link_ptr->value_a ; weights[ts++] = &link_ptr->weight; } } } } } } /* make_allocations */ if (start_scg || restart_scg) { /* now we choose initial values for SCG parameters */ lambda = lambda_1 ; lambda_bar = 0.0 ; success = TRUE ; k = 1 ; restart_scg = FALSE ; count_under_tol = 0 ; TRACE(("START || RESTART\n")); } TRACE(("\nSCG K=%d\n",k)); if (start_scg) { TRACE(("SCG - starting\n")); /* initialize p and r */ /* we have starting weight vector (initialized through xgui or snnsbat) and we need the current gradient : we do as in LEARN_backpropBatch */ ret_code = compute_gradient(start_pattern,end_pattern, LEARN_PARAM3(parameterInArray), ¤t_error) ; if(ret_code != KRERR_NO_ERROR) return (ret_code); /* now we have the gradient in vector 'gradient' and copy it to p and r */ for (i=0 ; i < scg_space_size ; i++) { p[i] = - *scg_gradient[i] ; r[i] = p[i] ; } norm_of_rk = sqrt(square_of_norm(r,scg_space_size)) ; start_scg = FALSE ; stop_scg = FALSE ; } /* end of start scg */ if (stop_scg) { OutParameter[0] = current_error ; return (ret_code); } /* main part of SCG */ if (success) {/* calculate second order information */ TRACE(("success\n")) ; norm_of_p_2 = square_of_norm(p,scg_space_size) ; TRACE((" norm_of_p_2=%e ", norm_of_p_2)) ; if (norm_of_p_2 <= tolerance*tolerance) { stop_scg = 1 ; OutParameter[0] = current_error ; k++; return(ret_code) ; } sigma = sigma_1/sqrt(norm_of_p_2) ; /* in order to compute the new step we need a need gradient. */ for (i=0 ; i < scg_space_size ; i++) { old_gradient[i] = *scg_gradient[i] ; old_weights[i] = *weights[i]; } old_error = current_error ; TRACE(("old_error=%e ", old_error)) ; /* now we move to the new point in weight space */ for (i=0 ; i < scg_space_size ; i++) *weights[i] += sigma*p[i] ; /* and compute the new gradient */ ret_code = compute_gradient(start_pattern,end_pattern, LEARN_PARAM3(parameterInArray), ¤t_error) ; if(ret_code != KRERR_NO_ERROR) return (ret_code); TRACE(("current_error = %e",current_error)) ; /* now we have the new gradient and we continue the step computation */ for (i=0 ; i < scg_space_size ; i++) step[i] = (*scg_gradient[i]-old_gradient[i])/sigma ; delta = product_of_xt_by_y(p,step,scg_space_size) ; TRACE(("\n delta=%e\n",delta)) ; TRACE(("end of success true\n")); } /* end of if (success) */ /* scale delta */ delta += (lambda - lambda_bar) * norm_of_p_2 ; TRACE(("delta scaled = %e\n", delta)); if (delta <=0) { /* make the Hessian positive definite */ lambda_bar = 2.0 * (lambda - delta/norm_of_p_2) ; delta = -delta + lambda*norm_of_p_2 ; lambda = lambda_bar ; TRACE(("hessian: l_bar=%e delta=%e lambda=%e\n",lambda_bar,delta,lambda)); } /* calculate step size */ mu = product_of_xt_by_y(p,r,scg_space_size) ; alpha = mu/delta ; TRACE(("mu=%e alpha=%e\n",mu,alpha)); /* calculate the comparison parameter */ /* we must compute a new gradient but this time we do not want to */ /* keep the previous values (they were useful only for */ /* approximating the Hessian )*/ for (i=0 ; i < scg_space_size ; i++) *weights[i]=old_weights[i]+alpha*p[i]; ret_code = compute_gradient(start_pattern,end_pattern, LEARN_PARAM3(parameterInArray), ¤t_error) ; if(ret_code != KRERR_NO_ERROR) return (ret_code); TRACE(("current error=%e\n",current_error)) ; grand_delta = 2.0*delta*(old_error-current_error)/(mu*mu) ; TRACE(("grand delta=%e\n",grand_delta)); if (grand_delta >= 0) { /* a successful reduction in error */ /* can be made */ float r_sum = 0.0 ; /* product of r(k+1) by r(k) */ float tmp ; TRACE(("ERROR REDUCTION of %e %% --- ", (old_error-current_error)/old_error*100.0)); under_tolerance = 2.0 * fabs(old_error-current_error) <= tolerance * (fabs(old_error)+fabs(current_error)+1E-10) ; TRACE(("under tolerance = %d\n",under_tolerance)); /* we already are at w(k+1) in weight space, so we don't move */ /* we compute |r(k)| before changing r to r(k+1) */ norm_of_rk = sqrt(square_of_norm(r,scg_space_size)) ; /* now, r <- r(k+1) */ for (i=0 ; i < scg_space_size ; i++) { tmp = -1.0 * *scg_gradient[i] ; r_sum += tmp * r[i] ; r[i] = tmp ; } lambda_bar = 0 ; success = TRUE ; if (k >= scg_space_size) { restart_scg = TRUE ; for (i=0 ; i < scg_space_size ; i++) p[i] = r[i] ; } else { /* compute new conjugate direction */ beta = (square_of_norm(r,scg_space_size) - r_sum)/mu ; TRACE(("beta=%e\n",beta)); for (i=0 ; i < scg_space_size ; i++) p[i] = r[i]+ beta*p[i] ; restart_scg = FALSE ; } if (grand_delta >=0.75) lambda = lambda/4.0 ; } else { /* no reduction can be made */ TRACE(("NO REDUCTION\n")); under_tolerance = 0 ; /* we must go back to w(k), since w(k)+alpha*p(k) is not better */ for (i=0 ; i < scg_space_size ; i++) *weights[i] = old_weights[i] ; current_error = old_error ; lambda_bar = lambda ; success = FALSE ; } if (grand_delta < 0.25) lambda = lambda+delta*(1-grand_delta)/norm_of_p_2 ; /* let's try to prevent floating-point exceptions. Lambda may become too big even with the under_tolerance criterion, in case of a several consecutive 'NO REDUCTION'. */ if (lambda > MAXFLOAT) lambda = MAXFLOAT ; TRACE(("lambda after resizing=%e\n",lambda)) ; TRACE(("|r(k)| = %e\n",norm_of_rk)) ; /* scg stops after 3 consecutive under tolerance steps */ if (under_tolerance) count_under_tol++ ; else count_under_tol = 0 ; stop_scg = (count_under_tol > 2) || (norm_of_rk <= tolerance) ; TRACE(("STOP SCG : count_under_tol=%d, stop_scg=%d\n", count_under_tol, stop_scg)) ; if (stop_scg) count_under_tol = 0 ; /* just in case the user wants to try with a smaller tolerance */ k++; OutParameter[0] = current_error ; return (ret_code); } /***************************************************************************** FUNCTION : product_of_xt_by_y PURPOSE : RETURNS : NOTES : UPDATE : ******************************************************************************/ static float product_of_xt_by_y(float * x, float * y, int tab_size) { int indice ; float sum = 0 ; for (indice = 0 ; indice < tab_size ; indice++) sum += x[indice]*y[indice] ; return(sum) ; } /***************************************************************************** FUNCTION : square_of_norm PURPOSE : RETURNS : NOTES : UPDATE : ******************************************************************************/ static float square_of_norm(float * x, int tab_size) { return(product_of_xt_by_y(x,x,tab_size)); } /***************************************************************************** FUNCTION : compute_gradient PURPOSE : computes the SCG gradient RETURNS : kernel error code NOTES : UPDATE : ******************************************************************************/ static krui_err compute_gradient(int start_pattern, int end_pattern, float delta_max, float * error) { int pattern_no, sub_pat_no, i ; clearDeltas(); /* compute the necessary sub patterns */ KernelErrorCode = kr_initSubPatternOrder(start_pattern,end_pattern); if(KernelErrorCode != KRERR_NO_ERROR) return (KernelErrorCode); *error = 0.0; /* reset network error value */ while(kr_getSubPatternByOrder(&pattern_no,&sub_pat_no)){ propagateNetForward(pattern_no,sub_pat_no); /* Forward propagation */ *error += propagateNetBackwardBatch(pattern_no,sub_pat_no, delta_max, NULL, 0); /* -0.5 * gradient's coordinates are stored in value_a's */ } for (i=0 ; i< scg_space_size ; i++) *scg_gradient[i] = - 2.0 * *scg_gradient[i]; return (KernelErrorCode); } /***************************************************************************** FUNCTION : TEST_SCG PURPOSE : tests the network using TEST_backprop. RETURNS : kernel error code NOTES : saves the value of NetModified for initialising LEARN_SCG UPDATE : 18.10.95 by Thomas Gern ******************************************************************************/ krui_err TEST_SCG(int start_pattern, int end_pattern, float *parameterInArray, int NoOfInParams, float **parameterOutArray, int *NoOfOutParams) { bool OldNetModified; krui_err ret_val; OldNetModified = NetModified; ret_val = TEST_backprop(start_pattern, end_pattern, parameterInArray, NoOfInParams, parameterOutArray, NoOfOutParams); NetModified = OldNetModified; return ret_val; }