/*****************************************************************************
FILE : $Source: /projects/higgs1/SNNS/CVS/SNNS/kernel/sources/scaled_conj_grad.c,v $
SHORTNAME : scaled_conj_grad.c
SNNS VERSION : 4.2
PURPOSE : SNNS-Kernel Learning Functions
NOTES : with following learning functions:
- SCG
AUTHOR : Bruno ORSIER
DATE : 09.95
VERSION : 1.01
CHANGES :1.01 -> 1.02
- bug fix: in case the first computed gradient is null,
norm_of_p_2 = |p|^2 is equal to zero and then a division by
zero occurs in the computation of sigma. The 4th learning
parameter, tolerance, is now also used to check whether |p|
is almost equal to zero. If the answer is yes, SCG stops.
SCG also now stops if the norm of the gradient is smaller
than tolerance.
1.0 -> 1.01
new termination criterion, based on the book 'Numerical
Recipes'. Takes into account the floating-point precision of
the machine. (new #define TOLERANCE in .ph)
new learning parameters with defaults
Validation-function by Thomas Gern
******************************************************************************/
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef HAVE_VALUES_H
#include <values.h>
#endif
#include "kr_typ.h" /* Kernel Types and Constants */
#include "kr_const.h" /* Constant Declarators for SNNS-Kernel */
#include "kernel.h" /* kernel function prototypes */
#include "kr_mac.h" /* Kernel Macros */
#include "glob_typ.h"
#include "learn_f.h"
#include "scaled_conj_grad.ph"
#include "kr_newpattern.h"
#ifdef SCG_DEBUG
#define TRACE(s) printf s
#else
#define TRACE(s)
#endif
/*****************************************************************************
FUNCTION : LEARN_SCG
PURPOSE : main function for SCG learning algorithm
RETURNS : kernel error code
NOTES :
UPDATE :
******************************************************************************/
krui_err LEARN_SCG(int start_pattern, int end_pattern, float *parameterInArray,
int NoOfInParams, float **parameterOutArray,
int *NoOfOutParams)
{
static float OutParameter[1]; /* OutParameter[0] stores the
* learning error. */
static int k, restart_scg = FALSE,
stop_scg = FALSE,
success;
static int count_under_tol ; /* scg stops after 3 consecutive under
tolerance cases. This is the counter. */
static float delta, norm_of_p_2, lambda, lambda_bar, current_error,
old_error, norm_of_rk ;
static FlintType* *weights ;
static FlintType *old_gradient, *p, *r, *old_weights, *step ;
register FlagWord flags;
register struct Link *link_ptr;
register struct Unit *unit_ptr;
register struct Site *site_ptr;
int i, ret_code ;
int start_scg, make_allocations, under_tolerance ;
float sigma, mu, alpha, grand_delta, beta ;
float sigma_1, lambda_1, tolerance ; /* learning parameters */
if (NoOfInParams < 4)
return (KRERR_PARAMETERS); /* Not enough input parameters */
/* DEFAULTS: */
if (( sigma_1 = LEARN_PARAM1( parameterInArray )) == 0.0)
sigma_1 = SCG_FIRST_SIGMA ;
if (( lambda_1 = LEARN_PARAM2( parameterInArray )) == 0.0)
lambda_1 = SCG_FIRST_LAMBDA ;
/* param 3 is DELTA_max */
if (( tolerance = LEARN_PARAM4( parameterInArray )) == 0.0)
tolerance = SCG_TOLERANCE ;
*NoOfOutParams = 1; /* One return value is available (the
* learning error) */
*parameterOutArray = OutParameter; /* set the output parameter reference */
ret_code = KRERR_NO_ERROR; /* reset return code */
start_scg = NetModified || NetInitialize || LearnFuncHasChanged ;
make_allocations = NetModified || LearnFuncHasChanged ;
if (NetModified) { /* Net has been modified */
/* count the no. of I/O units and check the patterns */
ret_code = kr_IOCheck();
if (ret_code < KRERR_NO_ERROR)
return (ret_code);
/* sort units by topology and by topologic type */
ret_code = kr_topoSort(TOPOLOGICAL_FF);
if ((ret_code != KRERR_NO_ERROR) && (ret_code != KRERR_DEAD_UNITS))
return (ret_code);
NetModified = FALSE;
}
if (make_allocations) {
int ts = 0, malloc_size ;
/* store pointers to weigths and biases into a table */
/* 1) determines the size of the table */
scg_space_size = 0 ;
FOR_ALL_UNITS(unit_ptr)
if (!IS_INPUT_UNIT(unit_ptr) && !IS_SPECIAL_UNIT(unit_ptr)) {
flags = unit_ptr->flags;
scg_space_size++ ; /* bias */
if ((flags & UFLAG_IN_USE) == UFLAG_IN_USE) {
/* unit is in use */
if (flags & UFLAG_SITES) { /* unit has sites */
FOR_ALL_SITES_AND_LINKS(unit_ptr, site_ptr, link_ptr)
scg_space_size++;
} else { /* unit has no sites */
if (flags & UFLAG_DLINKS) {
/* unit has direct links */
FOR_ALL_LINKS(unit_ptr, link_ptr)
scg_space_size++;
}
}
}
}
TRACE(("SCG - weights+biases space size : %d\n",scg_space_size));
/* allocate space for all vectors of size scg_space_size */
malloc_size = scg_space_size*(sizeof(FlintType *)) ;
if (weights != NULL) free(weights) ;
weights = (FlintType **)malloc(malloc_size) ;
if (scg_gradient != NULL) free(scg_gradient) ;
scg_gradient = (FlintType **)malloc(malloc_size) ;
if (step != NULL) free(step) ;
step = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ;
if (p != NULL) free(p) ;
p = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ;
if (r != NULL) free(r) ;
r = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ;
if (old_weights != NULL) free(old_weights) ;
old_weights = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ;
if (old_gradient != NULL) free(old_gradient) ;
old_gradient = (FlintType *) malloc(scg_space_size*sizeof(FlintType)) ;
if ((step == NULL)||(scg_gradient == NULL)||(weights == NULL)
||(p==NULL)||(r==NULL) ||(old_weights==NULL) ||(old_gradient ==NULL))
{
printf("SCG : malloc problem\n") ;
return(KRERR_CRITICAL_MALLOC);
}
else TRACE(("SCG - malloc's done\n"));
/* 2) store the adresses */
FOR_ALL_UNITS(unit_ptr)
if (!IS_INPUT_UNIT(unit_ptr) && !IS_SPECIAL_UNIT(unit_ptr)) {
flags = unit_ptr->flags;
scg_gradient[ts] = &unit_ptr->value_a ;
weights[ts++] = &unit_ptr->bias;
if ((flags & UFLAG_IN_USE) == UFLAG_IN_USE) {
/* unit is in use */
if (flags & UFLAG_SITES) { /* unit has sites */
FOR_ALL_SITES_AND_LINKS(unit_ptr, site_ptr, link_ptr) {
scg_gradient[ts] = &link_ptr->value_a ;
weights[ts++] = &link_ptr->weight;
}
} else { /* unit has no sites */
if (flags & UFLAG_DLINKS) {
/* unit has direct links */
FOR_ALL_LINKS(unit_ptr, link_ptr) {
scg_gradient[ts] = &link_ptr->value_a ;
weights[ts++] = &link_ptr->weight;
}
}
}
}
}
} /* make_allocations */
if (start_scg || restart_scg) {
/* now we choose initial values for SCG parameters */
lambda = lambda_1 ;
lambda_bar = 0.0 ;
success = TRUE ;
k = 1 ;
restart_scg = FALSE ;
count_under_tol = 0 ;
TRACE(("START || RESTART\n"));
}
TRACE(("\nSCG K=%d\n",k));
if (start_scg) {
TRACE(("SCG - starting\n"));
/* initialize p and r */
/* we have starting weight vector (initialized through xgui or snnsbat)
and we need the current gradient : we do as in LEARN_backpropBatch */
ret_code = compute_gradient(start_pattern,end_pattern,
LEARN_PARAM3(parameterInArray),
¤t_error) ;
if(ret_code != KRERR_NO_ERROR) return (ret_code);
/* now we have the gradient in vector 'gradient' and copy it to p and r */
for (i=0 ; i < scg_space_size ; i++) {
p[i] = - *scg_gradient[i] ;
r[i] = p[i] ;
}
norm_of_rk = sqrt(square_of_norm(r,scg_space_size)) ;
start_scg = FALSE ;
stop_scg = FALSE ;
} /* end of start scg */
if (stop_scg) {
OutParameter[0] = current_error ;
return (ret_code);
}
/* main part of SCG */
if (success) {/* calculate second order information */
TRACE(("success\n")) ;
norm_of_p_2 = square_of_norm(p,scg_space_size) ;
TRACE((" norm_of_p_2=%e ", norm_of_p_2)) ;
if (norm_of_p_2 <= tolerance*tolerance) {
stop_scg = 1 ;
OutParameter[0] = current_error ;
k++;
return(ret_code) ;
}
sigma = sigma_1/sqrt(norm_of_p_2) ;
/* in order to compute the new step we need a need gradient. */
for (i=0 ; i < scg_space_size ; i++) {
old_gradient[i] = *scg_gradient[i] ;
old_weights[i] = *weights[i];
}
old_error = current_error ;
TRACE(("old_error=%e ", old_error)) ;
/* now we move to the new point in weight space */
for (i=0 ; i < scg_space_size ; i++)
*weights[i] += sigma*p[i] ;
/* and compute the new gradient */
ret_code = compute_gradient(start_pattern,end_pattern,
LEARN_PARAM3(parameterInArray),
¤t_error) ;
if(ret_code != KRERR_NO_ERROR) return (ret_code);
TRACE(("current_error = %e",current_error)) ;
/* now we have the new gradient and we continue the step computation */
for (i=0 ; i < scg_space_size ; i++)
step[i] = (*scg_gradient[i]-old_gradient[i])/sigma ;
delta = product_of_xt_by_y(p,step,scg_space_size) ;
TRACE(("\n delta=%e\n",delta)) ;
TRACE(("end of success true\n"));
} /* end of if (success) */
/* scale delta */
delta += (lambda - lambda_bar) * norm_of_p_2 ;
TRACE(("delta scaled = %e\n", delta));
if (delta <=0) { /* make the Hessian positive definite */
lambda_bar = 2.0 * (lambda - delta/norm_of_p_2) ;
delta = -delta + lambda*norm_of_p_2 ;
lambda = lambda_bar ;
TRACE(("hessian: l_bar=%e delta=%e lambda=%e\n",lambda_bar,delta,lambda));
}
/* calculate step size */
mu = product_of_xt_by_y(p,r,scg_space_size) ;
alpha = mu/delta ;
TRACE(("mu=%e alpha=%e\n",mu,alpha));
/* calculate the comparison parameter */
/* we must compute a new gradient but this time we do not want to */
/* keep the previous values (they were useful only for */
/* approximating the Hessian )*/
for (i=0 ; i < scg_space_size ; i++) *weights[i]=old_weights[i]+alpha*p[i];
ret_code = compute_gradient(start_pattern,end_pattern,
LEARN_PARAM3(parameterInArray),
¤t_error) ;
if(ret_code != KRERR_NO_ERROR) return (ret_code);
TRACE(("current error=%e\n",current_error)) ;
grand_delta = 2.0*delta*(old_error-current_error)/(mu*mu) ;
TRACE(("grand delta=%e\n",grand_delta));
if (grand_delta >= 0) { /* a successful reduction in error */
/* can be made */
float r_sum = 0.0 ; /* product of r(k+1) by r(k) */
float tmp ;
TRACE(("ERROR REDUCTION of %e %% --- ",
(old_error-current_error)/old_error*100.0));
under_tolerance =
2.0 * fabs(old_error-current_error)
<=
tolerance * (fabs(old_error)+fabs(current_error)+1E-10) ;
TRACE(("under tolerance = %d\n",under_tolerance));
/* we already are at w(k+1) in weight space, so we don't move */
/* we compute |r(k)| before changing r to r(k+1) */
norm_of_rk = sqrt(square_of_norm(r,scg_space_size)) ;
/* now, r <- r(k+1) */
for (i=0 ; i < scg_space_size ; i++) {
tmp = -1.0 * *scg_gradient[i] ;
r_sum += tmp * r[i] ;
r[i] = tmp ;
}
lambda_bar = 0 ;
success = TRUE ;
if (k >= scg_space_size) {
restart_scg = TRUE ;
for (i=0 ; i < scg_space_size ; i++) p[i] = r[i] ;
}
else { /* compute new conjugate direction */
beta = (square_of_norm(r,scg_space_size) - r_sum)/mu ;
TRACE(("beta=%e\n",beta));
for (i=0 ; i < scg_space_size ; i++) p[i] = r[i]+ beta*p[i] ;
restart_scg = FALSE ;
}
if (grand_delta >=0.75) lambda = lambda/4.0 ;
}
else {
/* no reduction can be made */
TRACE(("NO REDUCTION\n"));
under_tolerance = 0 ;
/* we must go back to w(k), since w(k)+alpha*p(k) is not better */
for (i=0 ; i < scg_space_size ; i++) *weights[i] = old_weights[i] ;
current_error = old_error ;
lambda_bar = lambda ;
success = FALSE ;
}
if (grand_delta < 0.25)
lambda = lambda+delta*(1-grand_delta)/norm_of_p_2 ;
/* let's try to prevent floating-point
exceptions. Lambda may become too big even
with the under_tolerance criterion, in case
of a several consecutive 'NO REDUCTION'. */
if (lambda > MAXFLOAT) lambda = MAXFLOAT ;
TRACE(("lambda after resizing=%e\n",lambda)) ;
TRACE(("|r(k)| = %e\n",norm_of_rk)) ;
/* scg stops after 3 consecutive under tolerance steps */
if (under_tolerance) count_under_tol++ ;
else count_under_tol = 0 ;
stop_scg = (count_under_tol > 2) || (norm_of_rk <= tolerance) ;
TRACE(("STOP SCG : count_under_tol=%d, stop_scg=%d\n",
count_under_tol, stop_scg)) ;
if (stop_scg) count_under_tol = 0 ; /* just in case the user wants to try
with a smaller tolerance */
k++;
OutParameter[0] = current_error ;
return (ret_code);
}
/*****************************************************************************
FUNCTION : product_of_xt_by_y
PURPOSE :
RETURNS :
NOTES :
UPDATE :
******************************************************************************/
static float product_of_xt_by_y(float * x, float * y, int tab_size)
{
int indice ;
float sum = 0 ;
for (indice = 0 ; indice < tab_size ; indice++)
sum += x[indice]*y[indice] ;
return(sum) ;
}
/*****************************************************************************
FUNCTION : square_of_norm
PURPOSE :
RETURNS :
NOTES :
UPDATE :
******************************************************************************/
static float square_of_norm(float * x, int tab_size)
{
return(product_of_xt_by_y(x,x,tab_size));
}
/*****************************************************************************
FUNCTION : compute_gradient
PURPOSE : computes the SCG gradient
RETURNS : kernel error code
NOTES :
UPDATE :
******************************************************************************/
static krui_err compute_gradient(int start_pattern, int end_pattern,
float delta_max, float * error)
{
int pattern_no, sub_pat_no, i ;
clearDeltas();
/* compute the necessary sub patterns */
KernelErrorCode = kr_initSubPatternOrder(start_pattern,end_pattern);
if(KernelErrorCode != KRERR_NO_ERROR)
return (KernelErrorCode);
*error = 0.0; /* reset network error value */
while(kr_getSubPatternByOrder(&pattern_no,&sub_pat_no)){
propagateNetForward(pattern_no,sub_pat_no); /* Forward propagation */
*error += propagateNetBackwardBatch(pattern_no,sub_pat_no, delta_max,
NULL, 0);
/* -0.5 * gradient's coordinates are stored in value_a's */
}
for (i=0 ; i< scg_space_size ; i++)
*scg_gradient[i] = - 2.0 * *scg_gradient[i];
return (KernelErrorCode);
}
/*****************************************************************************
FUNCTION : TEST_SCG
PURPOSE : tests the network using TEST_backprop.
RETURNS : kernel error code
NOTES : saves the value of NetModified for initialising LEARN_SCG
UPDATE : 18.10.95 by Thomas Gern
******************************************************************************/
krui_err TEST_SCG(int start_pattern, int end_pattern,
float *parameterInArray, int NoOfInParams,
float **parameterOutArray, int *NoOfOutParams)
{
bool OldNetModified;
krui_err ret_val;
OldNetModified = NetModified;
ret_val = TEST_backprop(start_pattern, end_pattern,
parameterInArray, NoOfInParams,
parameterOutArray, NoOfOutParams);
NetModified = OldNetModified;
return ret_val;
}
syntax highlighted by Code2HTML, v. 0.9.1