// Copyright (C) 2004,2005 Michael Creel <michael.creel@uab.es>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// bfgsmin
//
// bfgs or limited memory bfgs minimization of function of form
//
// [value, return_2,..., return_m] = f(arg_1, arg_2,..., arg_n)
//
// By default, minimization is w.r.t. arg_1, but this can be overridden
#include <oct.h>
#include <octave/parse.h>
#include <octave/Cell.h>
#include <octave/lo-mappers.h>
#include <float.h>
#include "error.h"
// define argument checks
static bool
any_bad_argument(const octave_value_list& args)
{
int i;
if (!args(0).is_string())
{
error("bfgsmin: first argument must be string holding objective function name");
return true;
}
if (!args(1).is_cell())
{
error("bfgsmin: second argument must cell array of function arguments");
return true;
}
// controls should be 4 or 5 element cell array of integers
if (args.length() > 2)
{
Cell control (args(2).cell_value()); // get them to look at them
if (error_state)
{
error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers");
return true;
}
// there should be 4 or 5 elements, depending on if bfgs or lbfgs
if (((control.length() < 4) || (control.length() > 5)))
{
error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers");
return true;
}
// special treatment for 1st element - it can be Inf or an integer
if (!xisinf (control(0).double_value()))
{
int tmp = control(0).int_value();
if (error_state)
{
error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers");
return true;
}
}
// now the other 3 or 4 elements
for (i=1; i < control.length(); i++)
{
int tmp = control(i).int_value();
if (error_state)
{
error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers");
return true;
}
if (i == 3) // minarg must point to one of args(1)
{
if ((tmp > args(1).length())||(tmp < 1))
{
error("bfgsmin: 4th element of controls must be a positive integer that indicates \n\
which of the elements of the second argument is the one minimization is over");
return true;
}
}
if (i == 4) // memory must be zero (bfgs) or positive integer (lbfgs)
{
if (tmp < 0)
{
error("bfgsmin: 5th argument of controls, if provided\n\
must be 0 (bfgs) or positive integer (memory for lbfgs)");
return true;
}
}
}
}
// tolerances should be 3 element cell array of doubles
if (args.length() > 3)
{
// type is cell
Cell tols (args(3).cell_value());
if (error_state)
{
error("bfgsmin: 4th argument must be a cell array of 3 positive doubles");
return true;
}
// there should be 3 elements
if (!(tols.length()==3))
{
error("bfgsmin: 4th argument, if supplied, must a 3-element cell array of tolerances");
return true;
}
for (i=0; i<3; i++)
{
double tmp = tols(i).double_value();
if (error_state)
{
error("bfgsmin: 4th argument must be a cell array of 3 positive doubles");
return true;
}
if(tmp < 0)
{
error("bfgsmin: 4th argument must be a cell array of 3 positive doubles");
return true;
}
}
}
return false;
}
// this is the lbfgs direction, used if control has 5 elements
ColumnVector lbfgs_recursion(int memory, Matrix& sigmas, Matrix& gammas, ColumnVector d)
{
if (memory == 0)
{
const int n = sigmas.columns();
ColumnVector sig = sigmas.column(n-1);
ColumnVector gam = gammas.column(n-1);
// do conditioning if there is any memory
double cond = gam.transpose()*gam;
if (cond > 0)
{
cond = (sig.transpose()*gam) / cond;
d = cond*d;
}
return d;
}
else
{
const int k = d.rows();
const int n = sigmas.columns();
int i, j;
ColumnVector sig = sigmas.column(memory-1);
ColumnVector gam = gammas.column(memory-1);
double rho;
rho = 1.0 / (gam.transpose() * sig);
double alpha;
alpha = rho * (sig.transpose() * d);
d = d - alpha*gam;
d = lbfgs_recursion(memory - 1, sigmas, gammas, d);
d = d + (alpha - rho * gam.transpose() * d) * sig;
}
return d;
}
DEFUN_DLD(bfgsmin, args, ,
"bfgsmin: bfgs minimization of a function with respect to a column vector.\n\
\n\
By default, numeric derivatives are used, but see bfgsmin_example.m\n\
for how to use analytic derivatives\n\
\n\
Usage: [x, obj, convergence] = bfgsmin(\"f\", args, control, tols)\n\
\n\
Arguments:\n\
* \"f\": function name (string)\n\
* args: a cell array that holds all arguments of the function\n\
The argument with respect to which minimization is done\n\
MUST be a COLUMN vector\n\
* control: an optional cell array of 4 or 5 elements\n\
* elem 1: maximum iterations (-1 = infinity (default))\n\
* elem 2: verbosity\n\
0 = no screen output (default)\n\
1 = summary every iteration\n\
2 = only final results\n\
* elem 3: convergence criterion\n\
1 = strict (function, gradient and param change) (default)\n\
2 = weak - only function convergence required\n\
* elem 4: arg with respect to which minimization is done\n\
(default = first)\n\
* elem 5: (optimal) Memory limit for lbfgs. If it's a positive integer\n\
then lbfgs will be use. Otherwise ordinary bfgs is used\n\
* tols: an optional cell array of 3 elements\n\
* elem 1: function change tolerance, default 1e-12\n\
* elem 2: parameter change tolerance, default 1e-6\n\
* elem 3: gradient tolerance, default 1e-4\n\
\n\
Returns:\n\
* x: the minimizer\n\
* obj: the value of f() at x\n\
* convergence: 1 if normal conv, other values if not\n\
* iters: number of iterations performed\n\
\n\
Example:\n\
function a = f(x,y)\n\
a = x'*x + y'*y;\n\
endfunction\n\
\n\
In this example, x is optimized since it's the first\n\
element of the cell array, y is a fixed constant = 1\n\
\n\
bfgsmin(\"f\", {ones(2,1), 1}, {10,2,1,1})\n\
\n\
bfgsmin final results: Iteration 1\n\
Stepsize 0.0000000\n\
Objective function value 1.0000000000\n\
Function conv 1 Param conv 1 Gradient conv 1\n\
params gradient change\n\
0.0000 0.0000 0.0000\n\
0.0000 0.0000 0.0000\n\
\n\
ans =\n\
\n\
6.1497e-12\n\
6.1497e-12\n\
")
{
int nargin = args.length();
if ((nargin < 2) || (nargin > 4))
{
error("bfgsmin: you must supply 2, 3 or 4 arguments");
return octave_value_list();
}
// check the arguments
if (any_bad_argument(args)) return octave_value_list();
std::string f (args(0).string_value());
Cell f_args (args(1).cell_value());
octave_value_list step_args(4,1); // for stepsize: {f, f_args, direction, minarg}
octave_value_list g_args(3,1); // for numgradient {f, f_args, minarg}
octave_value_list c_args(2,1); // for celleval {f, f_args}
octave_value_list f_return; // holder for feval returns
int max_iters, verbosity, criterion, minarg;
int convergence, iter;
int memory, gradient_failed, i, j, k, conv_fun, conv_param, conv_grad;
int have_gradient = 0;
double func_tol, param_tol, gradient_tol, stepsize, obj_value;
double obj_in, last_obj_value, denominator, test;
Matrix H, tempmatrix, H1, H2;
ColumnVector thetain, d, g, g_new, p, q, sig, gam;
// Default values for controls
max_iters = INT_MAX; // no limit on iterations
verbosity = 0; // by default don't report results to screen
criterion = 1; // strong convergence required
minarg = 1; // by default, first arg is one over which we minimize
memory = 0;
// use provided controls, if applicable
if (args.length() > 2)
{
Cell control (args(2).cell_value());
if (xisinf (control(0).double_value())) // this is to allow 'Inf' as first element of control
max_iters = -1;
else max_iters = control(0).int_value();
if (max_iters == -1) max_iters = INT_MAX;
verbosity = control(1).int_value();
criterion = control(2).int_value();
minarg = control(3).int_value();
if (control.length() > 4) memory = control(4).int_value();
}
// type checking for minimization parameter done here, since we don't know minarg
// until now
if (!(f_args(minarg - 1).is_real_matrix() || (f_args(minarg - 1).is_real_scalar())))
{
error("bfgsmin: minimization must be with respect to a column vector");
return octave_value_list();
}
if ((f_args(minarg - 1).is_real_matrix()) && (f_args(minarg - 1).columns() != 1))
{
error("bfgsmin: minimization must be with respect to a column vector");
return octave_value_list();
}
// use provided tolerances, if applicable
if (args.length() == 4)
{
Cell tols (args(3).cell_value());
func_tol = tols(0).double_value();
param_tol = tols(1).double_value();
gradient_tol = tols(2).double_value();
}
else
{
// Default values for tolerances
func_tol = 1e-12;
param_tol = 1e-6;
gradient_tol = 1e-4;
}
// arguments for stepsize function
step_args(0) = f;
step_args(1) = f_args;
step_args(3) = minarg;
// arguments for numgradient
g_args(0) = f;
g_args(1) = f_args;
g_args(2) = minarg;
// arguments for celleval (to calculate objective function value)
c_args(0) = f;
c_args(1) = f_args;
// get the minimization argument
ColumnVector theta = f_args(minarg - 1).column_vector_value();
k = theta.rows();
// containers for items in limited memory version
Matrix sigmas(k,memory);
Matrix gammas(k,memory);
// initialize things
convergence = -1; // if this doesn't change, it means that maxiters were exceeded
thetain = theta;
H = identity_matrix(k,k);
// Initial obj_value
f_return = feval("celleval", c_args);
obj_in = f_return(0).double_value();
if (error_state) // check that objective function returns a double
{
error("bfgsmin: objective function did not return a scalar numeric value");
return octave_value_list();
}
last_obj_value = obj_in;
// maybe we have analytic gradient?
// if the function returns more than one item, and the second
// is a kx1 vector, it is assumed to be the gradient
if (f_return.length() > 1)
{
if (f_return(1).is_real_matrix())
{
if ((f_return(1).rows() == k) & (f_return(1).columns() == 1))
{
g = f_return(1).column_vector_value();
have_gradient = 1; // future reference
}
}
else have_gradient = 0;
}
if (!have_gradient) // use numeric gradient
{
f_return = feval("numgradient", g_args);
if (error_state)
{
error("bfgsmin: unable to calculate numeric gradient of objective function: need to recompile numgradient?");
return octave_value_list();
}
tempmatrix = f_return(0).matrix_value();
g = tempmatrix.row((octave_idx_type)0).transpose();
}
// check that gradient is ok
gradient_failed = 0; // test = 1 means gradient failed
for (i=0; i<k;i++)
{
gradient_failed = gradient_failed + xisnan(g(i));
}
if (gradient_failed)
{
error("bfgsmin: Initial gradient could not be calculated: exiting");
convergence = 2;
}
// MAIN LOOP STARTS HERE
for (iter = 0; iter < max_iters; iter++)
{
// make sure the messages aren't stale
conv_fun = -1;
conv_param = -1;
conv_grad = -1;
if(memory > 0) // lbfgs
{
if (iter < memory) d = lbfgs_recursion(iter, sigmas, gammas, g);
else d = lbfgs_recursion(memory, sigmas, gammas, g);
d = -d;
}
else d = -H*g; // ordinary bfgs
// stepsize: try (l)bfgs direction, then steepest descent if it fails
step_args(2) = d;
f_args(minarg - 1) = theta;
step_args(1) = f_args;
f_return = feval("newtonstep", step_args);
stepsize = f_return(0).double_value();
obj_value = f_return(1).double_value();
if (stepsize == 0.0) // fall back to steepest descent
{
d = -g; // try steepest descent
step_args(2) = d;
f_return = feval("newtonstep", step_args);
stepsize = f_return(0).double_value();
obj_value = f_return(1).double_value();
}
p = stepsize*d;
// check normal convergence: all 3 must be satisfied
// function convergence
if (fabs(last_obj_value) > 1.0)
{
conv_fun = (fabs(obj_value - last_obj_value)/fabs(last_obj_value)) < func_tol;
}
else
{
conv_fun = fabs(obj_value - last_obj_value) < func_tol;
}
// parameter change convergence
test = sqrt(theta.transpose() * theta);
if (test > 1) conv_param = sqrt(p.transpose() * p) / test < param_tol ;
else conv_param = sqrt(p.transpose() * p) < param_tol;
// gradient convergence
conv_grad = sqrt(g.transpose() * g) < gradient_tol;
// Want intermediate results?
if ((verbosity > 0) && (verbosity < 2))
{
printf("\n======================================================\n");
printf("BFGSMIN intermediate results\n");
printf("\n");
if (memory > 0) printf("Using LBFGS, memory is last %d iterations\n",memory);
if (have_gradient) printf("Using analytic gradient\n");
else printf("Using numeric gradient\n");
printf("\n");
printf("------------------------------------------------------\n");
printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad);
printf("------------------------------------------------------\n");
printf("Objective function value %g\n", last_obj_value);
printf("Stepsize %g\n", stepsize);
printf("%d iterations\n", iter);
printf("------------------------------------------------------\n");
printf("\n param gradient change\n");
for (j = 0; j<k; j++) printf("%8.4f %8.4f %8.4f\n",theta(j),g(j),p(j));
}
// Are we done?
if (criterion == 1)
{
if (conv_fun && conv_param && conv_grad)
{
convergence = 1;
break;
}
}
else if (conv_fun)
{
convergence = 1;
break;
}
last_obj_value = obj_value;
theta = theta + p;
// new gradient
f_args(minarg - 1) = theta;
if (have_gradient)
{
c_args(1) = f_args;
f_return = feval("celleval", c_args);
g_new = f_return(1).column_vector_value();
}
else // use numeric gradient
{
g_args(1) = f_args;
f_return = feval("numgradient", g_args);
tempmatrix = f_return(0).matrix_value();
g_new = tempmatrix.row((octave_idx_type)0).transpose();
}
// Check that gradient is ok
gradient_failed = 0; // test = 1 means gradient failed
for (i=0; i<k;i++) gradient_failed = gradient_failed + xisnan(g_new(i));
if (memory == 0) //use bfgs
{
// Hessian update if gradient ok
if (!gradient_failed)
{
q = g_new-g;
g = g_new;
denominator = q.transpose()*p;
if ((fabs(denominator) < DBL_EPSILON)) // reset Hessian if necessary
{
if (verbosity == 1) printf("bfgsmin: Hessian reset\n");
H = identity_matrix(k,k);
}
else
{
H1 = (1.0+(q.transpose() * H * q) / denominator) / denominator \
* (p * p.transpose());
H2 = (p * q.transpose() * H + H*q*p.transpose());
H2 = H2 / denominator;
H = H + H1 - H2;
}
}
else H = identity_matrix(k,k); // reset hessian if gradient fails
// then try to start again with steepest descent
}
else // use lbfgs
{
// save components for Hessian if gradient ok
if (!gradient_failed)
{
sig = p; // change in parameter
gam = g_new - g; // change in gradient
g = g_new;
// shift remembered vectors to the right (forget last)
for(j = memory - 1; j > 0; j--)
{
for(i = 0; i < k; i++)
{
sigmas(i,j) = sigmas(i,j-1);
gammas(i,j) = gammas(i,j-1);
}
}
// insert new vectors in left-most column
for(i = 0; i < k; i++)
{
sigmas(i, 0) = sig(i);
gammas(i, 0) = gam(i);
}
}
else // failed gradient - loose memory and use previous theta
{
sigmas.fill(0.0);
gammas.fill(0.0);
theta = theta - p;
}
}
}
// Want last iteration results?
if (verbosity > 0)
{
printf("\n======================================================\n");
printf("BFGSMIN final results\n");
printf("\n");
if (memory > 0) printf("Used LBFGS, memory is last %d iterations\n",memory);
if (have_gradient) printf("Used analytic gradient\n");
else printf("Used numeric gradient\n");
printf("\n");
printf("------------------------------------------------------\n");
if (convergence == -1) printf("NO CONVERGENCE: max iters exceeded\n");
if ((convergence == 1) & (criterion == 1)) printf("STRONG CONVERGENCE\n");
if ((convergence == 1) & !(criterion == 1)) printf("WEAK CONVERGENCE\n");
if (convergence == 2) printf("NO CONVERGENCE: algorithm failed\n");
printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad);
printf("------------------------------------------------------\n");
printf("Objective function value %g\n", last_obj_value);
printf("Stepsize %g\n", stepsize);
printf("%d iterations\n", iter);
printf("------------------------------------------------------\n");
printf("\n param gradient change\n");
for (j = 0; j<k; j++) printf("%8.4f %8.4f %8.4f\n",theta(j),g(j),p(j));
}
f_return(0) = theta;
f_return(1) = obj_value;
f_return(2) = convergence;
f_return(3) = iter;
return octave_value_list(f_return);
}
syntax highlighted by Code2HTML, v. 0.9.1