// Copyright (C) 2004,2005 Michael Creel // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // bfgsmin // // bfgs or limited memory bfgs minimization of function of form // // [value, return_2,..., return_m] = f(arg_1, arg_2,..., arg_n) // // By default, minimization is w.r.t. arg_1, but this can be overridden #include #include #include #include #include #include "error.h" // define argument checks static bool any_bad_argument(const octave_value_list& args) { int i; if (!args(0).is_string()) { error("bfgsmin: first argument must be string holding objective function name"); return true; } if (!args(1).is_cell()) { error("bfgsmin: second argument must cell array of function arguments"); return true; } // controls should be 4 or 5 element cell array of integers if (args.length() > 2) { Cell control (args(2).cell_value()); // get them to look at them if (error_state) { error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); return true; } // there should be 4 or 5 elements, depending on if bfgs or lbfgs if (((control.length() < 4) || (control.length() > 5))) { error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); return true; } // special treatment for 1st element - it can be Inf or an integer if (!xisinf (control(0).double_value())) { int tmp = control(0).int_value(); if (error_state) { error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); return true; } } // now the other 3 or 4 elements for (i=1; i < control.length(); i++) { int tmp = control(i).int_value(); if (error_state) { error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); return true; } if (i == 3) // minarg must point to one of args(1) { if ((tmp > args(1).length())||(tmp < 1)) { error("bfgsmin: 4th element of controls must be a positive integer that indicates \n\ which of the elements of the second argument is the one minimization is over"); return true; } } if (i == 4) // memory must be zero (bfgs) or positive integer (lbfgs) { if (tmp < 0) { error("bfgsmin: 5th argument of controls, if provided\n\ must be 0 (bfgs) or positive integer (memory for lbfgs)"); return true; } } } } // tolerances should be 3 element cell array of doubles if (args.length() > 3) { // type is cell Cell tols (args(3).cell_value()); if (error_state) { error("bfgsmin: 4th argument must be a cell array of 3 positive doubles"); return true; } // there should be 3 elements if (!(tols.length()==3)) { error("bfgsmin: 4th argument, if supplied, must a 3-element cell array of tolerances"); return true; } for (i=0; i<3; i++) { double tmp = tols(i).double_value(); if (error_state) { error("bfgsmin: 4th argument must be a cell array of 3 positive doubles"); return true; } if(tmp < 0) { error("bfgsmin: 4th argument must be a cell array of 3 positive doubles"); return true; } } } return false; } // this is the lbfgs direction, used if control has 5 elements ColumnVector lbfgs_recursion(int memory, Matrix& sigmas, Matrix& gammas, ColumnVector d) { if (memory == 0) { const int n = sigmas.columns(); ColumnVector sig = sigmas.column(n-1); ColumnVector gam = gammas.column(n-1); // do conditioning if there is any memory double cond = gam.transpose()*gam; if (cond > 0) { cond = (sig.transpose()*gam) / cond; d = cond*d; } return d; } else { const int k = d.rows(); const int n = sigmas.columns(); int i, j; ColumnVector sig = sigmas.column(memory-1); ColumnVector gam = gammas.column(memory-1); double rho; rho = 1.0 / (gam.transpose() * sig); double alpha; alpha = rho * (sig.transpose() * d); d = d - alpha*gam; d = lbfgs_recursion(memory - 1, sigmas, gammas, d); d = d + (alpha - rho * gam.transpose() * d) * sig; } return d; } DEFUN_DLD(bfgsmin, args, , "bfgsmin: bfgs minimization of a function with respect to a column vector.\n\ \n\ By default, numeric derivatives are used, but see bfgsmin_example.m\n\ for how to use analytic derivatives\n\ \n\ Usage: [x, obj, convergence] = bfgsmin(\"f\", args, control, tols)\n\ \n\ Arguments:\n\ * \"f\": function name (string)\n\ * args: a cell array that holds all arguments of the function\n\ The argument with respect to which minimization is done\n\ MUST be a COLUMN vector\n\ * control: an optional cell array of 4 or 5 elements\n\ * elem 1: maximum iterations (-1 = infinity (default))\n\ * elem 2: verbosity\n\ 0 = no screen output (default)\n\ 1 = summary every iteration\n\ 2 = only final results\n\ * elem 3: convergence criterion\n\ 1 = strict (function, gradient and param change) (default)\n\ 2 = weak - only function convergence required\n\ * elem 4: arg with respect to which minimization is done\n\ (default = first)\n\ * elem 5: (optimal) Memory limit for lbfgs. If it's a positive integer\n\ then lbfgs will be use. Otherwise ordinary bfgs is used\n\ * tols: an optional cell array of 3 elements\n\ * elem 1: function change tolerance, default 1e-12\n\ * elem 2: parameter change tolerance, default 1e-6\n\ * elem 3: gradient tolerance, default 1e-4\n\ \n\ Returns:\n\ * x: the minimizer\n\ * obj: the value of f() at x\n\ * convergence: 1 if normal conv, other values if not\n\ * iters: number of iterations performed\n\ \n\ Example:\n\ function a = f(x,y)\n\ a = x'*x + y'*y;\n\ endfunction\n\ \n\ In this example, x is optimized since it's the first\n\ element of the cell array, y is a fixed constant = 1\n\ \n\ bfgsmin(\"f\", {ones(2,1), 1}, {10,2,1,1})\n\ \n\ bfgsmin final results: Iteration 1\n\ Stepsize 0.0000000\n\ Objective function value 1.0000000000\n\ Function conv 1 Param conv 1 Gradient conv 1\n\ params gradient change\n\ 0.0000 0.0000 0.0000\n\ 0.0000 0.0000 0.0000\n\ \n\ ans =\n\ \n\ 6.1497e-12\n\ 6.1497e-12\n\ ") { int nargin = args.length(); if ((nargin < 2) || (nargin > 4)) { error("bfgsmin: you must supply 2, 3 or 4 arguments"); return octave_value_list(); } // check the arguments if (any_bad_argument(args)) return octave_value_list(); std::string f (args(0).string_value()); Cell f_args (args(1).cell_value()); octave_value_list step_args(4,1); // for stepsize: {f, f_args, direction, minarg} octave_value_list g_args(3,1); // for numgradient {f, f_args, minarg} octave_value_list c_args(2,1); // for celleval {f, f_args} octave_value_list f_return; // holder for feval returns int max_iters, verbosity, criterion, minarg; int convergence, iter; int memory, gradient_failed, i, j, k, conv_fun, conv_param, conv_grad; int have_gradient = 0; double func_tol, param_tol, gradient_tol, stepsize, obj_value; double obj_in, last_obj_value, denominator, test; Matrix H, tempmatrix, H1, H2; ColumnVector thetain, d, g, g_new, p, q, sig, gam; // Default values for controls max_iters = INT_MAX; // no limit on iterations verbosity = 0; // by default don't report results to screen criterion = 1; // strong convergence required minarg = 1; // by default, first arg is one over which we minimize memory = 0; // use provided controls, if applicable if (args.length() > 2) { Cell control (args(2).cell_value()); if (xisinf (control(0).double_value())) // this is to allow 'Inf' as first element of control max_iters = -1; else max_iters = control(0).int_value(); if (max_iters == -1) max_iters = INT_MAX; verbosity = control(1).int_value(); criterion = control(2).int_value(); minarg = control(3).int_value(); if (control.length() > 4) memory = control(4).int_value(); } // type checking for minimization parameter done here, since we don't know minarg // until now if (!(f_args(minarg - 1).is_real_matrix() || (f_args(minarg - 1).is_real_scalar()))) { error("bfgsmin: minimization must be with respect to a column vector"); return octave_value_list(); } if ((f_args(minarg - 1).is_real_matrix()) && (f_args(minarg - 1).columns() != 1)) { error("bfgsmin: minimization must be with respect to a column vector"); return octave_value_list(); } // use provided tolerances, if applicable if (args.length() == 4) { Cell tols (args(3).cell_value()); func_tol = tols(0).double_value(); param_tol = tols(1).double_value(); gradient_tol = tols(2).double_value(); } else { // Default values for tolerances func_tol = 1e-12; param_tol = 1e-6; gradient_tol = 1e-4; } // arguments for stepsize function step_args(0) = f; step_args(1) = f_args; step_args(3) = minarg; // arguments for numgradient g_args(0) = f; g_args(1) = f_args; g_args(2) = minarg; // arguments for celleval (to calculate objective function value) c_args(0) = f; c_args(1) = f_args; // get the minimization argument ColumnVector theta = f_args(minarg - 1).column_vector_value(); k = theta.rows(); // containers for items in limited memory version Matrix sigmas(k,memory); Matrix gammas(k,memory); // initialize things convergence = -1; // if this doesn't change, it means that maxiters were exceeded thetain = theta; H = identity_matrix(k,k); // Initial obj_value f_return = feval("celleval", c_args); obj_in = f_return(0).double_value(); if (error_state) // check that objective function returns a double { error("bfgsmin: objective function did not return a scalar numeric value"); return octave_value_list(); } last_obj_value = obj_in; // maybe we have analytic gradient? // if the function returns more than one item, and the second // is a kx1 vector, it is assumed to be the gradient if (f_return.length() > 1) { if (f_return(1).is_real_matrix()) { if ((f_return(1).rows() == k) & (f_return(1).columns() == 1)) { g = f_return(1).column_vector_value(); have_gradient = 1; // future reference } } else have_gradient = 0; } if (!have_gradient) // use numeric gradient { f_return = feval("numgradient", g_args); if (error_state) { error("bfgsmin: unable to calculate numeric gradient of objective function: need to recompile numgradient?"); return octave_value_list(); } tempmatrix = f_return(0).matrix_value(); g = tempmatrix.row((octave_idx_type)0).transpose(); } // check that gradient is ok gradient_failed = 0; // test = 1 means gradient failed for (i=0; i 0) // lbfgs { if (iter < memory) d = lbfgs_recursion(iter, sigmas, gammas, g); else d = lbfgs_recursion(memory, sigmas, gammas, g); d = -d; } else d = -H*g; // ordinary bfgs // stepsize: try (l)bfgs direction, then steepest descent if it fails step_args(2) = d; f_args(minarg - 1) = theta; step_args(1) = f_args; f_return = feval("newtonstep", step_args); stepsize = f_return(0).double_value(); obj_value = f_return(1).double_value(); if (stepsize == 0.0) // fall back to steepest descent { d = -g; // try steepest descent step_args(2) = d; f_return = feval("newtonstep", step_args); stepsize = f_return(0).double_value(); obj_value = f_return(1).double_value(); } p = stepsize*d; // check normal convergence: all 3 must be satisfied // function convergence if (fabs(last_obj_value) > 1.0) { conv_fun = (fabs(obj_value - last_obj_value)/fabs(last_obj_value)) < func_tol; } else { conv_fun = fabs(obj_value - last_obj_value) < func_tol; } // parameter change convergence test = sqrt(theta.transpose() * theta); if (test > 1) conv_param = sqrt(p.transpose() * p) / test < param_tol ; else conv_param = sqrt(p.transpose() * p) < param_tol; // gradient convergence conv_grad = sqrt(g.transpose() * g) < gradient_tol; // Want intermediate results? if ((verbosity > 0) && (verbosity < 2)) { printf("\n======================================================\n"); printf("BFGSMIN intermediate results\n"); printf("\n"); if (memory > 0) printf("Using LBFGS, memory is last %d iterations\n",memory); if (have_gradient) printf("Using analytic gradient\n"); else printf("Using numeric gradient\n"); printf("\n"); printf("------------------------------------------------------\n"); printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad); printf("------------------------------------------------------\n"); printf("Objective function value %g\n", last_obj_value); printf("Stepsize %g\n", stepsize); printf("%d iterations\n", iter); printf("------------------------------------------------------\n"); printf("\n param gradient change\n"); for (j = 0; j 0; j--) { for(i = 0; i < k; i++) { sigmas(i,j) = sigmas(i,j-1); gammas(i,j) = gammas(i,j-1); } } // insert new vectors in left-most column for(i = 0; i < k; i++) { sigmas(i, 0) = sig(i); gammas(i, 0) = gam(i); } } else // failed gradient - loose memory and use previous theta { sigmas.fill(0.0); gammas.fill(0.0); theta = theta - p; } } } // Want last iteration results? if (verbosity > 0) { printf("\n======================================================\n"); printf("BFGSMIN final results\n"); printf("\n"); if (memory > 0) printf("Used LBFGS, memory is last %d iterations\n",memory); if (have_gradient) printf("Used analytic gradient\n"); else printf("Used numeric gradient\n"); printf("\n"); printf("------------------------------------------------------\n"); if (convergence == -1) printf("NO CONVERGENCE: max iters exceeded\n"); if ((convergence == 1) & (criterion == 1)) printf("STRONG CONVERGENCE\n"); if ((convergence == 1) & !(criterion == 1)) printf("WEAK CONVERGENCE\n"); if (convergence == 2) printf("NO CONVERGENCE: algorithm failed\n"); printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad); printf("------------------------------------------------------\n"); printf("Objective function value %g\n", last_obj_value); printf("Stepsize %g\n", stepsize); printf("%d iterations\n", iter); printf("------------------------------------------------------\n"); printf("\n param gradient change\n"); for (j = 0; j