// -*- Mode: c++ -*- // $Id: policy.cpp,v 1.7 2005/09/19 12:48:03 berniw Exp $ // copyright (c) 2004 by Christos Dimitrakakis /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * ***************************************************************************/ #include #include #include #include #ifdef WIN32 #include #define isnan _isnan #endif // WIN32 #undef POLICY_LOG #ifndef POLICY_LOG #undef logmsg #define logmsg empty_log #endif void empty_log(char* s, ...) { } /// \brief Create a new discrete policy. /// \arg n_states Number of states for the agent /// \arg n_actions Number of actions. /// \arg alpha Learning rate. /// \arg gamma Discount parameter. /// \arg lambda Eligibility trace decay. /// \arg softmax Use softmax if true (can be overridden later) /// \arg randomness Amount of randomness. /// \arg init_eval Initial evaluation of actions. DiscretePolicy::DiscretePolicy (int n_states, int n_actions, real alpha, real gamma, real lambda, bool softmax, real randomness, real init_eval) { if (lambda<0.0f) lambda = 0.0f; if (lambda>0.99f) lambda = 0.99f; if (gamma<0.0f) gamma = 0.0f; if (gamma>0.99f) gamma = 0.99f; if (alpha<0.0f) alpha = 0.0f; if (alpha>1.0f) alpha = 1.0f; this->n_states = n_states; this->n_actions = n_actions; this->gamma = gamma; this->lambda = lambda; this->alpha = alpha; smax = softmax; temp = randomness; //logmsg ("RR:%f", temp); if (smax) { if (temp<0.1f) temp = 0.1f; } else { if (temp<0.0f) { temp = 0.0f; } if (temp>1.0f) { temp = 1.0f; } } learning_method = Sarsa; logmsg ("#Making Sarsa(lambda) "); if (smax) { logmsg ("#softmax"); } else { logmsg ("#e-greedy"); } logmsg (" policy with Q:[%d x %d] -> R, a:%f g:%f, l:%f, t:%f\n", this->n_states, this->n_actions, this->alpha, this->gamma, this->lambda, this->temp); P = new real* [n_states]; Q = new real* [n_states]; e = new real* [n_states]; vQ = new real* [n_states]; for (int s=0; s=n_states)) { return 0; } if ((ps>=0)&&(pa>=0)) { expected_r += r; expected_V += Q[ps][pa]; n_samples++; if (s==0) { real max_estimate = 0.0; real max_estimate_k = 0.0; for (int i=0; i=n_actions) { fprintf (stderr, "Action %d out of bounds.. ", a); a = (int) floor (urandom()*((real) n_actions)); fprintf (stderr, "mapping to %d\n", a); } real EQ_s = 0.0; int i; switch (learning_method) { case Sarsa: amax = a; EQ_s = Q[s][amax]; break; case QLearning: amax = argmax; EQ_s = Q[s][amax]; break; case ELearning: amax = a; //? correct ? Normalise(eval, eval, n_actions); EQ_s = 0.0; for (i=0; i=0)&&(pa>=0)) { // do not update at start of episode real delta = r + gamma*EQ_s - Q[ps][pa]; tdError = delta; if (replacing_traces) { e[ps][pa] = 1.0; } else { e[ps][pa] += 1.0; } real ad = alpha*delta; real gl = gamma * lambda; real variance_threshold = 0.0001f; if (confidence_eligibility == false) { vQ[ps][pa] = (1.0 - zeta)*vQ[ps][pa] + zeta*(ad*ad); if (vQ[ps][pa]max_el_state) max_el_state = ps; for (i=0; i0.01) { Q[i][j] += ad * e[i][j]; if (confidence_eligibility == true) { real zeta_el = zeta * e[i][j]; vQ[i][j] = (1.0 - zeta_el)*vQ[i][j] + zeta_el*(ad*ad); if (vQ[i][j]1000.0)||(isnan(Q[i][j]))) { printf ("u: %d %d %f %f\n", i,j,Q[i][j], ad * e[i][j]); } //This is only needed for Qlearning, but sarsa is not //affected since always amax==a; if (amax==a) { e[i][j] *= gl; } else { e[i][j] = 0.0; } } else { e[i][j] = 0.0; el = false; } } if (el==false) { if (min_el_state==i) min_el_state++; } else { max_el_state = i; } } } //printf ("%d %d #STATE\n", min_el_state, max_el_state); // printf ("Q[%d,%d]=%f r=%f e=%f ad=%f gl=%f #QV\n", // ps, pa, Q[ps][pa], r, e[ps][pa], ad, gl); ps = s; pa = a; return a; } /// Use at the end of every episode, after agent has entered the /// absorbing state. void DiscretePolicy::Reset () { for (int s=0; s100.0)||(isnan(Q[i][j]))) { printf ("l: %d %d %f\n", i,j,Q[i][j]); Q[i][j] = 0.0; } } } for (i=0; i100.0)||(isnan(Q[i][j]))) { printf ("s: %d %d %f\n", i,j,Q[i][j]); } } } fwrite((void *) close_tag, sizeof (char), strlen (start_tag)+1, fh); fclose (fh); } /// \brief Set to use confidence estimates for action selection, with /// variance smoothing zeta. /// Variance smoothing currently uses a very simple method to estimate /// the variance. bool DiscretePolicy::useConfidenceEstimates (bool confidence, real zeta, bool confidence_eligibility) { this->confidence = confidence; this->zeta = zeta; this->confidence_eligibility = confidence_eligibility; if (confidence_eligibility) { logmsg ("#+[ELIG_VAR]"); } if (confidence) { logmsg ("#+[CONDIFENCE]"); } else { logmsg ("#-[CONDIFENCE]\n"); } return confidence; } /// Set the algorithm to QLearning mode void DiscretePolicy::setQLearning() { learning_method = QLearning; logmsg ("#[Q-learning]\n"); } /// Set the algorithm to ELearning mode void DiscretePolicy::setELearning() { learning_method = ELearning; logmsg ("#[E-learning]\n"); } /// \brief Set the algorithm to SARSA mode. /// A unified framework for action selection. void DiscretePolicy::setSarsa() { learning_method = Sarsa; logmsg ("#[Sarsa]\n"); } /// Use Pursuit for action selection. void DiscretePolicy::setPursuit(bool pursuit) { this->pursuit = pursuit; if (pursuit) { logmsg ("#+[PURSUIT]\n"); } else { logmsg ("#-[PURSUIT]\n"); } } /// Use Pursuit for action selection. void DiscretePolicy::setReplacingTraces (bool replacing) { this->replacing_traces = replacing; if (replacing) { logmsg ("#[REPLACING TRACES]\n"); } else { logmsg ("#[ACCUMULATING TRACES]\n"); } } /// Set forced learning (force-feed actions) void DiscretePolicy::setForcedLearning(bool forced) { forced_learning = forced; } /// Set randomness for action selection. Does not affect confidence mode. void DiscretePolicy::setRandomness (real epsilon) { temp = epsilon; if (smax) { if (temp<0.01) { smax = false; } } } /// Set the gamma of the sum to be maximised. void DiscretePolicy::setGamma (real gamma) { this->gamma = gamma; } /// Set action selection to softmax. void DiscretePolicy::useSoftmax (bool softmax) { smax = softmax; if (smax) { logmsg ("#+[SMAX]\n"); } else { logmsg ("#-[SMAX]\n"); } } /// Use the reliability estimate method for action selection. void DiscretePolicy::useReliabilityEstimate (bool ri) { reliability_estimate = ri; if (ri) { logmsg("#+[RI]\n"); } else { logmsg("#-[RI]\n"); } } /// Set the distribution for direct action sampling. void DiscretePolicy::setConfidenceDistribution (enum ConfidenceDistribution cd) { switch (cd) { case SINGULAR: logmsg("#[SINGULAR CONFIDENCE]\n"); break; case BOUNDED: logmsg("#[BOUNDED CONFIDENCE]\n"); break; case GAUSSIAN: logmsg("#[GAUSSIAN CONFIDENCE]\n"); break; case LAPLACIAN: logmsg("#[LAPLACIAN CONFIDENCE]\n"); break; default: Serror ("Unknown type %d\n", cd); } confidence_distribution = cd; } /// \brief Add Gibbs sampling for confidences. /// This can be used in conjuction with any confidence distribution, /// however it mostly makes sense for SINGULAR. void DiscretePolicy::useGibbsConfidence (bool gibbs) { if (gibbs) { logmsg ("#+[GIBBS CONFIDENCE]\n"); } else { logmsg ("#-[GIBBS CONFIDENCE]\n"); } this->confidence_uses_gibbs = gibbs; } // ---------- action selection helpers ------------- int DiscretePolicy::confMax(real* Qs, real* vQs, real p) { real sum=0.0; int a; #if 0 for (a=0; a