one_step_q_learning_worker.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_METHODS_RL_WORKER_ONE_STEP_Q_LEARNING_WORKER_HPP
14 #define MLPACK_METHODS_RL_WORKER_ONE_STEP_Q_LEARNING_WORKER_HPP
15 
17 
18 namespace mlpack {
19 namespace rl {
20 
29 template <
30  typename EnvironmentType,
31  typename NetworkType,
32  typename UpdaterType,
33  typename PolicyType
34 >
35 class OneStepQLearningWorker
36 {
37  public:
38  using StateType = typename EnvironmentType::State;
39  using ActionType = typename EnvironmentType::Action;
40  using TransitionType = std::tuple<StateType, ActionType, double, StateType>;
41 
52  const UpdaterType& updater,
53  const EnvironmentType& environment,
54  const TrainingConfig& config,
55  bool deterministic):
56  updater(updater),
57  #if ENS_VERSION_MAJOR >= 2
58  updatePolicy(NULL),
59  #endif
60  environment(environment),
61  config(config),
62  deterministic(deterministic),
63  pending(config.UpdateInterval())
64  { Reset(); }
65 
72  updater(other.updater),
73  #if ENS_VERSION_MAJOR >= 2
74  updatePolicy(NULL),
75  #endif
76  environment(other.environment),
77  config(other.config),
78  deterministic(other.deterministic),
79  steps(other.steps),
80  episodeReturn(other.episodeReturn),
81  pending(other.pending),
82  pendingIndex(other.pendingIndex),
83  network(other.network),
84  state(other.state)
85  {
86  #if ENS_VERSION_MAJOR >= 2
87  updatePolicy = new typename UpdaterType::template
88  Policy<arma::mat, arma::mat>(updater,
89  network.Parameters().n_rows,
90  network.Parameters().n_cols);
91  #endif
92 
93  Reset();
94  }
95 
102  updater(std::move(other.updater)),
103  #if ENS_VERSION_MAJOR >= 2
104  updatePolicy(NULL),
105  #endif
106  environment(std::move(other.environment)),
107  config(std::move(other.config)),
108  deterministic(std::move(other.deterministic)),
109  steps(std::move(other.steps)),
110  episodeReturn(std::move(other.episodeReturn)),
111  pending(std::move(other.pending)),
112  pendingIndex(std::move(other.pendingIndex)),
113  network(std::move(other.network)),
114  state(std::move(other.state))
115  {
116  #if ENS_VERSION_MAJOR >= 2
117  other.updatePolicy = NULL;
118 
119  updatePolicy = new typename UpdaterType::template
120  Policy<arma::mat, arma::mat>(updater,
121  network.Parameters().n_rows,
122  network.Parameters().n_cols);
123  #endif
124  }
125 
132  {
133  if (&other == this)
134  return *this;
135 
136  #if ENS_VERSION_MAJOR >= 2
137  delete updatePolicy;
138  #endif
139 
140  updater = other.updater;
141  environment = other.environment;
142  config = other.config;
143  deterministic = other.deterministic;
144  steps = other.steps;
145  episodeReturn = other.episodeReturn;
146  pending = other.pending;
147  pendingIndex = other.pendingIndex;
148  network = other.network;
149  state = other.state;
150 
151  #if ENS_VERSION_MAJOR >= 2
152  updatePolicy = new typename UpdaterType::template
153  Policy<arma::mat, arma::mat>(updater,
154  network.Parameters().n_rows,
155  network.Parameters().n_cols);
156  #endif
157 
158  Reset();
159 
160  return *this;
161  }
162 
169  {
170  if (&other == this)
171  return *this;
172 
173  #if ENS_VERSION_MAJOR >= 2
174  delete updatePolicy;
175  #endif
176 
177  updater = std::move(other.updater);
178  environment = std::move(other.environment);
179  config = std::move(other.config);
180  deterministic = std::move(other.deterministic);
181  steps = std::move(other.steps);
182  episodeReturn = std::move(other.episodeReturn);
183  pending = std::move(other.pending);
184  pendingIndex = std::move(other.pendingIndex);
185  network = std::move(other.network);
186  state = std::move(other.state);
187 
188  #if ENS_VERSION_MAJOR >= 2
189  other.updatePolicy = NULL;
190 
191  updatePolicy = new typename UpdaterType::template
192  Policy<arma::mat, arma::mat>(updater,
193  network.Parameters().n_rows,
194  network.Parameters().n_cols);
195  #endif
196 
197  return *this;
198  }
199 
204  {
205  #if ENS_VERSION_MAJOR >= 2
206  delete updatePolicy;
207  #endif
208  }
209 
214  void Initialize(NetworkType& learningNetwork)
215  {
// Set up optimizer state sized for the learning network's parameter
// matrix.  ens 1.x keeps this state inside the updater itself; ens >= 2
// wraps it in a heap-allocated Policy object owned by this worker.
216  #if ENS_VERSION_MAJOR == 1
217  updater.Initialize(learningNetwork.Parameters().n_rows,
218  learningNetwork.Parameters().n_cols);
219  #else
// Drop any previously created policy (e.g. from a constructor) so it
// can be rebuilt for the current parameter dimensions.
220  delete updatePolicy;
221 
222  updatePolicy = new typename UpdaterType::template
223  Policy<arma::mat, arma::mat>(updater,
224  learningNetwork.Parameters().n_rows,
225  learningNetwork.Parameters().n_cols);
226  #endif
227 
228  // Build local network.
229  network = learningNetwork;
230  }
231 
// Perform a single agent step.
//
// In deterministic (evaluation) mode the worker only interacts with the
// environment and reports the episode return; no learning happens.
// Otherwise the transition is buffered in `pending`, and once the buffer
// fills or the episode ends, one-step Q-learning targets are formed from
// the shared target network, the accumulated (clamped) gradient is applied
// asynchronously to the shared learning network, and the local copy is
// re-synced.
//
// @param learningNetwork The shared network that all workers update.
// @param targetNetwork The shared network used to bootstrap targets.
// @param totalSteps Global step counter shared across worker threads.
// @param policy Behavior policy for sampling actions (annealed each step).
// @param totalReward Set to the episode return when the episode ends.
// @return true if the episode terminated during this step.
243  bool Step(NetworkType& learningNetwork,
244  NetworkType& targetNetwork,
245  size_t& totalSteps,
246  PolicyType& policy,
247  double& totalReward)
248  {
249  // Interact with the environment.
250  arma::colvec actionValue;
251  network.Predict(state.Encode(), actionValue);
252  ActionType action = policy.Sample(actionValue, deterministic);
253  StateType nextState;
254  double reward = environment.Sample(state, action, nextState);
255  bool terminal = environment.IsTerminal(nextState);
256 
257  episodeReturn += reward;
258  steps++;
259 
// An episode is also cut off once it reaches the configured step limit.
260  terminal = terminal || steps >= config.StepLimit();
261  if (deterministic)
262  {
263  if (terminal)
264  {
265  totalReward = episodeReturn;
266  Reset();
267  // Sync with latest learning network.
268  network = learningNetwork;
269  return true;
270  }
271  state = nextState;
272  return false;
273  }
274 
// totalSteps is shared between worker threads, hence the atomic increment.
275  #pragma omp atomic
276  totalSteps++;
277 
278  pending[pendingIndex] = std::make_tuple(state, action, reward, nextState);
279  pendingIndex++;
280 
281  if (terminal || pendingIndex >= config.UpdateInterval())
282  {
283  // Initialize the gradient storage.
284  arma::mat totalGradients(learningNetwork.Parameters().n_rows,
285  learningNetwork.Parameters().n_cols, arma::fill::zeros);
// NOTE(review): the loop walks the whole buffer; when the episode ends
// before the buffer is full, entries at and past pendingIndex look like
// stale transitions from an earlier batch — confirm this is intended.
286  for (size_t i = 0; i < pending.size(); ++i)
287  {
288  TransitionType &transition = pending[i];
289 
290  // Compute the target state-action value.
291  arma::colvec actionValue;
// The target network is shared between workers, so reads are serialized.
292  #pragma omp critical
293  {
294  targetNetwork.Predict(
295  std::get<3>(transition).Encode(), actionValue);
296  };
297  double targetActionValue = actionValue.max();
// No bootstrap value for the final transition of a finished episode:
// its target is the raw reward alone.
298  if (terminal && i == pending.size() - 1)
299  targetActionValue = 0;
300  targetActionValue = std::get<2>(transition) +
301  config.Discount() * targetActionValue;
302 
303  // Compute the training target for current state.
304  arma::mat input = std::get<0>(transition).Encode();
305  network.Forward(input, actionValue);
306  actionValue[std::get<1>(transition).action] = targetActionValue;
307 
308  // Compute gradient.
309  arma::mat gradients;
310  network.Backward(input, actionValue, gradients);
311 
312  // Accumulate gradients.
313  totalGradients += gradients;
314  }
315 
316  // Clamp the accumulated gradients.
317  totalGradients.transform(
318  [&](double gradient)
319  { return std::min(std::max(gradient, -config.GradientLimit()),
320  config.GradientLimit()); });
321 
322  // Perform async update of the global network.
323  #if ENS_VERSION_MAJOR == 1
324  updater.Update(learningNetwork.Parameters(), config.StepSize(),
325  totalGradients);
326  #else
327  updatePolicy->Update(learningNetwork.Parameters(),
328  config.StepSize(), totalGradients);
329  #endif
330 
331  // Sync the local network with the global network.
332  network = learningNetwork;
333 
334  pendingIndex = 0;
335  }
336 
337  // Update global target network.
// NOTE(review): totalSteps is incremented concurrently by other workers,
// so a sync-interval boundary can be skipped — presumably acceptable.
338  if (totalSteps % config.TargetNetworkSyncInterval() == 0)
339  {
340  #pragma omp critical
341  { targetNetwork = learningNetwork; }
342  }
343 
344  policy.Anneal();
345 
346  if (terminal)
347  {
348  totalReward = episodeReturn;
349  Reset();
350  return true;
351  }
352  state = nextState;
353  return false;
354  }
355 
356  private:
// Reset the worker for a new episode: zero the step count and return
// accumulator, rewind the pending-transition index (the buffer contents
// themselves are not cleared), and draw a fresh initial state.
360  void Reset()
361  {
362  steps = 0;
363  episodeReturn = 0;
364  pendingIndex = 0;
365  state = environment.InitialSample();
366  }
367 
// Locally-held copy of the optimizer used for the async gradient updates.
369  UpdaterType updater;
// ens >= 2 keeps per-matrix optimizer state in this heap-allocated policy
// object, owned by the worker (rebuilt in Initialize and on copy/move).
370  #if ENS_VERSION_MAJOR >= 2
371  typename UpdaterType::template Policy<arma::mat, arma::mat>* updatePolicy;
372  #endif
373 
// Local instance of the environment this worker interacts with.
375  EnvironmentType environment;
376 
// Hyper-parameters (step limit, update interval, discount, etc.).
378  TrainingConfig config;
379 
// Whether this worker runs in evaluation mode (no learning updates).
381  bool deterministic;
382 
// Steps taken in the current episode.
384  size_t steps;
385 
// Accumulated reward of the current episode.
387  double episodeReturn;
388 
// Buffer of transitions awaiting a gradient update (UpdateInterval long).
390  std::vector<TransitionType> pending;
391 
// Next free slot in the pending buffer.
393  size_t pendingIndex;
394 
// Local copy of the shared learning network.
396  NetworkType network;
397 
// Current state of the environment.
399  StateType state;
400 };
401 
402 } // namespace rl
403 } // namespace mlpack
404 
405 #endif
void Initialize(NetworkType &learningNetwork)
Initialize the worker.
Linear algebra utility functions, generally performed on matrices or vectors.
if(NOT BUILD_GO_SHLIB) macro(add_go_binding name) endmacro() return() endif() endmacro() if(NOT BUILD_GO_BINDINGS) not_found_return("Not building Go bindings.") endif() if(BUILD_GO_BINDINGS) find_package(Go 1.11.0) if(NOT GO_FOUND) set(GO_NOT_FOUND_MSG "$
Definition: CMakeLists.txt:3
Definition: prereqs.hpp:67
std::tuple< StateType, ActionType, double, StateType > TransitionType
n Go endif() find_package(Gonum) if(NOT GONUM_FOUND) set(GO_NOT_FOUND_MSG "$
Definition: CMakeLists.txt:23
size_t StepLimit() const
Get the maximum steps of each episode.
size_t TargetNetworkSyncInterval() const
Get the interval for syncing target network.
OneStepQLearningWorker & operator=(OneStepQLearningWorker &&other)
Take ownership of another OneStepQLearningWorker.
OneStepQLearningWorker(const OneStepQLearningWorker &other)
Copy another OneStepQLearningWorker.
OneStepQLearningWorker(const UpdaterType &updater, const EnvironmentType &environment, const TrainingConfig &config, bool deterministic)
Construct one step Q-Learning worker with the given parameters and environment.
Forward declaration of OneStepQLearningWorker.
size_t UpdateInterval() const
Get the update interval.
double Discount() const
Get the discount rate for future reward.
typename EnvironmentType::Action ActionType
OneStepQLearningWorker(OneStepQLearningWorker &&other)
Take ownership of another OneStepQLearningWorker.
bool Step(NetworkType &learningNetwork, NetworkType &targetNetwork, size_t &totalSteps, PolicyType &policy, double &totalReward)
The agent will execute one step.
OneStepQLearningWorker & operator=(const OneStepQLearningWorker &other)
Copy another OneStepQLearningWorker.
double GradientLimit() const
Get the limit of update gradient.
double StepSize() const
Get the step size of the optimizer.
typename EnvironmentType::State StateType