prioritized_replay.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_METHODS_RL_PRIORITIZED_REPLAY_HPP
13 #define MLPACK_METHODS_RL_PRIORITIZED_REPLAY_HPP
14 
#include <cmath>

#include <mlpack/prereqs.hpp>

#include "sumtree.hpp"
17 
18 namespace mlpack {
19 namespace rl {
20 
38 template <typename EnvironmentType>
40 {
41  public:
43  using ActionType = typename EnvironmentType::Action;
44 
46  using StateType = typename EnvironmentType::State;
47 
52  { /* Nothing to do here. */ }
53 
62  PrioritizedReplay(const size_t batchSize,
63  const size_t capacity,
64  const double alpha,
65  const size_t dimension = StateType::dimension) :
66  batchSize(batchSize),
67  capacity(capacity),
68  position(0),
69  states(dimension, capacity),
70  actions(capacity),
71  rewards(capacity),
72  nextStates(dimension, capacity),
73  isTerminal(capacity),
74  full(false),
75  alpha(alpha),
76  maxPriority(1.0),
77  initialBeta(0.6),
78  replayBetaIters(10000)
79  {
80  size_t size = 1;
81  while (size < capacity)
82  {
83  size *= 2;
84  }
85 
86  beta = initialBeta;
87  idxSum = SumTree<double>(size);
88  }
89 
99  void Store(const StateType& state,
100  ActionType action,
101  double reward,
102  const StateType& nextState,
103  bool isEnd)
104  {
105  states.col(position) = state.Encode();
106  actions(position) = action;
107  rewards(position) = reward;
108  nextStates.col(position) = nextState.Encode();
109  isTerminal(position) = isEnd;
110 
111  idxSum.Set(position, maxPriority * alpha);
112 
113  position++;
114  if (position == capacity)
115  {
116  full = true;
117  position = 0;
118  }
119  }
120 
126  arma::ucolvec SampleProportional()
127  {
128  arma::ucolvec idxes(batchSize);
129  double totalSum = idxSum.Sum(0, (full ? capacity : position));
130  double sumPerRange = totalSum / batchSize;
131  for (size_t bt = 0; bt < batchSize; bt++)
132  {
133  const double mass = arma::randu() * sumPerRange + bt * sumPerRange;
134  idxes(bt) = idxSum.FindPrefixSum(mass);
135  }
136  return idxes;
137  }
138 
149  void Sample(arma::mat& sampledStates,
150  arma::icolvec& sampledActions,
151  arma::colvec& sampledRewards,
152  arma::mat& sampledNextStates,
153  arma::icolvec& isTerminal)
154  {
155  sampledIndices = SampleProportional();
156  BetaAnneal();
157 
158  sampledStates = states.cols(sampledIndices);
159  sampledActions = actions.elem(sampledIndices);
160  sampledRewards = rewards.elem(sampledIndices);
161  sampledNextStates = nextStates.cols(sampledIndices);
162  isTerminal = this->isTerminal.elem(sampledIndices);
163 
164  // Calculate the weights of sampled transitions.
165 
166  size_t numSample = full ? capacity : position;
167  weights = arma::rowvec(sampledIndices.n_rows);
168 
169  for (size_t i = 0; i < sampledIndices.n_rows; i++)
170  {
171  double p_sample = idxSum.Get(sampledIndices(i)) / idxSum.Sum();
172  weights(i) = pow(numSample * p_sample, -beta);
173  }
174  weights /= weights.max();
175  }
176 
183  void UpdatePriorities(arma::ucolvec& indices, arma::colvec& priorities)
184  {
185  arma::colvec alphaPri = alpha * priorities;
186  maxPriority = std::max(maxPriority, arma::max(priorities));
187  idxSum.BatchUpdate(indices, alphaPri);
188  }
189 
195  const size_t& Size()
196  {
197  return full ? capacity : position;
198  }
199 
203  void BetaAnneal()
204  {
205  beta = beta + (1 - initialBeta) * 1.0 / replayBetaIters;
206  }
207 
216  void Update(arma::mat target,
217  arma::icolvec sampledActions,
218  arma::mat nextActionValues,
219  arma::mat& gradients)
220  {
221  arma::colvec tdError(target.n_cols);
222  for (size_t i = 0; i < target.n_cols; i ++)
223  {
224  tdError(i) = nextActionValues(sampledActions(i), i) -
225  target(sampledActions(i), i);
226  }
227  tdError = arma::abs(tdError);
228  UpdatePriorities(sampledIndices, tdError);
229 
230  // Update the gradient
231  gradients = arma::mean(weights) * gradients;
232  }
233 
234 
235  private:
237  size_t batchSize;
238 
240  size_t capacity;
241 
243  size_t position;
244 
246  arma::mat states;
247 
249  arma::icolvec actions;
250 
252  arma::colvec rewards;
253 
255  arma::mat nextStates;
256 
258  arma::icolvec isTerminal;
259 
261  bool full;
262 
265  double alpha;
266 
268  double maxPriority;
269 
271  double initialBeta;
272 
274  double beta;
275 
277  size_t replayBetaIters;
278 
280  SumTree<double> idxSum;
281 
283  arma::ucolvec sampledIndices;
284 
286  arma::rowvec weights;
287 };
288 
289 } // namespace rl
290 } // namespace mlpack
291 
292 #endif
void BetaAnneal()
Annealing the beta.
strip_type.hpp
Definition: add_to_po.hpp:21
void Sample(arma::mat &sampledStates, arma::icolvec &sampledActions, arma::colvec &sampledRewards, arma::mat &sampledNextStates, arma::icolvec &isTerminal)
Sample some experience according to their priorities.
void Store(const StateType &state, ActionType action, double reward, const StateType &nextState, bool isEnd)
Store the given experience and set the priorities for the given experience.
The core includes that mlpack expects; standard C++ includes and Armadillo.
T Get(size_t idx)
Get the data array with idx.
Definition: sumtree.hpp:93
void BatchUpdate(const arma::ucolvec &indices, const arma::Col< T > &data)
Update the data in batch, rather than looping over the indices with the set method.
Definition: sumtree.hpp:75
arma::ucolvec SampleProportional()
Sample some experience according to their priorities.
Implementation of prioritized experience replay.
size_t FindPrefixSum(T mass)
Find the highest index idx in the array such that sum(arr[0] + arr[1] + ... + arr[idx - 1]) <= mass.
Definition: sumtree.hpp:163
T Sum(const size_t start, size_t end)
Calculate the sum of contiguous subsequence of the array.
Definition: sumtree.hpp:143
const size_t & Size()
Get the number of transitions in the memory.
PrioritizedReplay()
Default constructor.
typename EnvironmentType::Action ActionType
Convenient typedef for action.
void Set(size_t idx, const T value)
Set the data array with idx.
Definition: sumtree.hpp:57
void UpdatePriorities(arma::ucolvec &indices, arma::colvec &priorities)
Update priorities of sampled transitions.
PrioritizedReplay(const size_t batchSize, const size_t capacity, const double alpha, const size_t dimension=StateType::dimension)
Construct an instance of prioritized experience replay class.
typename EnvironmentType::State StateType
Convenient typedef for state.
void Update(arma::mat target, arma::icolvec sampledActions, arma::mat nextActionValues, arma::mat &gradients)
Update the priorities of transitions and Update the gradients.