information_gain.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_METHODS_DECISION_TREE_INFORMATION_GAIN_HPP
14 #define MLPACK_METHODS_DECISION_TREE_INFORMATION_GAIN_HPP
15 
16 #include <mlpack/prereqs.hpp>
17 
18 namespace mlpack {
19 namespace tree {
20 
26 {
27  public:
31  template<bool UseWeights, typename CountType>
32  static double EvaluatePtr(const CountType* counts,
33  const size_t countLength,
34  const CountType totalCount)
35  {
36  double gain = 0.0;
37 
38  for (size_t i = 0; i < countLength; ++i)
39  {
40  const double f = ((double) counts[i] / (double) totalCount);
41  if (f > 0.0)
42  gain += f * std::log2(f);
43  }
44 
45  return gain;
46  }
47 
  /**
   * Given a set of labels, calculate the information gain of those labels:
   * sum over classes of f * log2(f), where f is the (possibly weighted)
   * fraction of points in that class.  The result is in [-log2(numClasses),
   * 0], with 0 meaning a pure set.
   *
   * When UseWeights is true, each point's contribution to its class count is
   * its weight; otherwise every point counts as 1.  The loops are manually
   * unrolled by 4 (four independent accumulator vectors aliasing one buffer)
   * so the compiler can vectorize them.
   *
   * @tparam UseWeights Whether to weight each point by the weights row.
   * @param labels Class label (< numClasses) of each point.
   * @param numClasses Number of possible classes.
   * @param weights Per-point weights; only read when UseWeights is true.
   *     NOTE(review): assumed nonnegative — negative weights could make the
   *     class fractions fall outside [0, 1]; confirm with callers.
   * @return The information gain (a value <= 0).
   */
  template<bool UseWeights>
  static double Evaluate(const arma::Row<size_t>& labels,
                         const size_t numClasses,
                         const arma::Row<double>& weights)
  {
    // Edge case: if there are no elements, the gain is zero.
    if (labels.n_elem == 0)
      return 0.0;

    // Calculate the information gain.
    double gain = 0.0;

    // Count the number of elements in each class.  Use four auxiliary vectors
    // to exploit SIMD instructions if possible.  counts..counts4 are
    // non-owning views into quarters of countSpace, so they stay contiguous.
    arma::vec countSpace(4 * numClasses, arma::fill::zeros);
    arma::vec counts(countSpace.memptr(), numClasses, false, true);
    arma::vec counts2(countSpace.memptr() + numClasses, numClasses, false,
        true);
    arma::vec counts3(countSpace.memptr() + 2 * numClasses, numClasses, false,
        true);
    arma::vec counts4(countSpace.memptr() + 3 * numClasses, numClasses, false,
        true);

    if (UseWeights)
    {
      // Sum all the weights up.  Four partial sums, one per unrolled lane.
      double accWeights[4] = { 0.0, 0.0, 0.0, 0.0 };

      // SIMD loop: add counts for four elements simultaneously (if the
      // compiler manages to vectorize the loop).  Starting at i = 3 and
      // stepping by 4 processes elements [0, 4*floor(n/4)).
      for (size_t i = 3; i < labels.n_elem; i += 4)
      {
        const double weight1 = weights[i - 3];
        const double weight2 = weights[i - 2];
        const double weight3 = weights[i - 1];
        const double weight4 = weights[i];

        counts[labels[i - 3]] += weight1;
        counts2[labels[i - 2]] += weight2;
        counts3[labels[i - 1]] += weight3;
        counts4[labels[i]] += weight4;

        accWeights[0] += weight1;
        accWeights[1] += weight2;
        accWeights[2] += weight3;
        accWeights[3] += weight4;
      }

      // Handle leftovers: the 1-3 trailing elements the unrolled loop missed.
      if (labels.n_elem % 4 == 1)
      {
        const double weight1 = weights[labels.n_elem - 1];
        counts[labels[labels.n_elem - 1]] += weight1;
        accWeights[0] += weight1;
      }
      else if (labels.n_elem % 4 == 2)
      {
        const double weight1 = weights[labels.n_elem - 2];
        const double weight2 = weights[labels.n_elem - 1];

        counts[labels[labels.n_elem - 2]] += weight1;
        counts2[labels[labels.n_elem - 1]] += weight2;

        accWeights[0] += weight1;
        accWeights[1] += weight2;
      }
      else if (labels.n_elem % 4 == 3)
      {
        const double weight1 = weights[labels.n_elem - 3];
        const double weight2 = weights[labels.n_elem - 2];
        const double weight3 = weights[labels.n_elem - 1];

        counts[labels[labels.n_elem - 3]] += weight1;
        counts2[labels[labels.n_elem - 2]] += weight2;
        counts3[labels[labels.n_elem - 1]] += weight3;

        accWeights[0] += weight1;
        accWeights[1] += weight2;
        accWeights[2] += weight3;
      }

      // Reduce the four lanes into lane 0 / the first counts vector.
      accWeights[0] += accWeights[1] + accWeights[2] + accWeights[3];
      counts += counts2 + counts3 + counts4;

      // Corner case: return 0 if no weight.
      if (accWeights[0] == 0.0)
        return 0.0;

      // Accumulate f * log2(f) over classes with nonzero weight (the f > 0
      // check also skips empty classes, since lim_{f->0} f*log2(f) = 0).
      for (size_t i = 0; i < numClasses; ++i)
      {
        const double f = ((double) counts[i] / (double) accWeights[0]);
        if (f > 0.0)
          gain += f * std::log2(f);
      }
    }
    else
    {
      // SIMD loop: add counts for four elements simultaneously (if the
      // compiler manages to vectorize the loop).
      for (size_t i = 3; i < labels.n_elem; i += 4)
      {
        counts[labels[i - 3]]++;
        counts2[labels[i - 2]]++;
        counts3[labels[i - 1]]++;
        counts4[labels[i]]++;
      }

      // Handle leftovers: the 1-3 trailing elements the unrolled loop missed.
      if (labels.n_elem % 4 == 1)
      {
        counts[labels[labels.n_elem - 1]]++;
      }
      else if (labels.n_elem % 4 == 2)
      {
        counts[labels[labels.n_elem - 2]]++;
        counts2[labels[labels.n_elem - 1]]++;
      }
      else if (labels.n_elem % 4 == 3)
      {
        counts[labels[labels.n_elem - 3]]++;
        counts2[labels[labels.n_elem - 2]]++;
        counts3[labels[labels.n_elem - 1]]++;
      }

      // Reduce the four lanes into the first counts vector.
      counts += counts2 + counts3 + counts4;

      // Accumulate f * log2(f); the f > 0 check skips empty classes.
      for (size_t i = 0; i < numClasses; ++i)
      {
        const double f = ((double) counts[i] / (double) labels.n_elem);
        if (f > 0.0)
          gain += f * std::log2(f);
      }
    }

    return gain;
  }
194 
202  static double Range(const size_t numClasses)
203  {
204  // The best possible case gives an information gain of 0. The worst
205  // possible case is even distribution, which gives n * (1/n * log2(1/n)) =
206  // log2(1/n) = -log2(n). So, the range is log2(n).
207  return std::log2(numClasses);
208  }
209 };
210 
211 } // namespace tree
212 } // namespace mlpack
213 
214 #endif
strip_type.hpp
Definition: add_to_po.hpp:21
The core includes that mlpack expects; standard C++ includes and Armadillo.
static double Evaluate(const arma::Row< size_t > &labels, const size_t numClasses, const arma::Row< double > &weights)
Given a set of labels, calculate the information gain of those labels.
The standard information gain criterion, used for calculating gain in decision trees.
static double Range(const size_t numClasses)
Return the range of the information gain for the given number of classes.
static double EvaluatePtr(const CountType *counts, const size_t countLength, const CountType totalCount)
Evaluate the information gain given a vector of class weight counts.