Changing the precision of the linear algebra library from float to double.

(also moving the instance transpose into GradientEvaluator, so it is
computed once at construction and shared by minimizers)
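
With this change, the squared-error gradient X^T (X w - y) / n computed in
ParallelBoostingWithMomentum::SparseInnerProductGradient reads the cached
GradientEvaluator::instances_transposed() instead of a per-minimizer copy.
Double precision also allows the stricter default convergence_threshold_ of
1e-7 (previously 1e-5). A minimal usage sketch against the updated types
(toy data and illustrative parameter values, not part of this patch):

    #include <vector>
    #include "lossmin/eigen-types.h"
    #include "lossmin/losses/inner-product-loss-function.h"
    #include "lossmin/minimizers/gradient-evaluator.h"
    #include "lossmin/minimizers/parallel-boosting-with-momentum.h"

    int main() {
      // Two toy examples with two features; every scalar is now a double.
      lossmin::InstanceSet instances(2, 2);  // SparseMatrix<double, RowMajor>
      instances.insert(0, 0) = 1.0;
      instances.insert(1, 1) = 2.0;
      lossmin::LabelSet labels(2, 1);  // RowMatrixXd
      labels << 1.0, 2.0;

      lossmin::LinearRegressionLossFunction loss_function;
      // The transpose of 'instances' is computed once, at construction.
      lossmin::GradientEvaluator evaluator(instances, labels, &loss_function);

      lossmin::ParallelBoostingWithMomentum minimizer(/*l1=*/0.0, /*l2=*/0.0,
                                                      evaluator);
      lossmin::Weights weights = lossmin::Weights::Zero(2);  // VectorXd
      std::vector<double> loss;  // was std::vector<float>
      return minimizer.Run(/*max_epochs=*/100, &weights, &loss) ? 0 : 1;
    }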

Change-Id: I99a71e41f682ac6033d25a038f78190588d00684
diff --git a/lossmin/eigen-types.h b/lossmin/eigen-types.h
index 47df398..6cfd72a 100644
--- a/lossmin/eigen-types.h
+++ b/lossmin/eigen-types.h
@@ -12,14 +12,17 @@
 // Arrays.
 typedef Eigen::Array<bool, Eigen::Dynamic, 1> ArrayXb;
 typedef Eigen::ArrayXf ArrayXf;
+typedef Eigen::ArrayXd ArrayXd;
 typedef Eigen::ArrayXi ArrayXi;
 
 // Vectors.
 typedef Eigen::VectorXf VectorXf;
+typedef Eigen::VectorXd VectorXd;
 typedef Eigen::VectorXi VectorXi;
 
 // Sparse Vectors.
 typedef Eigen::SparseVector<float> SparseVectorXf;
+typedef Eigen::SparseVector<double> SparseVectorXd;
 
 // Matrix.
 typedef Eigen::Matrix<
@@ -32,20 +35,25 @@
     Eigen::Dynamic,
     Eigen::Dynamic,
     Eigen::RowMajor> RowMatrixXf;
-
-// Sparse Matrix.
-typedef Eigen::SparseMatrix<float> SparseMatrixXf;
+typedef Eigen::Matrix<
+    double,
+    Eigen::Dynamic,
+    Eigen::Dynamic,
+    Eigen::ColMajor> MatrixXd;
+typedef Eigen::Matrix<
+    double,
+    Eigen::Dynamic,
+    Eigen::Dynamic,
+    Eigen::RowMajor> RowMatrixXd;
 
 // Instances and parameters.
-typedef VectorXf Weights;
+typedef VectorXd Weights;
 
-typedef VectorXf Label;
-typedef RowMatrixXf LabelSet;
+typedef VectorXd Label;
+typedef RowMatrixXd LabelSet;
 
-typedef Eigen::SparseVector<float> Instance;
-typedef Eigen::SparseMatrix<float, Eigen::RowMajor> InstanceSet;
-typedef Eigen::SparseMatrix<float, Eigen::ColMajor> SparseMatrixXf;
-typedef Eigen::SparseVector<float> SparseVectorXf;
+typedef Eigen::SparseVector<double> Instance;
+typedef Eigen::SparseMatrix<double, Eigen::RowMajor> InstanceSet;
+typedef Eigen::SparseMatrix<double, Eigen::ColMajor> SparseMatrixXd;
 
 }  // namespace lossmin
-
diff --git a/lossmin/losses/inner-product-loss-function.cc b/lossmin/losses/inner-product-loss-function.cc
index 2d24293..7011143 100644
--- a/lossmin/losses/inner-product-loss-function.cc
+++ b/lossmin/losses/inner-product-loss-function.cc
@@ -8,37 +8,37 @@
 
 namespace lossmin {
 
-float InnerProductLossFunction::LossCurvature(
+double InnerProductLossFunction::LossCurvature(
     const InstanceSet &instances) const {
-  float data_curvature = (instances.cwiseProduct(instances) *
+  double data_curvature = (instances.cwiseProduct(instances) *
                           Weights::Ones(instances.cols())).maxCoeff();
   return curvature_ * data_curvature;
 }
 
 void InnerProductLossFunction::PerCoordinateCurvature(
-    const InstanceSet &instances, VectorXf *per_coordinate_curvature) const {
+    const InstanceSet &instances, VectorXd *per_coordinate_curvature) const {
   *per_coordinate_curvature =
-      VectorXf::Ones(instances.rows()).transpose() *
+      VectorXd::Ones(instances.rows()).transpose() *
             instances.cwiseProduct(instances) / instances.rows();
   *per_coordinate_curvature *= curvature_;
 }
 
-float InnerProductLossFunction::ExampleLoss(
+double InnerProductLossFunction::ExampleLoss(
     const Weights &weights, const InstanceSet &instances,
     const LabelSet &labels, int example) const {
-  float inner_product = instances.innerVector(example).dot(weights);
+  double inner_product = instances.innerVector(example).dot(weights);
   return InnerProductExampleLoss(inner_product, labels.coeff(example, 0));
 }
 
 void InnerProductLossFunction::AddExampleGradient(
     const Weights &weights, const InstanceSet &instances,
-    const LabelSet &labels, int example, float weights_scale,
-    float example_scale, Weights *gradient) const {
-  float inner_product = instances.innerVector(example).dot(weights);
-  if (weights_scale != 1.0f) inner_product *= weights_scale;
-  float inner_product_gradient =
+    const LabelSet &labels, int example, double weights_scale,
+    double example_scale, Weights *gradient) const {
+  double inner_product = instances.innerVector(example).dot(weights);
+  if (weights_scale != 1.0) inner_product *= weights_scale;
+  double inner_product_gradient =
       InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
-  if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+  if (example_scale != 1.0) inner_product_gradient *= example_scale;
 
   if (synchronous_update()) {
     std::lock_guard<std::mutex> lock(gradient_update_mutex_);
@@ -54,14 +54,14 @@
 
 void InnerProductLossFunction::ExampleGradient(
     const Weights &weights, const InstanceSet &instances,
-    const LabelSet &labels, int example, float weights_scale,
-    float example_scale,
-    std::vector<std::pair<int, float>> *example_gradient) const {
-  float inner_product = instances.innerVector(example).dot(weights);
-  if (weights_scale != 1.0f) inner_product *= weights_scale;
-  float inner_product_gradient =
+    const LabelSet &labels, int example, double weights_scale,
+    double example_scale,
+    std::vector<std::pair<int, double>> *example_gradient) const {
+  double inner_product = instances.innerVector(example).dot(weights);
+  if (weights_scale != 1.0) inner_product *= weights_scale;
+  double inner_product_gradient =
       InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
-  if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+  if (example_scale != 1.0) inner_product_gradient *= example_scale;
 
   example_gradient->resize(instances.row(example).nonZeros());
   int i = 0;
@@ -76,11 +76,11 @@
     const Weights &weights, const InstanceSet &instances,
     LabelSet *labels) const {
   // Compute inner products.
-  VectorXf inner_products = instances * weights;
+  VectorXd inner_products = instances * weights;
 
   // Assign labels by calling InnerProductPredictLabel coefficientwise on
   // 'inner_products'.
-  std::function<float(float)> assign_label_ptr =
+  std::function<double(double)> assign_label_ptr =
       std::bind(&InnerProductLossFunction::InnerProductPredictLabel,
                 this, std::placeholders::_1);
   *labels = inner_products.unaryExpr(assign_label_ptr);
diff --git a/lossmin/losses/inner-product-loss-function.h b/lossmin/losses/inner-product-loss-function.h
index d00cef2..284125d 100644
--- a/lossmin/losses/inner-product-loss-function.h
+++ b/lossmin/losses/inner-product-loss-function.h
@@ -30,51 +30,51 @@
 class InnerProductLossFunction : public LossFunction {
  public:
   // Returns the loss for a single example.
-  float ExampleLoss(
+  double ExampleLoss(
       const Weights &weights, const InstanceSet &instances,
       const LabelSet &labels, int example) const override;
 
   // Adds the gradient of a single example to 'gradient'.
   void AddExampleGradient(
       const Weights &weights, const InstanceSet &instances,
-      const LabelSet &labels, int example, float weights_scale,
-      float example_scale, Weights *gradient) const override;
+      const LabelSet &labels, int example, double weights_scale,
+      double example_scale, Weights *gradient) const override;
 
   // Returns the gradient of a single example.
   void ExampleGradient(
       const Weights &weights, const InstanceSet &instances,
-      const LabelSet &labels, int example, float weights_scale,
-      float example_scale,
-      std::vector<std::pair<int, float>> *example_gradient) const override;
+      const LabelSet &labels, int example, double weights_scale,
+      double example_scale,
+      std::vector<std::pair<int, double>> *example_gradient) const override;
 
   // Assigns labels to 'instances' given 'weights'.
   void PredictLabels(const Weights &weights, const InstanceSet &instances,
                      LabelSet *labels) const override;
 
   // Returns an upper bound on the loss curvature.
-  float LossCurvature(const InstanceSet &instances) const override;
+  double LossCurvature(const InstanceSet &instances) const override;
 
   // Returns an upper bound on the per-coordinate curvature.
   void PerCoordinateCurvature(
       const InstanceSet &instances,
-      VectorXf *per_coordinate_curvature) const override;
+      VectorXd *per_coordinate_curvature) const override;
 
-  virtual float InnerProductExampleLoss(float inner_product, float label)
+  virtual double InnerProductExampleLoss(double inner_product, double label)
       const = 0;
 
-  virtual float InnerProductExampleGradient(float inner_product, float label)
+  virtual double InnerProductExampleGradient(double inner_product, double label)
       const = 0;
 
-  virtual float InnerProductPredictLabel(float inner_product) const = 0;
+  virtual double InnerProductPredictLabel(double inner_product) const = 0;
 
   // Returns 'curvature_'.
-  virtual float InnerProductCurvature(float inner_product, float label) const {
+  virtual double InnerProductCurvature(double inner_product, double label) const {
     return curvature_;
   }
 
  protected:
   // Sets the upper bound on the curvature of the loss function.
-  void set_curvature(float curvature) { curvature_ = curvature; }
+  void set_curvature(double curvature) { curvature_ = curvature; }
 
  private:
   // Mutex for synchronous updates of the gradient vector.
@@ -83,7 +83,7 @@
   // Upper bound on the absolute value of the second derivative of the loss:
   // |d^2 loss(x) / dx^2| <= curvature_, where 'x' is the inner product
   //  <instance, weights>. Should be set by derived classes.
-  float curvature_;
+  double curvature_;
 };
 
 // Linear regression with squared error loss.
@@ -92,21 +92,21 @@
   LinearRegressionLossFunction() { set_curvature(1.0); }
 
   // Returns the squared error loss.
-  float InnerProductExampleLoss(float inner_product, float label)
+  double InnerProductExampleLoss(double inner_product, double label)
       const override {
     return 0.5 * (inner_product - label) * (inner_product - label);
   }
 
   // Returns the gradient of the squared error loss wrt 'inner_product'.
-  float InnerProductExampleGradient(float inner_product, float label)
+  double InnerProductExampleGradient(double inner_product, double label)
       const override {
     return inner_product - label;
   }
 
   // Assigns a label given 'inner_product'.
-  float InnerProductPredictLabel(float inner_product) const override {
+  double InnerProductPredictLabel(double inner_product) const override {
     return inner_product;
   }
 };
 
-}  // namespace lossmin
\ No newline at end of file
+}  // namespace lossmin
diff --git a/lossmin/losses/loss-function.cc b/lossmin/losses/loss-function.cc
index ca11228..f5bb3bf 100644
--- a/lossmin/losses/loss-function.cc
+++ b/lossmin/losses/loss-function.cc
@@ -9,10 +9,10 @@
 
 namespace lossmin {
 
-float LossFunction::BatchLoss(
+double LossFunction::BatchLoss(
     const Weights &weights, const InstanceSet &instances,
     const LabelSet &labels) const {
-  float loss = 0.0f;
+  double loss = 0.0;
   for (int i = 0; i < instances.rows(); ++i) {
     loss += ExampleLoss(weights, instances, labels, i);
   }
@@ -23,7 +23,7 @@
     const Weights &weights, const InstanceSet &instances,
     const LabelSet &labels, Weights *gradient) const {
   for (int i = 0; i < instances.rows(); ++i) {
-    AddExampleGradient(weights, instances, labels, i, 1.0f, 1.0f, gradient);
+    AddExampleGradient(weights, instances, labels, i, 1.0, 1.0, gradient);
   }
 }
 
diff --git a/lossmin/losses/loss-function.h b/lossmin/losses/loss-function.h
index b02aff0..42a28d5 100644
--- a/lossmin/losses/loss-function.h
+++ b/lossmin/losses/loss-function.h
@@ -35,7 +35,7 @@
 
   // Returns the loss for a single example (row 'example' of 'instances' and
   // 'labels').
-  virtual float ExampleLoss(
+  virtual double ExampleLoss(
       const Weights &weights, const InstanceSet &instances,
       const LabelSet &labels, int example) const = 0;
 
@@ -47,15 +47,15 @@
   // protected by a mutex in the implementation.
   virtual void AddExampleGradient(
       const Weights &weights, const InstanceSet &instances,
-      const LabelSet &labels, int example, float weights_scale,
-      float example_scale, Weights *gradient) const = 0;
+      const LabelSet &labels, int example, double weights_scale,
+      double example_scale, Weights *gradient) const = 0;
 
   // Returns the gradient of the loss for a single example. Used in AdaGrad.
   virtual void ExampleGradient(
       const Weights &weights, const InstanceSet &instances,
-      const LabelSet &labels, int example, float weights_scale,
-      float example_scale,
-      std::vector<std::pair<int, float>> *example_gradient) const = 0;
+      const LabelSet &labels, int example, double weights_scale,
+      double example_scale,
+      std::vector<std::pair<int, double>> *example_gradient) const = 0;
 
   // Predicts 'labels' for 'instances' given 'weights'.
   virtual void PredictLabels(
@@ -66,14 +66,14 @@
   // Hessian matrix). Required by DeterministicGradientDescent. Optionally
   // required by StochasticVarianceReducedGradient (for default learning rate)
   // and StochasticGradientDescent for CURVATURE_BASED learning rate scheduling.
-  virtual float LossCurvature(const InstanceSet &instances) const = 0;
+  virtual double LossCurvature(const InstanceSet &instances) const = 0;
 
   // Returns an upper bound on the curvature of the loss along each coordinate
   // (max absolute value of the second derivative) of the data. Required by
   // ParallelBoostingWithMomentum.
   virtual void PerCoordinateCurvature(
       const InstanceSet &instances,
-      VectorXf *per_coordinate_curvature) const = 0;
+      VectorXd *per_coordinate_curvature) const = 0;
 
   // Initializes parameters to a suggested setting for this loss if appropriate.
   virtual void Init(Weights *weights) const {}
@@ -84,7 +84,7 @@
 
   // Returns the total loss for a set of examples. Default implementation runs
   // through the examples and calls ExampleLoss on each.
-  virtual float BatchLoss(const Weights &weights, const InstanceSet &instances,
+  virtual double BatchLoss(const Weights &weights, const InstanceSet &instances,
                           const LabelSet &labels) const;
 
   // Returns the total gradient for a set of examples. Default implementation
@@ -108,5 +108,4 @@
   // DISALLOW_COPY_AND_ASSIGN(LossFunction);
 };
 
-
 }  // namespace lossmin
diff --git a/lossmin/minimizers/gradient-evaluator.cc b/lossmin/minimizers/gradient-evaluator.cc
index 32fafbc..0261b76 100644
--- a/lossmin/minimizers/gradient-evaluator.cc
+++ b/lossmin/minimizers/gradient-evaluator.cc
@@ -9,23 +9,23 @@
 
 namespace lossmin {
 
-float GradientEvaluator::Loss(const Weights &weights) const {
+double GradientEvaluator::Loss(const Weights &weights) const {
   // TODO(azani): Implement multi-threaded version.
   return loss_function_->BatchLoss(weights, instances_, labels_) /
-      NumExamples();
+         NumExamples();
 }
 
-float GradientEvaluator::Loss(
-    const Weights &weights, const InstanceSet &validation_instances,
-    const LabelSet &validation_labels) const {
-  return loss_function_->BatchLoss(
-      weights, validation_instances, validation_labels) /
-      validation_labels.rows();
+double GradientEvaluator::Loss(const Weights &weights,
+                               const InstanceSet &validation_instances,
+                               const LabelSet &validation_labels) const {
+  return loss_function_->BatchLoss(weights, validation_instances,
+                                   validation_labels) /
+         validation_labels.rows();
 }
 
-float GradientEvaluator::Loss(
-    const Weights &weights, const std::vector<int> &example_indices) const {
-  float loss = 0.0f;
+double GradientEvaluator::Loss(const Weights &weights,
+                               const std::vector<int> &example_indices) const {
+  double loss = 0.0;
   for (int example : example_indices) {
     loss += loss_function_->ExampleLoss(weights, instances_, labels_, example);
   }
@@ -34,7 +34,7 @@
 
 void GradientEvaluator::Gradient(const Weights &weights,
                                  Weights *gradient) const {
-  //DCHECK(gradient != nullptr);
+  // DCHECK(gradient != nullptr);
   // TODO(azani): Implement multi-threaded version.
   loss_function_->BatchGradient(weights, instances_, labels_, gradient);
   *gradient /= NumExamples();
diff --git a/lossmin/minimizers/gradient-evaluator.h b/lossmin/minimizers/gradient-evaluator.h
index d756759..4ae1748 100644
--- a/lossmin/minimizers/gradient-evaluator.h
+++ b/lossmin/minimizers/gradient-evaluator.h
@@ -27,25 +27,27 @@
   // Constructor sets up the dataset and the loss function.
   GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
                     const LossFunction *loss_function)
-      : instances_(instances), labels_(labels),
+      : instances_(instances),
+        instances_transposed_(instances.transpose()),
+        labels_(labels),
         loss_function_(loss_function) {}
 
   virtual ~GradientEvaluator() {}
 
   // Returns the loss for given parameters 'weights'. Multi-threading is used
   // if num_threads_ > 1.
-  virtual float Loss(const Weights &weights) const;
+  virtual double Loss(const Weights &weights) const;
 
   // Returns the loss for given parameters 'weights' and a subset of examples
   // 'example_indices'.
-  virtual float Loss(const Weights &weights,
-                     const std::vector<int> &example_indices) const;
+  virtual double Loss(const Weights &weights,
+                      const std::vector<int> &example_indices) const;
 
   // Returns the loss for given parameters 'weights' and a different
   // dataset (typically used for validation).
-  virtual float Loss(
-      const Weights &weights, const InstanceSet &validation_instances,
-      const LabelSet &validation_labels) const;
+  virtual double Loss(const Weights &weights,
+                      const InstanceSet &validation_instances,
+                      const LabelSet &validation_labels) const;
 
   // Computes the gradient wrt the given parameters 'weights'. 'gradient' is
   // owned by the caller and should be initialized to zero.
@@ -57,24 +59,23 @@
 
   // Adds the gradient wrt 'weights_scale * weights' for 'example' to the vector
   // 'gradient' in place. The gradient is scaled by 'example_scale'.
-  virtual void AddExampleGradient(
-      const Weights &weights, int example, float weights_scale,
-      float example_scale, Weights *gradient) const {
-    loss_function_->AddExampleGradient(
-        weights, instances_, labels_, example, weights_scale, example_scale,
-        gradient);
+  virtual void AddExampleGradient(const Weights &weights, int example,
+                                  double weights_scale, double example_scale,
+                                  Weights *gradient) const {
+    loss_function_->AddExampleGradient(weights, instances_, labels_, example,
+                                       weights_scale, example_scale, gradient);
   }
 
-  // Returns the gradient wrt 'weights' as a vector<pair<int, float>> rather
-  // than Eigen::SparseVector<float>, since Eigen is very inefficient with
+  // Returns the gradient wrt 'weights' as a vector<pair<int, double>> rather
+  // than Eigen::SparseVector<double>, since Eigen is very inefficient with
   // sparse vectors. This is only necessary if running SGDAdaGrad.
   virtual void ExampleGradient(
-      const Weights &weights, int example, float weights_scale,
-      float example_scale,
-      std::vector<std::pair<int, float>> *example_gradient) const {
-    loss_function_->ExampleGradient(
-        weights, instances_, labels_, example, weights_scale, example_scale,
-        example_gradient);
+      const Weights &weights, int example, double weights_scale,
+      double example_scale,
+      std::vector<std::pair<int, double>> *example_gradient) const {
+    loss_function_->ExampleGradient(weights, instances_, labels_, example,
+                                    weights_scale, example_scale,
+                                    example_gradient);
   }
 
   // Returns the number of examples in the dataset.
@@ -90,26 +91,26 @@
 
   // Returns an upper bound on the curvature of the loss function. Used to set
   // the learning rate of some LossMinimizer algorithms.
-  virtual float LossCurvature() const {
+  virtual double LossCurvature() const {
     return loss_function_->LossCurvature(instances_);
   }
 
   // Returns the per-coordinate curvature of the data. Used to set the learning
   // rates of ParallelBoostingWithMomentum.
   virtual void PerCoordinateCurvature(
-      VectorXf *per_coordinate_curvature) const {
+      VectorXd *per_coordinate_curvature) const {
     loss_function_->PerCoordinateCurvature(instances_,
                                            per_coordinate_curvature);
   }
 
   // Returns sparsity, defined as the maximum instance l0 norm. Used to help
   // set learning rates in ParallelBoostingWithMomentum.
-  float Sparsity() const {
+  double Sparsity() const {
     typename Instance::Index sparsity = 0;
     for (int i = 0; i < instances_.rows(); ++i) {
       sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
     }
-    return static_cast<float>(sparsity);
+    return static_cast<double>(sparsity);
   }
 
   // Returns the loss function.
@@ -118,6 +119,11 @@
   // Returns the instances.
   const InstanceSet &instances() const { return instances_; }
 
+  // Returns the transpose of instances.
+  const InstanceSet &instances_transposed() const {
+    return instances_transposed_;
+  }
+
   // Returns the labels.
   const LabelSet &labels() const { return labels_; }
 
@@ -125,6 +131,11 @@
   // Training instances.
   const InstanceSet &instances_;
 
+  // The transpose of instances. It is needed for fast gradient computations
+  // and should be computed only once, so it is computed at construction
+  // (not each time the gradient is computed).
+  const InstanceSet instances_transposed_;
+
   // Instance labels.
   const LabelSet &labels_;
 
@@ -134,4 +145,3 @@
 };
 
 }  // namespace lossmin
-
diff --git a/lossmin/minimizers/loss-minimizer.cc b/lossmin/minimizers/loss-minimizer.cc
index 45ea3d6..a4dc58f 100644
--- a/lossmin/minimizers/loss-minimizer.cc
+++ b/lossmin/minimizers/loss-minimizer.cc
@@ -11,7 +11,7 @@
 namespace lossmin {
 
 bool LossMinimizer::Run(int max_epochs, int loss_epochs, int convergence_epochs,
-                        Weights *weights, std::vector<float> *loss) {
+                        Weights *weights, std::vector<double> *loss) {
   // Run for up to 'max_epochs' epochs.
   int epoch;
   for (epoch = 0; epoch < max_epochs; ++epoch) {
@@ -57,11 +57,11 @@
   }
 }
 
-void LossMinimizer::SimpleConvergenceCheck(const std::vector<float> &loss) {
+void LossMinimizer::SimpleConvergenceCheck(const std::vector<double> &loss) {
   // Check convergence by verifying that the max relative loss decrease
   // (loss[t-1] - loss[t]) / loss[t-1] is below 'simple_convergence_threshold_'.
   if (loss.size() > num_convergence_epochs_) {
-    float loss_difference = 0.0f;
+    double loss_difference = 0.0;
     for (int i = loss.size() - num_convergence_epochs_; i < loss.size(); ++i) {
       if (loss[i - 1] > 0) {
         loss_difference = std::max(loss_difference, 1 - loss[i] / loss[i - 1]);
diff --git a/lossmin/minimizers/loss-minimizer.h b/lossmin/minimizers/loss-minimizer.h
index eb87947..4347b48 100644
--- a/lossmin/minimizers/loss-minimizer.h
+++ b/lossmin/minimizers/loss-minimizer.h
@@ -17,15 +17,15 @@
 //    regularization parameters. StochasticGradientDescent requires an
 //    additional parameter for learning rate scheduling.
 //
-//    float l1 = ...
-//    float l2 = ...
+//    double l1 = ...
+//    double l2 = ...
 //    DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
 //
 // 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
 //    loss values across epochs, and 'weights' contains the best parameters.
 //
 //    Weights weights = Weights::Zero(num_features);  // or other initialization
-//    vector<float> loss;
+//    vector<double> loss;
 //    int max_epochs = 100;
 //    bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
 //
@@ -60,7 +60,7 @@
  public:
   // Constructor sets the l1 and l2 regularization parameters and
   // 'gradient_evaluator_'.
-  LossMinimizer(float l1, float l2, const GradientEvaluator &gradient_evaluator)
+  LossMinimizer(double l1, double l2, const GradientEvaluator &gradient_evaluator)
       : l1_(l1),
         l2_(l2),
         gradient_evaluator_(gradient_evaluator),
@@ -79,11 +79,11 @@
   // last epoch. 'loss' is filled with training loss values produced every
   // 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
   bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
-           Weights *weights, std::vector<float> *loss);
+           Weights *weights, std::vector<double> *loss);
 
   // Convenience Run method that evaluates the loss and checks for convergence
   // at every iteration.
-  bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
+  bool Run(int max_epochs, Weights *weights, std::vector<double> *loss) {
     return Run(max_epochs, 1, 1, weights, loss);
   }
 
@@ -93,8 +93,8 @@
 
   // Returns the total loss for given parameters 'weights', including l1 and l2
   // regularization.
-  virtual float Loss(const Weights &weights) const {
-    float loss = gradient_evaluator_.Loss(weights);
+  virtual double Loss(const Weights &weights) const {
+    double loss = gradient_evaluator_.Loss(weights);
     if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
     if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
     return loss;
@@ -109,7 +109,7 @@
   // Checks convergence based on the decrease in loss over the last
   // 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
   // set to true.
-  void SimpleConvergenceCheck(const std::vector<float> &loss);
+  void SimpleConvergenceCheck(const std::vector<double> &loss);
 
   // Setters and getters for convergence criteria parameters.
   bool converged() const { return converged_; }
@@ -121,21 +121,21 @@
   void set_use_simple_convergence_check(bool use_simple_convergence_check) {
     use_simple_convergence_check_ = use_simple_convergence_check;
   }
-  float convergence_threshold() const { return convergence_threshold_; }
-  void set_convergence_threshold(float convergence_threshold) {
+  double convergence_threshold() const { return convergence_threshold_; }
+  void set_convergence_threshold(double convergence_threshold) {
     convergence_threshold_ = convergence_threshold;
   }
-  float simple_convergence_threshold() const {
+  double simple_convergence_threshold() const {
     return simple_convergence_threshold_;
   }
-  void set_simple_convergence_threshold(float simple_convergence_threshold) {
+  void set_simple_convergence_threshold(double simple_convergence_threshold) {
     simple_convergence_threshold_ = simple_convergence_threshold;
   }
   void set_num_convergence_epochs(int num_convergence_epochs) {
     num_convergence_epochs_ = num_convergence_epochs;
   }
-  float zero_threshold() const { return zero_threshold_; }
-  void set_zero_threshold(float zero_threshold) {
+  double zero_threshold() const { return zero_threshold_; }
+  void set_zero_threshold(double zero_threshold) {
     zero_threshold_ = zero_threshold;
   }
 
@@ -145,18 +145,18 @@
   }
 
   // Getter/setter of the l1 regularization parameter.
-  float l1() const { return l1_; }
-  void set_l1(float l1) { l1_ = l1; }
+  double l1() const { return l1_; }
+  void set_l1(double l1) { l1_ = l1; }
 
   // Getter/setter of the l2 regularization parameter.
-  float l2() const { return l2_; }
-  void set_l2(float l2) { l2_ = l2; }
+  double l2() const { return l2_; }
+  void set_l2(double l2) { l2_ = l2; }
 
   // Returns the number of iterations the last time Run() was executed.
   int num_epochs_run() const { return num_epochs_run_; }
 
   // Applies L1Prox coefficientwise to 'weights' and 'threshold'.
-  static void L1Prox(float threshold, Weights *weights) {
+  static void L1Prox(double threshold, Weights *weights) {
     for (int i = 0; i < weights->size(); ++i) {
       weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
     }
@@ -164,35 +164,35 @@
 
   // Applies L1Prox coefficientwise to 'weights' and 'threshold', where
   // 'threshold' is a vector of per-coordinate thresholds.
-  static void L1Prox(const VectorXf &threshold, Weights *weights) {
+  static void L1Prox(const VectorXd &threshold, Weights *weights) {
     for (int i = 0; i < weights->size(); ++i) {
       weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
     }
   }
 
   // Returns sign('x') * max(0.0, abs('x') - 'threshold').
-  static inline float L1Prox(float x, float threshold) {
-    return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
+  static inline double L1Prox(double x, double threshold) {
+    return Sign(x) * std::max(std::abs(x) - threshold, 0.0);
   }
 
   // Returns sign('x').
-  static inline float Sign(float x) {
-    if (x > 0.0f) return 1.0f;
-    if (x < 0.0f) return -1.0f;
-    return 0.0f;
+  static inline double Sign(double x) {
+    if (x > 0.0) return 1.0;
+    if (x < 0.0) return -1.0;
+    return 0.0;
   }
 
  private:
   // Regularization parameters.
-  float l1_;
-  float l2_;
+  double l1_;
+  double l2_;
 
   // GradientEvaluator used to compute the (unregularized) loss and gradient.
   const GradientEvaluator &gradient_evaluator_;
 
   // Convergence parameters.
   // Convergence threshold should be strict but not too strict.
-  // This will depend on precision used. As float gives 1e-8 relative accuracy,
+  // This will depend on the precision used. As double gives ~1e-16 relative
   // accuracy, 1e-6 or 1e-7 is probably the strictest one should use (but this
   // also depends on the implementation of convergence checks).
   // This can also be updated during initialization of the minimizer so the
@@ -201,16 +201,16 @@
   bool reached_solution_ =
       false;  // flag indicating whether the algorithm
               // actually reached the solution as determined by ConvergenceCheck
-  float convergence_threshold_ =
-      1e-5;  // threshold for assessing convergence by ConvergenceCheck
-  float simple_convergence_threshold_ =
+  double convergence_threshold_ =
+      1e-7;  // threshold for assessing convergence by ConvergenceCheck
+  double simple_convergence_threshold_ =
       1e-5;  // threshold for assessing convergence by SimpleConvergenceCheck
   bool use_simple_convergence_check_ = false;  // which convergence check to use
   int num_convergence_epochs_ = 5;             // used in SimpleConvergenceCheck
 
   // zero_threshold_ is the threshold below which we treat the coordinate value
   // as zero (in absolute terms). This is used in ConvergenceCheck.
-  float zero_threshold_ = 1e-6;
+  double zero_threshold_ = 1e-6;
 
   // The number of epochs (iterations) when Run() was executed.
   // In other words, each epoch is one step towards the minimum.
diff --git a/lossmin/minimizers/parallel-boosting-with-momentum.cc b/lossmin/minimizers/parallel-boosting-with-momentum.cc
index e3f25f6..3eb1848 100644
--- a/lossmin/minimizers/parallel-boosting-with-momentum.cc
+++ b/lossmin/minimizers/parallel-boosting-with-momentum.cc
@@ -16,8 +16,8 @@
 
 void ParallelBoostingWithMomentum::Setup() {
   compute_and_set_learning_rates();
-  alpha_ = 0.5f;
-  beta_ = 1.0f - alpha_;
+  alpha_ = 0.5;
+  beta_ = 1.0 - alpha_;
   phi_center_ = Weights::Zero(gradient_evaluator().NumWeights());
 }
 
@@ -25,7 +25,7 @@
   // Per-coordinate learning rates are learning_rates[j] = 1 / (sparsity * Lj),
   // where sparsity is the maximum instance l0 norm and Lj is an upper bound on
   // the loss curvature along coordinate j.
-  float sparsity = gradient_evaluator().Sparsity();
+  double sparsity = gradient_evaluator().Sparsity();
   gradient_evaluator().PerCoordinateCurvature(&learning_rates_);
   learning_rates_ =
       (learning_rates_.array() + l2()).inverse().matrix() / sparsity;
@@ -37,25 +37,25 @@
   // TODO(bazyli) parallel matrix-vector multiply with OpenMP
   Weights residual = gradient_evaluator().instances() * weights;
   residual -= gradient_evaluator().labels();
-  *gradient = instances_transpose_ * residual;
+  *gradient = gradient_evaluator().instances_transposed() * residual;
   *gradient /= gradient_evaluator().NumExamples();
 }
 
-float ParallelBoostingWithMomentum::Loss(const Weights &weights) const {
+double ParallelBoostingWithMomentum::Loss(const Weights &weights) const {
   // Eigen recommends step-by-step computations for best performance.
   // TODO(bazyli) parallel matrix-vector multiply with OpenMP
   Weights residual = gradient_evaluator().instances() * weights;
   residual -= gradient_evaluator().labels();
-  float loss =
+  double loss =
       0.5 * residual.squaredNorm() / gradient_evaluator().NumExamples();
-  if (l2() > 0.0f) loss += 0.5 * l2() * weights.squaredNorm();
-  if (l1() > 0.0f) loss += l1() * weights.cwiseAbs().sum();
+  if (l2() > 0.0) loss += 0.5 * l2() * weights.squaredNorm();
+  if (l1() > 0.0) loss += l1() * weights.cwiseAbs().sum();
   return loss;
 }
 
 void ParallelBoostingWithMomentum::ConvergenceCheck(const Weights &weights,
                                                     const Weights &gradient) {
-  float error_squared = 0.0f;
+  double error_squared = 0.0;
   for (int i = 0; i < gradient.size(); i++) {
     // for weights > 0 the gradient should be == -l1
     if (weights(i) > zero_threshold()) {
@@ -67,7 +67,7 @@
     }
     // for weights == 0 the gradient should be between -l1 and l1
     else {
-      float err = std::max(std::abs(gradient(i)) - l1(), 0.0f);
+      double err = std::max(std::abs(gradient(i)) - l1(), 0.0);
       error_squared += err * err;
     }
   }
@@ -81,25 +81,25 @@
 void ParallelBoostingWithMomentum::EpochUpdate(Weights *weights, int epoch,
                                                bool check_convergence) {
   // Compute the intermediate weight vector y.
-  Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
+  Weights y = (1.0 - alpha_) * *weights + alpha_ * phi_center_;
 
   // Compute the gradient of the loss (except l1 penalty) wrt y.
   Weights gradient_wrt_y = Weights::Zero(y.size());
   SparseInnerProductGradient(y, &gradient_wrt_y);
-  if (l2() > 0.0f) gradient_wrt_y += l2() * y;
+  if (l2() > 0.0) gradient_wrt_y += l2() * y;
 
   // Gradient step.
   *weights -= gradient_wrt_y.cwiseProduct(learning_rates_);
 
   // l1 shrinkage.
-  if (l1() > 0.0f) {
+  if (l1() > 0.0) {
     L1Prox(l1() * learning_rates_, weights);
   }
 
   // Update the approximation function.
   phi_center_ -= (1.0 - alpha_) / alpha_ * (y - *weights);
   alpha_ =
-      -beta_ / 2.0 + pow(beta_ + beta_ * beta_ / 4.0, static_cast<float>(0.5));
+      -beta_ / 2.0 + pow(beta_ + beta_ * beta_ / 4.0, 0.5);
   beta_ *= (1.0 - alpha_);
 
   // Compute the gradient of the objective except the l1 part and check
diff --git a/lossmin/minimizers/parallel-boosting-with-momentum.h b/lossmin/minimizers/parallel-boosting-with-momentum.h
index 62e75e6..c86c11f 100644
--- a/lossmin/minimizers/parallel-boosting-with-momentum.h
+++ b/lossmin/minimizers/parallel-boosting-with-momentum.h
@@ -21,10 +21,9 @@
 
 class ParallelBoostingWithMomentum : public LossMinimizer {
  public:
-  ParallelBoostingWithMomentum(float l1, float l2,
+  ParallelBoostingWithMomentum(double l1, double l2,
                                const GradientEvaluator &gradient_evaluator)
-      : LossMinimizer(l1, l2, gradient_evaluator),
-        instances_transpose_(gradient_evaluator.instances().transpose()) {
+      : LossMinimizer(l1, l2, gradient_evaluator) {
     Setup();
   }
 
@@ -53,7 +52,7 @@
   // regularization. Uses sparse matrix multiply from Eigen.
   // This is more efficient in terms of performed operations than calling
   // gradient_evaluator().Loss(weights).
-  float Loss(const Weights &weights) const override;
+  double Loss(const Weights &weights) const override;
 
   // Computes the inner product gradient at |weights|, written to |*gradient|;
   // |*gradient| must be of the same size as |weights|. Uses sparse matrix
@@ -69,17 +68,17 @@
   // Following the paper exactly, phi_center_ should be equal to the
   // initial guess for weights when Run() is executed (however, this requirement
   // does not seem to be necessary for convergence in practice).
-  void set_phi_center(const VectorXf &phi) { phi_center_ = phi; }
+  void set_phi_center(const VectorXd &phi) { phi_center_ = phi; }
 
   // Computes the learning rates. This is introduced to enable recomputing
   // learning rates in case l2 penalty changes.
   void compute_and_set_learning_rates();
 
   // Set alpha_; we may need to reset alpha before Run().
-  void set_alpha(const float alpha) { alpha_ = alpha; }
+  void set_alpha(const double alpha) { alpha_ = alpha; }
 
   // Set beta_; we may need to reset beta before Run().
-  void set_beta(const float beta) { beta_ = beta; }
+  void set_beta(const double beta) { beta_ = beta; }
 
  private:
   // Runs one epoch (iteration) of the algorithm.
@@ -94,24 +93,19 @@
                    bool check_convergence) override;
 
   // Per-coordinate learning rates.
-  VectorXf learning_rates_;
+  VectorXd learning_rates_;
 
   // Center of the approximating quadratic function phi.
-  VectorXf phi_center_;
+  VectorXd phi_center_;
 
   // Parameter for updating the approximation function phi. At each iteration,
   // 'alpha_' is updated to the solution of the quadratic equation
   //     alpha_^2 = beta_ * (1.0 - alpha_)
-  float alpha_;
+  double alpha_;
 
   // Parameter used to update alpha, defined as
   //     beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
-  float beta_;
-
-  // The transpose of instances. It is needed for faster gradient computations.
-  // It should be computed when (and only when) instances changes,
-  // so it is computed at the construction of the minimizer.
-  const InstanceSet instances_transpose_;
+  double beta_;
 };
 
 }  // namespace lossmin