Remove most of the unused code from the linear algebra library (lossmin).

Most of the removed code was specific to solvers that we do not use.
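
For reference, the retained call path is roughly the following sketch.
The GradientEvaluator and ParallelBoostingWithMomentum constructor
signatures below are assumptions inferred from this diff, not verified
against the headers:

    #include <vector>

    #include "lossmin/losses/inner-product-loss-function.h"
    #include "lossmin/minimizers/gradient-evaluator.h"
    #include "lossmin/minimizers/parallel-boosting-with-momentum.h"

    // Fits a linear regression with l2 regularization using the
    // remaining (deterministic) minimizer. Constructor argument order
    // is assumed.
    void ExampleFit(const lossmin::InstanceSet &instances,
                    const lossmin::LabelSet &labels) {
      lossmin::LinearRegressionLossFunction loss_function;
      lossmin::GradientEvaluator evaluator(instances, labels,
                                           &loss_function);
      lossmin::ParallelBoostingWithMomentum minimizer(
          /*l1=*/0.0f, /*l2=*/0.1f, evaluator);
      lossmin::Weights weights =
          lossmin::Weights::Zero(evaluator.NumWeights());
      std::vector<float> loss;
      // Convenience Run() that evaluates the loss every epoch.
      minimizer.Run(/*max_epochs=*/100, &weights, &loss);
    }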

Change-Id: Id61ebc04f77209cbadb549e4b1cb9779c33543cc
diff --git a/lossmin/losses/inner-product-loss-function.h b/lossmin/losses/inner-product-loss-function.h
index a5c8d22..d00cef2 100644
--- a/lossmin/losses/inner-product-loss-function.h
+++ b/lossmin/losses/inner-product-loss-function.h
@@ -86,43 +86,6 @@
   float curvature_;
 };
 
-// Binary logistic regression with labels in {-1, 1}.
-class LogisticRegressionLossFunction : public InnerProductLossFunction {
- public:
-  LogisticRegressionLossFunction() { set_curvature(0.25); }
-
-  // Returns the logistic loss.
-  float InnerProductExampleLoss(float inner_product, float label)
-      const override {
-    return log1pf(exp(-label * inner_product));
-  }
-
-  // Returns the gradient of the logistic loss wrt 'inner_product'.
-  float InnerProductExampleGradient(float inner_product, float label)
-      const override {
-    if (label * inner_product < kInputMin) return -label;
-    return -label / (1.0f + exp(label * inner_product));
-  }
-
-  // Assigns a label in {-1, 1} given 'inner_product'.
-  float InnerProductPredictLabel(float inner_product) const override {
-    if (inner_product > 0) return 1.0f;
-    return -1.0f;
-  }
-
-  // Returns the magnitude of the second derivative of the loss, evaluated at
-  // 'inner_product' and 'label'.
-  float InnerProductCurvature(float inner_product, float label)
-      const override {
-    if (-label * inner_product < kInputMin) return 0.0f;
-    float label_prob = 1.0f / (1.0f + exp(-label * inner_product));
-    return label_prob * (1.0f - label_prob);
-  }
-
-  // Min allowed exp input.
-  const float kInputMin = log(FLT_EPSILON);
-};
-
 // Linear regression with squared error loss.
 class LinearRegressionLossFunction : public InnerProductLossFunction {
  public:
@@ -146,131 +109,4 @@
   }
 };
 
-// Poisson regression, where the distribution mean is the inner product between
-// an instance and parameters.
-class PoissonRegressionLossFunction : public InnerProductLossFunction {
- public:
-  PoissonRegressionLossFunction() {}
-
-  // Returns the Poisson regression loss.
-  float InnerProductExampleLoss(float inner_product, float label)
-      const override {
-    return exp(inner_product) - label * inner_product;
-  }
-
-  // Returns the gradient of the Poisson regression loss wrt 'inner_product'.
-  float InnerProductExampleGradient(float inner_product, float label)
-      const override {
-    return exp(inner_product) - label;
-  }
-
-  // Returns the mode of a Poisson distribution with mean 'inner_product'.
-  float InnerProductPredictLabel(float inner_product) const override {
-    return floor(inner_product);
-  }
-
-  // Returns the magnitude of the second derivative of the loss, evaluated at
-  // 'inner_product' and 'label'.
-  float InnerProductCurvature(float inner_product, float label)
-      const override {
-    return exp(inner_product);
-  }
-};
-
-// Binary logistic regression where 'label' = P(true_label = 1) is in [0, 1],
-// and the loss is averaged over labels:
-// label * logloss(inner_product, 1) + (1 - label) * logloss(inner_product, 0).
-class AveragedLogisticLossFunction : public InnerProductLossFunction {
- public:
-  AveragedLogisticLossFunction() { set_curvature(0.25); }
-
-  // Returns the averaged logistic loss.
-  float InnerProductExampleLoss(float inner_product, float label)
-      const override {
-    return (1.0 - label) * log1pf(exp(inner_product))
-        + label * log1pf(exp(-inner_product));
-  }
-
-  // Returns the gradient of the averaged logistic loss wrt 'inner_product'.
-  float InnerProductExampleGradient(float inner_product, float label)
-      const override {
-    if (-inner_product < kInputMin) return -label + 1.0f;
-    return -label + 1.0f / (1.0f + exp(-inner_product));
-  }
-
-  // Returns the probability that the label is 1 given the inner product.
-  float InnerProductPredictLabel(float inner_product) const override {
-    if (-inner_product < kInputMin) return 1.0f;
-    return 1.0f / (1.0f + exp(-inner_product));
-  }
-
-  // Returns the magnitude of the second derivative of the loss, evaluated at
-  // 'inner_product' and 'label'.
-  float InnerProductCurvature(float inner_product, float label)
-      const override {
-    float label_prob = 1.0f / (1.0f + exp(-inner_product));
-    if (-inner_product < kInputMin) return 0.0f;
-    return label_prob * (1.0f - label_prob);
-  }
-
-  // Min allowed exp input.
-  const float kInputMin = log(FLT_EPSILON);
-};
-
-
-// Smooth hinge loss. If a = inner_product * label, the loss is
-// loss = 0.0                                   a > threshold
-//        0.5 * (a - threshold)^2               0 < a <= threshold
-//        0.5 * threshold^2 - threshold * a     a <= 0
-class SmoothHingeLossFunction : public InnerProductLossFunction {
- public:
-  SmoothHingeLossFunction() { set_curvature(1.0f); }
-
-  /*
-  void Setup(const LossFunctionConfigContainer &config) override {
-    if (config.has_smooth_hinge_threshold()) {
-      threshold_ = config.smooth_hinge_threshold();
-    }
-    set_curvature(threshold_);
-  }
-  */
-
-  float InnerProductExampleLoss(float inner_product, float label)
-      const override {
-    float hinge_input = inner_product *= label;
-    if (hinge_input >= threshold_) {
-      return 0.0;
-    } else if (hinge_input <= 0.0f) {
-      return (0.5 * threshold_ * threshold_ - threshold_ * hinge_input);
-    } else {
-      return 0.5 * (hinge_input - threshold_) * (hinge_input - threshold_);
-    }
-  }
-
-  float InnerProductExampleGradient(float inner_product, float label)
-      const override {
-    float hinge_input = inner_product * label;
-    if (hinge_input >= threshold_) {
-      return 0.0f;
-    } else if (hinge_input <= 0.0f) {
-      return -label * threshold_;
-    } else {
-      return (hinge_input - threshold_) * label;
-    }
-  }
-
-  // Assigns a label in {-1, 1} given 'inner_product'.
-  float InnerProductPredictLabel(float inner_product) const override {
-    if (inner_product > 0) return 1.0f;
-    return -1.0f;
-  }
-
-  void set_threshold(float threshold) { threshold_ = threshold; }
-
- private:
-  float threshold_ = 1.0f;
-};
-
-
-}  // namespace lossmin
-
+}  // namespace lossmin
\ No newline at end of file
diff --git a/lossmin/minimizers/gradient-evaluator.cc b/lossmin/minimizers/gradient-evaluator.cc
index 270d334..32fafbc 100644
--- a/lossmin/minimizers/gradient-evaluator.cc
+++ b/lossmin/minimizers/gradient-evaluator.cc
@@ -40,45 +40,4 @@
   *gradient /= NumExamples();
 }
 
-void GradientEvaluator::set_num_threads(int num_threads,
-                                        int batch_size) {
-  num_threads_ = num_threads;
-  if (num_threads_ == 1) return;
-
-  num_batches_ = (NumExamples() + batch_size - 1) / batch_size;
-
-  // Assign examples to each batch.
-  batch_examples_.assign(num_batches_, {});
-  for (auto &batch : batch_examples_) batch.reserve(batch_size);
-  for (int i = 0; i < NumExamples(); ++i) {
-    batch_examples_[i % num_batches_].push_back(i);
-  }
-}
-
-/*
-void GradientEvaluator::ThreadLoss(
-    const Weights *weights, int example, VectorXf *loss_values,
-    BlockingCounter *num_jobs_remaining) const {
-  // Compute example loss.
-  loss_values->coeffRef(example) =
-      loss_function_->ExampleLoss(*weights, instances_, labels_, example);
-
-  // Decrement thread counter.
-  num_jobs_remaining->DecrementCount();
-}
-
-void GradientEvaluator::ThreadGradient(
-    const Weights *weights, int batch, Weights *gradient,
-    BlockingCounter *num_jobs_remaining) const {
-  // Add the example gradient.
-  for (int example : batch_examples_[batch]) {
-    loss_function_->AddExampleGradient(
-        *weights, instances_, labels_, example, 1.0f, 1.0f, gradient);
-  }
-
-  // Decrement thread counter.
-  num_jobs_remaining->DecrementCount();
-}
-*/
-
 }  // namespace lossmin
diff --git a/lossmin/minimizers/gradient-evaluator.h b/lossmin/minimizers/gradient-evaluator.h
index e63b842..d756759 100644
--- a/lossmin/minimizers/gradient-evaluator.h
+++ b/lossmin/minimizers/gradient-evaluator.h
@@ -77,16 +77,6 @@
         example_gradient);
   }
 
-  // Sets the number of threads and batch size for multithreaded computation.
-  // Assigns examples to 'batch_examples_'.
-  void set_num_threads(int num_threads, int batch_size);
-
-  // Convenience method that sets the number of batches to a default.
-  void set_num_threads(int num_threads) {
-    int batch_size = static_cast<int>(NumExamples() / num_threads / 2);
-    set_num_threads(num_threads, batch_size);
-  }
-
   // Returns the number of examples in the dataset.
   virtual int NumExamples() const { return instances_.rows(); }
 
@@ -98,12 +88,6 @@
     return loss_function_->NumWeights(NumFeatures());
   }
 
-  // Returns the number of batches for multithreaded computation.
-  int num_batches() const { return num_batches_; }
-
-  // Returns the number of threads.
-  int num_threads() const { return num_threads_; }
-
   // Returns an upper bound on the curvature of the loss function. Used to set
   // the learning rate of some LossMinimizer algorithms.
   virtual float LossCurvature() const {
@@ -137,25 +121,6 @@
   // Returns the labels.
   const LabelSet &labels() const { return labels_; }
 
-  // Returns a list of examples in batch 'batch'.
-  const std::vector<int> &batch_examples(int batch) const {
-    return batch_examples_[batch];
-  }
-
- protected:
-  /*
-  // Computes the loss for a single example 'example' and returns it in
-  // 'loss_values[example_index]'.
-  virtual void ThreadLoss(
-      const Weights *weights, int example, VectorXf *loss_values,
-      BlockingCounter *num_jobs_remaining) const;
-
-  // Computes the gradient for examples in batch 'batch', adds it to 'gradient'.
-  virtual void ThreadGradient(
-      const Weights *weights, int batch, Weights *gradient,
-      BlockingCounter *num_jobs_remaining) const;
-  */
-
  private:
   // Training instances.
   const InstanceSet &instances_;
@@ -166,19 +131,6 @@
   // Function for computing the loss and gradient of a single training example.
   // Not owned.
   const LossFunction *loss_function_;
-
-  // Mutex for multithreaded gradient computation.
-  mutable std::mutex gradient_update_mutex_;
-
-  // Number of threads used to parallelize gradient computation.
-  // If num_threads_ = 1, multithreading is not used.
-  int num_threads_ = 1;
-
-  // List of examples assigned to each thread.
-  std::vector<std::vector<int>> batch_examples_;
-
-  // Number of batches.
-  int num_batches_;
 };
 
 }  // namespace lossmin
diff --git a/lossmin/minimizers/loss-minimizer.cc b/lossmin/minimizers/loss-minimizer.cc
index fef8e6b..45ea3d6 100644
--- a/lossmin/minimizers/loss-minimizer.cc
+++ b/lossmin/minimizers/loss-minimizer.cc
@@ -48,103 +48,18 @@
   return converged_;
 }
 
-bool LossMinimizer::Run(int max_epochs, int loss_epochs, int convergence_epochs,
-                        const InstanceSet &validation_instances,
-                        const LabelSet &validation_labels, Weights *weights,
-                        std::vector<float> *training_loss,
-                        std::vector<float> *validation_loss) {
-  // Run for up to 'max_epochs' epochs.
-  int epoch;
-  for (epoch = 0; epoch < max_epochs; ++epoch) {
-    // Compute the loss.
-    if (epoch % loss_epochs == 0) {
-      training_loss->push_back(Loss(*weights));
-      validation_loss->push_back(gradient_evaluator_.Loss(
-          *weights, validation_instances, validation_labels));
-      // LOG(INFO) << epoch << ": " << training_loss->at(training_loss->size() -
-      // 1)
-      //<< "  " << validation_loss->at(validation_loss->size() - 1);
-    }
-
-    // Set the 'check_convergence' flag.
-    bool check_convergence = (epoch > 0) && (epoch % convergence_epochs == 0);
-
-    // Run for one epoch to update the parameters.
-    EpochUpdate(weights, epoch, check_convergence);
-
-    // Optionally do a simple convergence check.
-    if (check_convergence && use_simple_convergence_check_) {
-      SimpleConvergenceCheck(*training_loss);
-    }
-
-    // Check the convergence flag.
-    if (converged_) {
-      break;
-    }
-  }
-
-  // Compute final loss.
-  training_loss->push_back(Loss(*weights));
-  validation_loss->push_back(gradient_evaluator_.Loss(
-      *weights, validation_instances, validation_labels));
-  // LOG(INFO) << "final loss: "
-  //<< training_loss->at(training_loss->size() - 1);
-  // LOG(INFO) << "final validation loss: "
-  //<< validation_loss->at(validation_loss->size() - 1);
-  num_epochs_run_ = std::min(epoch + 1, max_epochs);
-  return converged_;
-}
-
-float LossMinimizer::BestLearningRate() {
-  // Sample a subset of the data.
-  std::vector<int> examples(num_search_examples_);
-  std::mt19937 rnd;
-  std::uniform_int_distribution<> dis(0, gradient_evaluator_.NumExamples());
-  for (int i = 0; i < num_search_examples_; ++i) {
-    examples[i] = dis(rnd);
-  }
-
-  float min_loss_rate = initial_rates_[0];
-  float min_loss = HUGE_VAL;
-  for (int i = 0; i < initial_rates_.size(); ++i) {
-    // Initialize parameters to zero.
-    Weights weights = Weights::Zero(gradient_evaluator_.NumWeights());
-
-    // Make updates with the current learning rate.
-    Weights gradient(weights.size());
-    for (int example : examples) {
-      gradient = l2() * weights;
-      gradient_evaluator_.AddExampleGradient(weights, example, 1.0f, 1.0f,
-                                             &gradient);
-      weights -= initial_rates_[i] * gradient;
-    }
-
-    // Compute the corresponding loss.
-    float loss = gradient_evaluator_.Loss(weights, examples);
-    if (i == 0) {
-      min_loss = loss;
-    } else if (loss < min_loss) {
-      min_loss = loss;
-      min_loss_rate = initial_rates_[i];
-    }
-  }
-
-  // Return the learning rate corresponding to the smallest loss.
-  // LOG(INFO) << "best initial learning rate: " << min_loss_rate;
-  return min_loss_rate;
-}
-
 // By default only check if gradient is == 0
 void LossMinimizer::ConvergenceCheck(const Weights &weights,
                                      const Weights &gradient) {
   if (gradient.squaredNorm() / weights.size() < convergence_threshold_) {
+    set_reached_solution(true);
     set_converged(true);
   }
 }
 
 void LossMinimizer::SimpleConvergenceCheck(const std::vector<float> &loss) {
   // Check convergence by verifying that the max relative loss decrease
-  // (loss[t-1] - loss[t]) / loss[t-1] is below 'convergence_threshold_'.
+  // (loss[t-1] - loss[t]) / loss[t-1] is below 'simple_convergence_threshold_'.
   if (loss.size() > num_convergence_epochs_) {
     float loss_difference = 0.0f;
     for (int i = loss.size() - num_convergence_epochs_; i < loss.size(); ++i) {
@@ -159,16 +74,4 @@
   }
 }
 
-void LossMinimizer::set_batch_size(int batch_size) {
-  int num_batches =
-      (gradient_evaluator_.NumExamples() + batch_size - 1) / batch_size;
-
-  // Assign examples to each batch.
-  batch_examples_.assign(num_batches, {});
-  for (auto &batch : batch_examples_) batch.reserve(batch_size);
-  for (int i = 0; i < gradient_evaluator().NumExamples(); ++i) {
-    batch_examples_[i % num_batches].push_back(i);
-  }
-}
-
 }  // namespace lossmin
diff --git a/lossmin/minimizers/loss-minimizer.h b/lossmin/minimizers/loss-minimizer.h
index 0f05d10..eb87947 100644
--- a/lossmin/minimizers/loss-minimizer.h
+++ b/lossmin/minimizers/loss-minimizer.h
@@ -33,6 +33,7 @@
 // the data and the loss function. If the dataset or the loss pointed to by
 // 'gradient_evaluator' change, the user must call loss_minimizer.Setup() method
 // before loss_minimizer.Run().
+// TODO(bazyli) Update description.
 
 #pragma once
 
@@ -63,10 +64,7 @@
       : l1_(l1),
         l2_(l2),
         gradient_evaluator_(gradient_evaluator),
-        converged_(false) {
-    std::random_device rd;
-    rnd_.seed(rd());
-  }
+        converged_(false) {}
   virtual ~LossMinimizer() {}
 
   // Initializes algorithm-specific parameters such as learning rates, based
@@ -83,13 +81,6 @@
   bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
            Weights *weights, std::vector<float> *loss);
 
-  // Runs minimization, evaluating loss on both training and validation data.
-  bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
-           const InstanceSet &validation_instances,
-           const LabelSet &validation_labels, Weights *weights,
-           std::vector<float> *training_loss,
-           std::vector<float> *validation_loss);
-
   // Convenience Run method that evaluates the loss and checks for convergence
   // at every iteration.
   bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
@@ -109,8 +100,9 @@
     return loss;
   }
 
-  // Checks convergence of deterministic methods.
-  // If converged, the flag 'converged_' is set to true.
+  // Checks convergence by verifying the necessary and sufficient conditions
+  // for a minimizer directly. If converged, the flags 'converged_' and
+  // 'reached_solution_' are set to true.
   virtual void ConvergenceCheck(const Weights &weights,
                                 const Weights &gradient);
 
@@ -147,49 +139,6 @@
     zero_threshold_ = zero_threshold;
   }
 
-  // Returns the best initial learning rate for stochastic methods by evaluating
-  // elements of 'initial_rates_' on a subset of 'num_search_examples_' training
-  // examples.
-  float BestLearningRate();
-
-  // Methods that set parameters related to the initial learning rate in
-  // stochastic methods.
-  virtual void set_initial_learning_rate(float initial_learning_rate) {
-    initial_learning_rate_ = initial_learning_rate;
-    // LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_;
-  }
-  float initial_learning_rate() const { return initial_learning_rate_; }
-  void set_initial_rates(const std::vector<float> &initial_rates) {
-    initial_rates_ = initial_rates;
-  }
-  void set_num_search_examples(int num_search_examples) {
-    num_search_examples_ = num_search_examples;
-  }
-
-  // Sets the seed of the random number generator.
-  void SetSeed(int32_t seed) { rnd_.seed(seed); }
-
-  // Number of threads for stochastic methods.
-  void set_num_threads(int num_threads) { num_threads_ = num_threads; }
-  int num_threads() const { return num_threads_; }
-
-  // Minibatch size for stochastic methods.
-  void set_batch_size(int batch_size);
-  int NumBatches() const { return batch_examples_.size(); }
-
-  // Shuffles the batches with examples.
-  void ShuffleBatches() {
-    std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_);
-  }
-
-  // Shuffles and returns the examples in 'batch_examples_[batch]'.
-  const std::vector<int> &batch_examples(int batch) {
-    // DCHECK(batch < batch_examples_.size());
-    std::shuffle(batch_examples_[batch].begin(), batch_examples_[batch].end(),
-                 rnd_);
-    return batch_examples_[batch];
-  }
-
   // Returns a reference to 'gradient_evaluator_'.
   const GradientEvaluator &gradient_evaluator() const {
     return gradient_evaluator_;
@@ -203,27 +152,6 @@
   float l2() const { return l2_; }
   void set_l2(float l2) { l2_ = l2; }
 
-  // Getter/setter for the 'num_scale_iterations_', used for sparse updates in
-  // stochastic methods.
-  int num_scale_iterations() const { return num_scale_iterations_; }
-  void set_num_scale_iterations(int num_scale_iterations) {
-    num_scale_iterations_ = num_scale_iterations;
-  }
-
-  // Getter/setter for 'num_iterations_per_stage_', used in
-  // StochasticVarianceReducedGradient.
-  int num_iterations_per_stage() const { return num_iterations_per_stage_; }
-  void set_num_iterations_per_stage(int num_iterations_per_stage) {
-    num_iterations_per_stage_ = num_iterations_per_stage;
-  }
-
-  // Getter/setter for projecting weights onto a ball in
-  // StochasticGradientDescent.
-  bool project_weights() const { return project_weights_; }
-  void set_project_weights(bool project_weights) {
-    project_weights_ = project_weights;
-  }
-
   // Returns the number of iterations the last time Run() was executed
   int num_epochs_run() const { return num_epochs_run_; }
 
@@ -254,9 +182,6 @@
     return 0.0f;
   }
 
-  // Default initial learning rate for stochastic methods.
-  const float kDefaultInitialLearningRate = 0.05;
-
  private:
   // Regularization parameters.
   float l1_;
@@ -291,36 +216,6 @@
   // In other words, each epoch is a step towards minimum during minimization.
   // This variable gets updated when Run() is called.
   int num_epochs_run_ = 0;
-
-  // Initial learning rate, used in stochastic methods.
-  float initial_learning_rate_ = 0.01;
-
-  // Parameters used to search for a good initial learning rate for stochastic
-  // methods in the method BestLearningRate(). 'initial_rates_' are evaluated on
-  // a data subset of size 'num_search_examples_'.
-  std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05};
-  int num_search_examples_ = 500;
-
-  // Multithreading parameters, optionally used in stochastic methods.
-  int num_threads_;
-  std::vector<std::vector<int>> batch_examples_;
-
-  // Pseudo-random number generator for shuffling 'batch_examples_'.
-  std::mt19937 rnd_;
-
-  // In stochastic methods, weights = scale * Weights. For numerical stability,
-  // this is reset to scale = 1.0 every 'num_scale_iterations_' iterations.
-  int num_scale_iterations_ = 100;
-
-  // Number of iterations per stage; used in StochasticVarianceReducedGradient.
-  int num_iterations_per_stage_ = 100000;
-
-  // For online/stochastic methods, a simple bound on the optimal solution is
-  // |weights|^2 <= 2 / L2. Knowing this, if an intermediate solution is
-  // outside of a ball of radius sqrt(2 / L2), we can project it onto the ball
-  // to potentially speed up convergence. This approach is used in
-  // StochasticGradientDescent, and controlled by the flag 'project_weights_'.
-  bool project_weights_ = true;
 };
 
 }  // namespace lossmin
diff --git a/lossmin/minimizers/parallel-boosting-with-momentum.cc b/lossmin/minimizers/parallel-boosting-with-momentum.cc
index 127842c..e3f25f6 100644
--- a/lossmin/minimizers/parallel-boosting-with-momentum.cc
+++ b/lossmin/minimizers/parallel-boosting-with-momentum.cc
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 // Author: nevena@google.com (Nevena Lazic)
+// TODO(bazyli) update license header
 
 #include "lossmin/minimizers/parallel-boosting-with-momentum.h"
 
@@ -14,7 +15,6 @@
 namespace lossmin {
 
 void ParallelBoostingWithMomentum::Setup() {
-  // Initialize the approximating function parameters.
   compute_and_set_learning_rates();
   alpha_ = 0.5f;
   beta_ = 1.0f - alpha_;
@@ -34,6 +34,7 @@
 void ParallelBoostingWithMomentum::SparseInnerProductGradient(
     const Weights &weights, Weights *gradient) const {
   // Eigen library recommends computations step-by-step for best perfomance
+  // TODO(bazyli) parallelize the matrix-vector multiply with OpenMP.
   Weights residual = gradient_evaluator().instances() * weights;
   residual -= gradient_evaluator().labels();
   *gradient = instances_transpose_ * residual;
@@ -42,6 +43,7 @@
 
 float ParallelBoostingWithMomentum::Loss(const Weights &weights) const {
   // Eigen library recommends computations step-by-step for best perfomance
+  // TODO(bazyli) parallelize the matrix-vector multiply with OpenMP.
   Weights residual = gradient_evaluator().instances() * weights;
   residual -= gradient_evaluator().labels();
   float loss =
@@ -81,10 +83,9 @@
   // Compute the intermediate weight vector y.
   Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
 
-  // Compute the gradient of the loss wrt y.
+  // Compute the gradient of the loss (excluding the l1 penalty) wrt y.
   Weights gradient_wrt_y = Weights::Zero(y.size());
   SparseInnerProductGradient(y, &gradient_wrt_y);
-
   if (l2() > 0.0f) gradient_wrt_y += l2() * y;
 
   // Gradient step.
diff --git a/lossmin/minimizers/parallel-boosting-with-momentum.h b/lossmin/minimizers/parallel-boosting-with-momentum.h
index d8e5bb2..62e75e6 100644
--- a/lossmin/minimizers/parallel-boosting-with-momentum.h
+++ b/lossmin/minimizers/parallel-boosting-with-momentum.h
@@ -7,6 +7,7 @@
 // I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
 // Momentum", ECML PKDD 2013.
 // Variable names follow the notation in the paper.
+// TODO(bazyli) update license
 
 #pragma once
 
@@ -27,7 +28,7 @@
     Setup();
   }
 
-  // Sets learning rates and other parameters.
+  // Sets learning rates and initializes alpha_, beta_, and phi_center_.
   void Setup() override;
 
   // Checks convergence by verifying the KKT conditions directly.
@@ -42,14 +43,16 @@
   // If weights_i == 0 then -l1() <= gradient_i <= l1().
   //
   // The squared violations at each coordinate are summed and the square
-  // root divided by weights.size() is compared to convergence_thresold()
+  // root divided by weights.size() is compared to convergence_threshold().
+  // If convergence is determined, sets the appropriate flags so that
+  // converged() == true and reached_solution() == true.
   void ConvergenceCheck(const Weights &weights,
                         const Weights &gradient) override;
 
-  // Returns the total loss for given parameters 'weights', including l1 and l2
+  // Returns the total loss for given parameters |weights|, including l1 and l2
   // regularization. Uses sparse matrix multiply from Eigen.
   // This is more efficient in terms of performed operations than calling
-  // gradient_evaluator().Loss(weights) but is not intended for parallelization.
+  // gradient_evaluator().Loss(weights).
   float Loss(const Weights &weights) const override;
 
   // Computes the inner product gradient at |weights|, written to |gradient*|;
@@ -57,6 +60,8 @@
   // multiply from Eigen. This is much more efficient in terms of performed
   // operations than calling gradient_evaluator().gradient(weights) but is not
   // intended for parallelization.
+  // TODO(bazyli) parallelize the sparse matrix * dense vector multiply using
+  // OpenMP.
   void SparseInnerProductGradient(const Weights &weights,
                                   Weights *gradient) const;
 
@@ -77,6 +82,7 @@
   void set_beta(const float beta) { beta_ = beta; }
 
  private:
+  // Runs one epoch (iteration) of the algorithm.
   // Updates 'weights' and the quadratic approximation function phi(w), such
   // that at iteration k, loss(weights_k) <= min_w phi_k(w).
   //     y = (1 - alpha) * weights + alpha * phi_center