Initial commit.
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f2eaccc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+/out/
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..1ea6393
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of Fuchsia Authors.
+# Names should be added to this file as one of
+#     Organization's name
+#     Individual's name <submission email address>
+#     Individual's name <submission email address> <email2> <emailN>
+Google Inc.
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..2b511db
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+cmake_minimum_required (VERSION 2.8.10)
+
+
+# The only way we found to override the compiler is to set these variables
+# prior to the project definition.  Otherwise enable_language() sets them and
+# gcc is picked up instead.  Also, newer versions of CMake complain if one
+# compiler is set first (say to gcc via enable_language) and is later switched
+# to (say) clang.
+SET (CMAKE_C_COMPILER "clang")
+SET (CMAKE_CXX_COMPILER "clang++")
+
+project(lossmin)
+
+
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -std=c++11 -stdlib=libstdc++ -fcolor-diagnostics")
+
+enable_language(C)
+enable_language(CXX)
+
+include_directories(..)
+
+add_executable(sample sample.cc)
+target_link_libraries(sample
+                      loss_functions
+                      minimizers)
+
+add_subdirectory(losses)
+add_subdirectory(minimizers)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..ac6402f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+// Copyright 2016 The Fuchsia Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PATENTS b/PATENTS
new file mode 100644
index 0000000..2746e78
--- /dev/null
+++ b/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the Fuchsia project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this
+section) patent license to make, have made, use, offer to sell, sell,
+import, transfer, and otherwise run, modify and propagate the contents
+of this implementation of Fuchsia, where such license applies only to
+those patent claims, both currently owned by Google and acquired in
+the future, licensable by Google that are necessarily infringed by
+this implementation. This grant does not include claims that would be
+infringed only as a consequence of further modification of this
+implementation. If you or your agent or exclusive licensee institute
+or order or agree to the institution of patent litigation or any other
+patent enforcement activity against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that this
+implementation of Fuchsia constitutes direct or contributory patent
+infringement, or inducement of patent infringement, then any patent
+rights granted to you under this License for this implementation of
+Fuchsia shall terminate as of the date such litigation is filed.
diff --git a/README b/README
new file mode 100644
index 0000000..ccfc5f2
--- /dev/null
+++ b/README
@@ -0,0 +1,6 @@
+lossmin
+=======
+
+Lossmin is a library for minimizing loss functions (for example linear and
+logistic regression losses) using gradient-based optimization algorithms.
+
+TODO(azani): Maybe write a bit more than that.
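+
+A minimal end-to-end sketch (mirroring sample.cc in this repository; all types
+come from eigen-types.h):
+
+  #include <random>
+  #include <vector>
+
+  #include "lossmin/losses/inner-product-loss-function.h"
+  #include "lossmin/minimizers/gradient-evaluator.h"
+  #include "lossmin/minimizers/parallel-boosting-with-momentum.h"
+
+  using namespace lossmin;
+
+  int main() {
+    // Synthetic data: labels are a noiseless linear function of the instances.
+    std::default_random_engine generator;
+    std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+    InstanceSet instances(100, 3);
+    for (int i = 0; i < 100; ++i)
+      for (int j = 0; j < 3; ++j) instances.insert(i, j) = distribution(generator);
+    Weights true_weights = Weights::Ones(3);
+    LabelSet labels = instances * true_weights;
+
+    LinearRegressionLossFunction loss_function;
+    GradientEvaluator evaluator(instances, labels, &loss_function);
+    ParallelBoostingWithMomentum minimizer(/*l1=*/0.0f, /*l2=*/0.0f, evaluator);
+
+    Weights weights = Weights::Zero(evaluator.NumWeights());
+    std::vector<float> loss;
+    minimizer.Run(/*max_epochs=*/500, &weights, &loss);  // weights should approach true_weights
+    return 0;
+  }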
diff --git a/eigen-types.h b/eigen-types.h
new file mode 100644
index 0000000..47df398
--- /dev/null
+++ b/eigen-types.h
@@ -0,0 +1,51 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+
+#include "third_party/eigen/Eigen/Core"
+#include "third_party/eigen/Eigen/SparseCore"
+
+namespace lossmin {
+
+// Arrays.
+typedef Eigen::Array<bool, Eigen::Dynamic, 1> ArrayXb;
+typedef Eigen::ArrayXf ArrayXf;
+typedef Eigen::ArrayXi ArrayXi;
+
+// Vectors.
+typedef Eigen::VectorXf VectorXf;
+typedef Eigen::VectorXi VectorXi;
+
+// Sparse Vectors.
+typedef Eigen::SparseVector<float> SparseVectorXf;
+
+// Matrix.
+typedef Eigen::Matrix<
+    float,
+    Eigen::Dynamic,
+    Eigen::Dynamic,
+    Eigen::ColMajor> MatrixXf;
+typedef Eigen::Matrix<
+    float,
+    Eigen::Dynamic,
+    Eigen::Dynamic,
+    Eigen::RowMajor> RowMatrixXf;
+
+// Sparse Matrix.
+typedef Eigen::SparseMatrix<float> SparseMatrixXf;
+
+// Instances and parameters.
+typedef VectorXf Weights;
+
+typedef VectorXf Label;
+typedef RowMatrixXf LabelSet;
+
+typedef Eigen::SparseVector<float> Instance;
+typedef Eigen::SparseMatrix<float, Eigen::RowMajor> InstanceSet;
+
+}  // namespace lossmin
+
diff --git a/losses/CMakeLists.txt b/losses/CMakeLists.txt
new file mode 100644
index 0000000..46ae888
--- /dev/null
+++ b/losses/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+add_library(loss_functions
+            loss-function.cc
+            inner-product-loss-function.cc)
diff --git a/losses/inner-product-loss-function.cc b/losses/inner-product-loss-function.cc
new file mode 100644
index 0000000..2d24293
--- /dev/null
+++ b/losses/inner-product-loss-function.cc
@@ -0,0 +1,89 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/losses/inner-product-loss-function.h"
+#include <math.h>
+
+namespace lossmin {
+
+float InnerProductLossFunction::LossCurvature(
+    const InstanceSet &instances) const {
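+  // data_curvature is the maximum squared l2 norm over instances,
+  // max_i ||instance_i||^2, computed as the largest row sum of the
+  // entrywise-squared instance matrix.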
+  float data_curvature = (instances.cwiseProduct(instances) *
+                          Weights::Ones(instances.cols())).maxCoeff();
+  return curvature_ * data_curvature;
+}
+
+void InnerProductLossFunction::PerCoordinateCurvature(
+    const InstanceSet &instances, VectorXf *per_coordinate_curvature) const {
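+  // For each coordinate j, the average of the squared feature values over all
+  // instances, scaled by the curvature bound 'curvature_'.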
+  *per_coordinate_curvature =
+      VectorXf::Ones(instances.rows()).transpose() *
+            instances.cwiseProduct(instances) / instances.rows();
+  *per_coordinate_curvature *= curvature_;
+}
+
+float InnerProductLossFunction::ExampleLoss(
+    const Weights &weights, const InstanceSet &instances,
+    const LabelSet &labels, int example) const {
+  float inner_product = instances.innerVector(example).dot(weights);
+  return InnerProductExampleLoss(inner_product, labels.coeff(example, 0));
+}
+
+void InnerProductLossFunction::AddExampleGradient(
+    const Weights &weights, const InstanceSet &instances,
+    const LabelSet &labels, int example, float weights_scale,
+    float example_scale, Weights *gradient) const {
+  float inner_product = instances.innerVector(example).dot(weights);
+  if (weights_scale != 1.0f) inner_product *= weights_scale;
+  float inner_product_gradient =
+      InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
+  if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+
+  if (synchronous_update()) {
+    std::lock_guard<std::mutex> lock(gradient_update_mutex_);
+    for (InstanceSet::InnerIterator it(instances, example); it; ++it) {
+      gradient->coeffRef(it.index()) += inner_product_gradient * it.value();
+    }
+  } else {
+    for (InstanceSet::InnerIterator it(instances, example); it; ++it) {
+      gradient->coeffRef(it.index()) += inner_product_gradient * it.value();
+    }
+  }
+}
+
+void InnerProductLossFunction::ExampleGradient(
+    const Weights &weights, const InstanceSet &instances,
+    const LabelSet &labels, int example, float weights_scale,
+    float example_scale,
+    std::vector<std::pair<int, float>> *example_gradient) const {
+  float inner_product = instances.innerVector(example).dot(weights);
+  if (weights_scale != 1.0f) inner_product *= weights_scale;
+  float inner_product_gradient =
+      InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
+  if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+
+  example_gradient->resize(instances.row(example).nonZeros());
+  int i = 0;
+  for (InstanceSet::InnerIterator it(instances, example); it; ++it) {
+    example_gradient->at(i++) =
+        std::make_pair(it.index(), inner_product_gradient * it.value());
+  }
+}
+
+// Assigns labels to 'instances' given 'weights'.
+void InnerProductLossFunction::PredictLabels(
+    const Weights &weights, const InstanceSet &instances,
+    LabelSet *labels) const {
+  // Compute inner products.
+  VectorXf inner_products = instances * weights;
+
+  // Assign labels by calling InnerProductAssignLabel coefficientwise on
+  // 'inner_products'.
+  std::function<float(float)> assign_label_ptr =
+      std::bind(&InnerProductLossFunction::InnerProductPredictLabel,
+                this, std::placeholders::_1);
+  *labels = inner_products.unaryExpr(assign_label_ptr);
+}
+
+}  // namespace lossmin
diff --git a/losses/inner-product-loss-function.h b/losses/inner-product-loss-function.h
new file mode 100644
index 0000000..a5c8d22
--- /dev/null
+++ b/losses/inner-product-loss-function.h
@@ -0,0 +1,276 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// In InnerProductLossFunction, the loss of a labeled example
+// (instance_i, label_i) is only a function of the inner product
+// <weights, instance_i> and label_i. label_i is a scalar (single element
+// VectorXf).
+//
+// Derived classes need to provide InnerProductExampleLoss,
+// InnerProductExampleGradient, and InnerProductPredictLabel, and set the
+// 'curvature_' parameter.
+//
+// Current implementations include logistic-regression, averaged-logistic,
+// linear-regression, poisson-regression, smooth-hinge.
+
+#pragma once
+
+#include <float.h>
+#include <math.h>
+#include <functional>
+#include <mutex>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/losses/loss-function.h"
+
+namespace lossmin {
+
+class InnerProductLossFunction : public LossFunction {
+ public:
+  // Returns the loss for a single example.
+  float ExampleLoss(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, int example) const override;
+
+  // Adds the gradient of a single example to 'gradient'.
+  void AddExampleGradient(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, int example, float weights_scale,
+      float example_scale, Weights *gradient) const override;
+
+  // Returns the gradient of a single example.
+  void ExampleGradient(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, int example, float weights_scale,
+      float example_scale,
+      std::vector<std::pair<int, float>> *example_gradient) const override;
+
+  // Assigns labels to 'instances' given 'weights'.
+  void PredictLabels(const Weights &weights, const InstanceSet &instances,
+                     LabelSet *labels) const override;
+
+  // Returns an upper bound on the loss curvature.
+  float LossCurvature(const InstanceSet &instances) const override;
+
+  // Returns an upper bound on the per-coordinate curvature.
+  void PerCoordinateCurvature(
+      const InstanceSet &instances,
+      VectorXf *per_coordinate_curvature) const override;
+
+  virtual float InnerProductExampleLoss(float inner_product, float label)
+      const = 0;
+
+  virtual float InnerProductExampleGradient(float inner_product, float label)
+      const = 0;
+
+  virtual float InnerProductPredictLabel(float inner_product) const = 0;
+
+  // Returns 'curvature_'.
+  virtual float InnerProductCurvature(float inner_product, float label) const {
+    return curvature_;
+  }
+
+ protected:
+  // Sets the upper bound on the curvature of the loss function.
+  void set_curvature(float curvature) { curvature_ = curvature; }
+
+ private:
+  // Mutex for synchronous updates of the gradient vector.
+  mutable std::mutex gradient_update_mutex_;
+
+  // Upper bound on the absolute value of the second derivative of the loss:
+  // |d^2 loss(x) / dx^2| <= curvature_, where 'x' is the inner product
+  //  <instance, weights>. Should be set by derived classes.
+  float curvature_;
+};
+
+// Binary logistic regression with labels in {-1, 1}.
+class LogisticRegressionLossFunction : public InnerProductLossFunction {
+ public:
+  LogisticRegressionLossFunction() { set_curvature(0.25); }
+
+  // Returns the logistic loss.
+  float InnerProductExampleLoss(float inner_product, float label)
+      const override {
+    return log1pf(exp(-label * inner_product));
+  }
+
+  // Returns the gradient of the logistic loss wrt 'inner_product'.
+  float InnerProductExampleGradient(float inner_product, float label)
+      const override {
+    if (label * inner_product < kInputMin) return -label;
+    return -label / (1.0f + exp(label * inner_product));
+  }
+
+  // Assigns a label in {-1, 1} given 'inner_product'.
+  float InnerProductPredictLabel(float inner_product) const override {
+    if (inner_product > 0) return 1.0f;
+    return -1.0f;
+  }
+
+  // Returns the magnitude of the second derivative of the loss, evaluated at
+  // 'inner_product' and 'label'.
+  float InnerProductCurvature(float inner_product, float label)
+      const override {
+    if (-label * inner_product < kInputMin) return 0.0f;
+    float label_prob = 1.0f / (1.0f + exp(-label * inner_product));
+    return label_prob * (1.0f - label_prob);
+  }
+
+  // Min allowed exp input.
+  const float kInputMin = log(FLT_EPSILON);
+};
+
+// Linear regression with squared error loss.
+class LinearRegressionLossFunction : public InnerProductLossFunction {
+ public:
+  LinearRegressionLossFunction() { set_curvature(1.0); }
+
+  // Returns the squared error loss.
+  float InnerProductExampleLoss(float inner_product, float label)
+      const override {
+    return 0.5 * (inner_product - label) * (inner_product - label);
+  }
+
+  // Returns the gradient of the squared error loss wrt 'inner_product'.
+  float InnerProductExampleGradient(float inner_product, float label)
+      const override {
+    return inner_product - label;
+  }
+
+  // Assigns a label given 'inner_product'.
+  float InnerProductPredictLabel(float inner_product) const override {
+    return inner_product;
+  }
+};
+
+// Poisson regression, where the log of the distribution mean is the inner
+// product between an instance and the parameters.
+class PoissonRegressionLossFunction : public InnerProductLossFunction {
+ public:
+  PoissonRegressionLossFunction() {}
+
+  // Returns the Poisson regression loss.
+  float InnerProductExampleLoss(float inner_product, float label)
+      const override {
+    return exp(inner_product) - label * inner_product;
+  }
+
+  // Returns the gradient of the Poisson regression loss wrt 'inner_product'.
+  float InnerProductExampleGradient(float inner_product, float label)
+      const override {
+    return exp(inner_product) - label;
+  }
+
+  // Returns the mode of a Poisson distribution with mean exp('inner_product').
+  float InnerProductPredictLabel(float inner_product) const override {
+    return floor(exp(inner_product));
+  }
+
+  // Returns the magnitude of the second derivative of the loss, evaluated at
+  // 'inner_product' and 'label'.
+  float InnerProductCurvature(float inner_product, float label)
+      const override {
+    return exp(inner_product);
+  }
+};
+
+// Binary logistic regression where 'label' = P(true_label = 1) is in [0, 1],
+// and the loss is averaged over labels:
+// label * logloss(inner_product, 1) + (1 - label) * logloss(inner_product, 0).
+class AveragedLogisticLossFunction : public InnerProductLossFunction {
+ public:
+  AveragedLogisticLossFunction() { set_curvature(0.25); }
+
+  // Returns the averaged logistic loss.
+  float InnerProductExampleLoss(float inner_product, float label)
+      const override {
+    return (1.0 - label) * log1pf(exp(inner_product))
+        + label * log1pf(exp(-inner_product));
+  }
+
+  // Returns the gradient of the averaged logistic loss wrt 'inner_product'.
+  float InnerProductExampleGradient(float inner_product, float label)
+      const override {
+    if (-inner_product < kInputMin) return -label + 1.0f;
+    return -label + 1.0f / (1.0f + exp(-inner_product));
+  }
+
+  // Returns the probability that the label is 1 given the inner product.
+  float InnerProductPredictLabel(float inner_product) const override {
+    if (-inner_product < kInputMin) return 1.0f;
+    return 1.0f / (1.0f + exp(-inner_product));
+  }
+
+  // Returns the magnitude of the second derivative of the loss, evaluated at
+  // 'inner_product' and 'label'.
+  float InnerProductCurvature(float inner_product, float label)
+      const override {
+    if (-inner_product < kInputMin) return 0.0f;
+    float label_prob = 1.0f / (1.0f + exp(-inner_product));
+    return label_prob * (1.0f - label_prob);
+  }
+
+  // Min allowed exp input.
+  const float kInputMin = log(FLT_EPSILON);
+};
+
+
+// Smooth hinge loss. If a = inner_product * label, the loss is
+// loss = 0.0                                   a > threshold
+//        0.5 * (a - threshold)^2               0 < a <= threshold
+//        0.5 * threshold^2 - threshold * a     a <= 0
+class SmoothHingeLossFunction : public InnerProductLossFunction {
+ public:
+  SmoothHingeLossFunction() { set_curvature(1.0f); }
+
+  /*
+  void Setup(const LossFunctionConfigContainer &config) override {
+    if (config.has_smooth_hinge_threshold()) {
+      threshold_ = config.smooth_hinge_threshold();
+    }
+    set_curvature(threshold_);
+  }
+  */
+
+  float InnerProductExampleLoss(float inner_product, float label)
+      const override {
+    float hinge_input = inner_product * label;
+    if (hinge_input >= threshold_) {
+      return 0.0;
+    } else if (hinge_input <= 0.0f) {
+      return (0.5 * threshold_ * threshold_ - threshold_ * hinge_input);
+    } else {
+      return 0.5 * (hinge_input - threshold_) * (hinge_input - threshold_);
+    }
+  }
+
+  float InnerProductExampleGradient(float inner_product, float label)
+      const override {
+    float hinge_input = inner_product * label;
+    if (hinge_input >= threshold_) {
+      return 0.0f;
+    } else if (hinge_input <= 0.0f) {
+      return -label * threshold_;
+    } else {
+      return (hinge_input - threshold_) * label;
+    }
+  }
+
+  // Assigns a label in {-1, 1} given 'inner_product'.
+  float InnerProductPredictLabel(float inner_product) const override {
+    if (inner_product > 0) return 1.0f;
+    return -1.0f;
+  }
+
+  void set_threshold(float threshold) { threshold_ = threshold; }
+
+ private:
+  float threshold_ = 1.0f;
+};
+
+
+}  // namespace lossmin
+
diff --git a/losses/loss-function.cc b/losses/loss-function.cc
new file mode 100644
index 0000000..ca11228
--- /dev/null
+++ b/losses/loss-function.cc
@@ -0,0 +1,30 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/losses/loss-function.h"
+#include <float.h>
+#include <math.h>
+
+namespace lossmin {
+
+float LossFunction::BatchLoss(
+    const Weights &weights, const InstanceSet &instances,
+    const LabelSet &labels) const {
+  float loss = 0.0f;
+  for (int i = 0; i < instances.rows(); ++i) {
+    loss += ExampleLoss(weights, instances, labels, i);
+  }
+  return loss;
+}
+
+void LossFunction::BatchGradient(
+    const Weights &weights, const InstanceSet &instances,
+    const LabelSet &labels, Weights *gradient) const {
+  for (int i = 0; i < instances.rows(); ++i) {
+    AddExampleGradient(weights, instances, labels, i, 1.0f, 1.0f, gradient);
+  }
+}
+
+}  // namespace lossmin
diff --git a/losses/loss-function.h b/losses/loss-function.h
new file mode 100644
index 0000000..b02aff0
--- /dev/null
+++ b/losses/loss-function.h
@@ -0,0 +1,112 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// Interface for computing the value and gradient of a loss function given
+// parameters 'weights' and labeled examples {(instance_i, label_i)}
+
+#pragma once
+
+#include <math.h>
+
+#include "lossmin/eigen-types.h"
+
+namespace lossmin {
+
+// Abstract interface for computing the value and gradient of a loss function.
+// Derived classes must provide the following methods: LossCurvature,
+// PerCoordinateCurvature, ExampleLoss, AddExampleGradient, and
+// PredictLabels.
+// If multiple threads concurrently call the AddExampleGradient() method of the
+// same LossFunction object to update the same 'gradient' vector, changes to
+// 'gradient' need to be protected by a mutex in the implementation. If the
+// AddExampleGradient() method of the same LossFunction object is used to
+// update different 'gradient' vectors, locking would only add unnecessary
+// synchronization costs; the 'synchronous_update_' flag indicates whether to
+// synchronize.
+// If multiple threads concurrently call the AddExampleGradient() method of
+// different LossFunction objects to update the same 'gradient' vector, this
+// may result in race conditions. In this case, changes to 'gradient' should be
+// protected by the caller.
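+//
+// For example, a minimizer whose worker threads all accumulate into one shared
+// 'gradient' vector would call set_synchronous_update(true) on the loss
+// function before invoking AddExampleGradient() concurrently.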
+class LossFunction {
+ public:
+  LossFunction() {}
+  virtual ~LossFunction() {}
+
+  // Returns the loss for a single example (row 'example' of 'instances' and
+  // 'labels').
+  virtual float ExampleLoss(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, int example) const = 0;
+
+  // Computes the gradient of the loss for a single example (row 'example' of
+  // 'instances' and labels) with respect to 'weights_scale * weights'. Adds
+  // 'example_scale * example_gradient' to 'gradient'.
+  // If multiple threads call the AddExampleGradient method of the same object
+  // to update the same 'gradient' vector, updates to 'gradient' need to be
+  // protected by a mutex in the implementation.
+  virtual void AddExampleGradient(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, int example, float weights_scale,
+      float example_scale, Weights *gradient) const = 0;
+
+  // Returns the gradient of the loss for a single example. Used in AdaGrad.
+  virtual void ExampleGradient(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, int example, float weights_scale,
+      float example_scale,
+      std::vector<std::pair<int, float>> *example_gradient) const = 0;
+
+  // Predicts 'labels' for 'instances' given 'weights'.
+  virtual void PredictLabels(
+      const Weights &weights, const InstanceSet &instances,
+      LabelSet *labels) const = 0;
+
+  // Returns an upper bound on the loss curvature (max eigenvalue of the loss
+  // Hessian matrix). Required by DeterministicGradientDescent. Optionally
+  // required by StochasticVarianceReducedGradient (for default learning rate)
+  // and StochasticGradientDescent for CURVATURE_BASED learning rate scheduling.
+  virtual float LossCurvature(const InstanceSet &instances) const = 0;
+
+  // Returns an upper bound on the curvature of the loss along each coordinate
+  // (max absolute value of the second derivative) of the data. Required by
+  // ParallelBoostingWithMomentum.
+  virtual void PerCoordinateCurvature(
+      const InstanceSet &instances,
+      VectorXf *per_coordinate_curvature) const = 0;
+
+  // Initializes parameters to a suggested setting for this loss if appropriate.
+  virtual void Init(Weights *weights) const {}
+
+  // Returns the number of model parameters for the given number of features.
+  // Needs to be overridden when #parameters != #features.
+  virtual int NumWeights(int num_features) const { return num_features; }
+
+  // Returns the total loss for a set of examples. Default implementation runs
+  // through the examples and calls ExampleLoss on each.
+  virtual float BatchLoss(const Weights &weights, const InstanceSet &instances,
+                          const LabelSet &labels) const;
+
+  // Returns the total gradient for a set of examples. Default implementation
+  // runs through the examples and calls AddExampleGradient on each. Should
+  // always be called in a single-threaded context.
+  virtual void BatchGradient(
+      const Weights &weights, const InstanceSet &instances,
+      const LabelSet &labels, Weights *gradient) const;
+
+  // Sets the 'synchronous_update_' flag.
+  void set_synchronous_update(bool is_sync) { synchronous_update_ = is_sync; }
+
+  // Returns the 'synchronous_update_' flag.
+  bool synchronous_update() const { return synchronous_update_; }
+
+ private:
+  // Flag indicating whether to use synchronous or asynchronous updates in
+  // AddExampleGradient().
+  bool synchronous_update_ = false;
+
+  //DISALLOW_COPY_AND_ASSIGN(LossFunction);
+};
+
+
+}  // namespace lossmin
diff --git a/lossmin.py b/lossmin.py
new file mode 100755
index 0000000..670db7c
--- /dev/null
+++ b/lossmin.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import os
+import subprocess
+import sys
+
+THIS_DIR = os.path.abspath(os.path.dirname(__file__))
+OUT_DIR = os.path.abspath(os.path.join(THIS_DIR, 'out'))
+
+def ensureDir(dir_path):
+  """Ensures that the directory at |dir_path| exists. If not it is created.
+
+  Args:
+    dir_path {string}: The path to a directory. If it does not exist it will
+      be created.
+  """
+  if not os.path.exists(dir_path):
+    os.makedirs(dir_path)
+
+def main():
+  ensureDir(OUT_DIR)
+  savedir = os.getcwd()
+  os.chdir(OUT_DIR)
+  # TODO(azani): Switch to ninja.
+  subprocess.check_call(['cmake', '..']) #'-G', 'Ninja','..'])
+  subprocess.check_call(['make'])
+  subprocess.check_call(['./sample'])
+  os.chdir(savedir)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/make.sh b/make.sh
new file mode 100644
index 0000000..66920c4
--- /dev/null
+++ b/make.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -x
+g++ sample.cc \
+  losses/loss-function.cc \
+  losses/inner-product-loss-function.cc \
+  minimizers/gradient-evaluator.cc \
+  minimizers/loss-minimizer.cc \
+  minimizers/parallel-boosting-with-momentum.cc \
+  -std=c++11  -I ..
diff --git a/minimizers/CMakeLists.txt b/minimizers/CMakeLists.txt
new file mode 100644
index 0000000..41b5871
--- /dev/null
+++ b/minimizers/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+add_library(minimizers
+            gradient-evaluator.cc
+            loss-minimizer.cc
+            parallel-boosting-with-momentum.cc)
diff --git a/minimizers/gradient-evaluator.cc b/minimizers/gradient-evaluator.cc
new file mode 100644
index 0000000..270d334
--- /dev/null
+++ b/minimizers/gradient-evaluator.cc
@@ -0,0 +1,84 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/minimizers/gradient-evaluator.h"
+
+#include <functional>
+
+namespace lossmin {
+
+float GradientEvaluator::Loss(const Weights &weights) const {
+  // TODO(azani): Implement multi-threaded version.
+  return loss_function_->BatchLoss(weights, instances_, labels_) /
+      NumExamples();
+}
+
+float GradientEvaluator::Loss(
+    const Weights &weights, const InstanceSet &validation_instances,
+    const LabelSet &validation_labels) const {
+  return loss_function_->BatchLoss(
+      weights, validation_instances, validation_labels) /
+      validation_labels.rows();
+}
+
+float GradientEvaluator::Loss(
+    const Weights &weights, const std::vector<int> &example_indices) const {
+  float loss = 0.0f;
+  for (int example : example_indices) {
+    loss += loss_function_->ExampleLoss(weights, instances_, labels_, example);
+  }
+  return loss / example_indices.size();
+}
+
+void GradientEvaluator::Gradient(const Weights &weights,
+                                 Weights *gradient) const {
+  //DCHECK(gradient != nullptr);
+  // TODO(azani): Implement multi-threaded version.
+  loss_function_->BatchGradient(weights, instances_, labels_, gradient);
+  *gradient /= NumExamples();
+}
+
+void GradientEvaluator::set_num_threads(int num_threads,
+                                        int batch_size) {
+  num_threads_ = num_threads;
+  if (num_threads_ == 1) return;
+
+  num_batches_ = (NumExamples() + batch_size - 1) / batch_size;
+
+  // Assign examples to each batch.
+  batch_examples_.assign(num_batches_, {});
+  for (auto &batch : batch_examples_) batch.reserve(batch_size);
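+  // Examples are assigned round-robin, so batch sizes differ by at most one.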
+  for (int i = 0; i < NumExamples(); ++i) {
+    batch_examples_[i % num_batches_].push_back(i);
+  }
+}
+
+/*
+void GradientEvaluator::ThreadLoss(
+    const Weights *weights, int example, VectorXf *loss_values,
+    BlockingCounter *num_jobs_remaining) const {
+  // Compute example loss.
+  loss_values->coeffRef(example) =
+      loss_function_->ExampleLoss(*weights, instances_, labels_, example);
+
+  // Decrement thread counter.
+  num_jobs_remaining->DecrementCount();
+}
+
+void GradientEvaluator::ThreadGradient(
+    const Weights *weights, int batch, Weights *gradient,
+    BlockingCounter *num_jobs_remaining) const {
+  // Add the example gradient.
+  for (int example : batch_examples_[batch]) {
+    loss_function_->AddExampleGradient(
+        *weights, instances_, labels_, example, 1.0f, 1.0f, gradient);
+  }
+
+  // Decrement thread counter.
+  num_jobs_remaining->DecrementCount();
+}
+*/
+
+}  // namespace lossmin
diff --git a/minimizers/gradient-evaluator.h b/minimizers/gradient-evaluator.h
new file mode 100644
index 0000000..e63b842
--- /dev/null
+++ b/minimizers/gradient-evaluator.h
@@ -0,0 +1,185 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// GradientEvaluator is a class for computing the value and gradient of a
+// LossFunction on a labeled dataset {(instance_i, label_i)}, given parameters
+// 'weights'. Its methods are called by gradient descent algorithms implementing
+// the LossMinimizer interface.
+
+#pragma once
+
+#include <algorithm>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/losses/loss-function.h"
+
+class BlockingCounter;
+
+namespace lossmin {
+
+class GradientEvaluator {
+ public:
+  // Constructor sets up the dataset and the loss function.
+  GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
+                    const LossFunction *loss_function)
+      : instances_(instances), labels_(labels),
+        loss_function_(loss_function) {}
+
+  virtual ~GradientEvaluator() {}
+
+  // Returns the loss for given parameters 'weights'. Multi-threading is used
+  // if num_threads_ > 1.
+  virtual float Loss(const Weights &weights) const;
+
+  // Returns the loss for given parameters 'weights' and a subset of examples
+  // 'example_indices'.
+  virtual float Loss(const Weights &weights,
+                     const std::vector<int> &example_indices) const;
+
+  // Returns the loss for given parameters 'weights' and a different
+  // dataset (typically used for validation).
+  virtual float Loss(
+      const Weights &weights, const InstanceSet &validation_instances,
+      const LabelSet &validation_labels) const;
+
+  // Computes the gradient wrt the given parameters 'weights'. 'gradient' is
+  // owned by the caller and should be initialized to zero.
+  // Multithreading is used if 'num_threads' > 1. The training examples are
+  // divided into 'num_batches' batches; each thread computes the gradient of a
+  // batch, adds it to 'gradient', and takes the next batch. These updates are
+  // asynchronous, and behaviour is non-deterministic.
+  virtual void Gradient(const Weights &weights, Weights *gradient) const;
+
+  // Adds the gradient wrt 'weight_scale * weights' for 'example' to the vector
+  // 'gradient' in place. The gradient is scaled by 'example_scale'.
+  virtual void AddExampleGradient(
+      const Weights &weights, int example, float weights_scale,
+      float example_scale, Weights *gradient) const {
+    loss_function_->AddExampleGradient(
+        weights, instances_, labels_, example, weights_scale, example_scale,
+        gradient);
+  }
+
+  // Returns the gradient wrt 'weights' as a vector<pair<int, float>> rather
+  // than Eigen::SparseVector<float>, since Eigen is very inefficient with
+  // sparse vectors. This is only necessary if running SGDAdaGrad.
+  virtual void ExampleGradient(
+      const Weights &weights, int example, float weights_scale,
+      float example_scale,
+      std::vector<std::pair<int, float>> *example_gradient) const {
+    loss_function_->ExampleGradient(
+        weights, instances_, labels_, example, weights_scale, example_scale,
+        example_gradient);
+  }
+
+  // Sets the number of threads and batch size for multithreaded computation.
+  // Assigns examples to 'batch_examples_'.
+  void set_num_threads(int num_threads, int batch_size);
+
+  // Convenience method that sets the number of batches to a default.
+  void set_num_threads(int num_threads) {
+    int batch_size = static_cast<int>(NumExamples() / num_threads / 2);
+    set_num_threads(num_threads, batch_size);
+  }
+
+  // Returns the number of examples in the dataset.
+  virtual int NumExamples() const { return instances_.rows(); }
+
+  // Returns the number of features.
+  virtual int NumFeatures() const { return instances_.cols(); }
+
+  // Returns the number of weights for the given number of features.
+  virtual int NumWeights() const {
+    return loss_function_->NumWeights(NumFeatures());
+  }
+
+  // Returns the number of batches for multithreaded computation.
+  int num_batches() const { return num_batches_; }
+
+  // Returns the number of threads.
+  int num_threads() const { return num_threads_; }
+
+  // Returns an upper bound on the curvature of the loss function. Used to set
+  // the learning rate of some LossMinimizer algorithms.
+  virtual float LossCurvature() const {
+    return loss_function_->LossCurvature(instances_);
+  }
+
+  // Returns the per-coordinate curvature of the data. Used to set the learning
+  // rates of ParallelBoostingWithMomentum.
+  virtual void PerCoordinateCurvature(
+      VectorXf *per_coordinate_curvature) const {
+    loss_function_->PerCoordinateCurvature(instances_,
+                                           per_coordinate_curvature);
+  }
+
+  // Returns sparsity, defined as the maximum instance l0 norm. Used to help
+  // set learning rates in ParallelBoostingWithMomentum.
+  float Sparsity() const {
+    typename Instance::Index sparsity = 0;
+    for (int i = 0; i < instances_.rows(); ++i) {
+      sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
+    }
+    return static_cast<float>(sparsity);
+  }
+
+  // Returns the loss function.
+  const LossFunction *loss_function() const { return loss_function_; }
+
+  // Returns the instances.
+  const InstanceSet &instances() const { return instances_; }
+
+  // Returns the labels.
+  const LabelSet &labels() const { return labels_; }
+
+  // Returns a list of examples in batch 'batch'.
+  const std::vector<int> &batch_examples(int batch) const {
+    return batch_examples_[batch];
+  }
+
+ protected:
+  /*
+  // Computes the loss for a single example 'example' and returns it in
+  // 'loss_values[example_index]'.
+  virtual void ThreadLoss(
+      const Weights *weights, int example, VectorXf *loss_values,
+      BlockingCounter *num_jobs_remaining) const;
+
+  // Computes the gradient for examples in batch 'batch', adds it to 'gradient'.
+  virtual void ThreadGradient(
+      const Weights *weights, int batch, Weights *gradient,
+      BlockingCounter *num_jobs_remaining) const;
+  */
+
+ private:
+  // Training instances.
+  const InstanceSet &instances_;
+
+  // Instance labels.
+  const LabelSet &labels_;
+
+  // Function for computing the loss and gradient of a single training example.
+  // Not owned.
+  const LossFunction *loss_function_;
+
+  // Mutex for multithreaded gradient computation.
+  mutable std::mutex gradient_update_mutex_;
+
+  // Number of threads used to parallelize gradient computation.
+  // If num_threads_ = 1, multithreading is not used.
+  int num_threads_ = 1;
+
+  // List of examples assigned to each thread.
+  std::vector<std::vector<int>> batch_examples_;
+
+  // Number of batches.
+  int num_batches_ = 1;
+};
+
+}  // namespace lossmin
+
diff --git a/minimizers/loss-minimizer.cc b/minimizers/loss-minimizer.cc
new file mode 100644
index 0000000..d6610f8
--- /dev/null
+++ b/minimizers/loss-minimizer.cc
@@ -0,0 +1,161 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/minimizers/loss-minimizer.h"
+
+#include <random>
+#include <algorithm>
+
+namespace lossmin {
+
+bool LossMinimizer::Run(
+    int max_epochs, int loss_epochs, int convergence_epochs,
+    Weights *weights, std::vector<float> *loss) {
+  // Run for up to 'max_epochs' epochs.
+  for (int epoch = 0; epoch < max_epochs; ++epoch) {
+    // Compute the loss.
+    if (epoch % loss_epochs == 0) {
+      loss->push_back(Loss(*weights));
+      //LOG(INFO) << epoch << ": " << loss->at(loss->size() - 1);
+    }
+
+    // Set the 'check_convergence' flag.
+    bool check_convergence = (epoch > 0) && (epoch % convergence_epochs == 0);
+
+    // Run for one epoch to update the parameters.
+    EpochUpdate(weights, epoch, check_convergence);
+
+    // Optionally do a simple convergence check.
+    if (check_convergence && use_simple_convergence_check_) {
+      SimpleConvergenceCheck(*loss);
+    }
+
+    // Check the convergence flag.
+    if (converged_) break;
+  }
+
+  loss->push_back(Loss(*weights));
+  //LOG(INFO) << "final loss: " << loss->at(loss->size() - 1);
+  return converged_;
+}
+
+bool LossMinimizer::Run(
+    int max_epochs, int loss_epochs, int convergence_epochs,
+    const InstanceSet &validation_instances,
+    const LabelSet &validation_labels, Weights *weights,
+    std::vector<float> *training_loss, std::vector<float> *validation_loss) {
+  // Run for up to 'max_epochs' epochs.
+  for (int epoch = 0; epoch < max_epochs; ++epoch) {
+    // Compute the loss.
+    if (epoch % loss_epochs == 0) {
+      training_loss->push_back(Loss(*weights));
+      validation_loss->push_back(
+          gradient_evaluator_.Loss(*weights, validation_instances,
+                                    validation_labels));
+      //LOG(INFO) << epoch << ": " << training_loss->at(training_loss->size() - 1)
+                //<< "  " << validation_loss->at(validation_loss->size() - 1);
+    }
+
+    // Set the 'check_convergence' flag.
+    bool check_convergence = (epoch > 0) && (epoch % convergence_epochs == 0);
+
+    // Run for one epoch to update the parameters.
+    EpochUpdate(weights, epoch, check_convergence);
+
+    // Optionally do a simple convergence check.
+    if (check_convergence && use_simple_convergence_check_) {
+      SimpleConvergenceCheck(*training_loss);
+    }
+
+    // Check the convergence flag.
+    if (converged_) break;
+  }
+
+  // Compute final loss.
+  training_loss->push_back(Loss(*weights));
+  validation_loss->push_back(
+      gradient_evaluator_.Loss(*weights, validation_instances,
+                               validation_labels));
+  //LOG(INFO) << "final loss: "
+            //<< training_loss->at(training_loss->size() - 1);
+  //LOG(INFO) << "final validation loss: "
+            //<< validation_loss->at(validation_loss->size() - 1);
+  return converged_;
+}
+
+float LossMinimizer::BestLearningRate() {
+  // Sample a subset of the data.
+  std::vector<int> examples(num_search_examples_);
+  std::mt19937 rnd;
+  std::uniform_int_distribution<> dis(0, gradient_evaluator_.NumExamples() - 1);
+  for (int i = 0; i < num_search_examples_; ++i) {
+    examples[i] = dis(rnd);
+  }
+
+  float min_loss_rate = initial_rates_[0];
+  float min_loss = HUGE_VAL;
+  for (int i = 0; i < initial_rates_.size(); ++i) {
+    // Initialize parameters to zero.
+    Weights weights = Weights::Zero(gradient_evaluator_.NumWeights());
+
+    // Make updates with the current learning rate.
+    Weights gradient(weights.size());
+    for (int example : examples) {
+      gradient = l2() * weights;
+      gradient_evaluator_.AddExampleGradient(weights, example, 1.0f, 1.0f,
+                                             &gradient);
+      weights -= initial_rates_[i] * gradient;
+    }
+
+    // Compute the corresponding loss.
+    float loss = gradient_evaluator_.Loss(weights, examples);
+    if (i == 0) {
+      min_loss = loss;
+    } else if (loss < min_loss) {
+      min_loss = loss;
+      min_loss_rate = initial_rates_[i];
+    }
+  }
+
+  // Return the learning rate corresponding to the smallest loss.
+  //LOG(INFO) << "best initial learning rate: " << min_loss_rate;
+  return min_loss_rate;
+}
+
+void LossMinimizer::ConvergenceCheck(
+    const Weights &weights, const Weights &gradient) {
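+  // At zero-valued weights, optimality of the l1-regularized objective only
+  // requires |gradient| <= l1, so those entries are replaced by |gradient| - l1
+  // before measuring the gradient norm.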
+  Weights gradient_for_convergence =
+      (weights.array() == 0.0f).select(abs(gradient.array()) - l1_, gradient);
+  if (gradient_for_convergence.norm() / weights.size() <
+      convergence_threshold_) {
+    converged_ = true;
+  }
+}
+
+void LossMinimizer::SimpleConvergenceCheck(const std::vector<float> &loss) {
+  // Check convergence by verifying that the max relative loss decrease
+  // (loss[t-1] - loss[t]) / loss[t-1] is below 'convergence_threshold_'.
+  if (loss.size() > num_convergence_epochs_) {
+    float loss_difference = 0.0f;
+    for (int i = loss.size() - num_convergence_epochs_; i < loss.size(); ++i) {
+      loss_difference = std::max(loss_difference, 1 - loss[i] / loss[i - 1]);
+    }
+    if (loss_difference < convergence_threshold_) set_converged(true);
+  }
+}
+
+void LossMinimizer::set_batch_size(int batch_size) {
+  int num_batches =
+      (gradient_evaluator_.NumExamples() + batch_size - 1) / batch_size;
+
+  // Assign examples to each batch.
+  batch_examples_.assign(num_batches, {});
+  for (auto &batch : batch_examples_) batch.reserve(batch_size);
+  for (int i = 0; i < gradient_evaluator().NumExamples(); ++i) {
+    batch_examples_[i % num_batches].push_back(i);
+  }
+}
+
+}  // namespace lossmin
diff --git a/minimizers/loss-minimizer.h b/minimizers/loss-minimizer.h
new file mode 100644
index 0000000..066aba9
--- /dev/null
+++ b/minimizers/loss-minimizer.h
@@ -0,0 +1,288 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// Interface for gradient descent algorithms. Example usage:
+//
+// 1. Create a GradientEvaluator object that computes the value and gradient of
+//    a loss function on a given dataset.
+//
+//    InstanceSet instances = ...
+//    LabelSet labels = ...
+//    LossFunction *loss_function = LossFunction::Create("logistic-regression");
+//    GradientEvaluator gradient_evaluator(instances, labels, loss_function);
+//
+// 2. Create a LossMinimizer object from 'gradient_evaluator' and l1 and l2
+//    regularization parameters. StochasticGradientDescent requires an
+//    additional parameter for learning rate scheduling.
+//
+//    float l1 = ...
+//    float l2 = ...
+//    DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
+//
+// 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
+//    loss values across epochs, and 'weights' contains the best parameters.
+//
+//    Weights weights = Weights::Zero(num_features);  // or other initialization
+//    vector<float> loss;
+//    int max_epochs = 100;
+//    bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
+//
+// Note: algorithm-specific parameters such as learning rates are set based on
+// the data and the loss function. If the dataset or the loss pointed to by
+// 'gradient_evaluator' changes, the user must call the loss_minimizer.Setup()
+// method before loss_minimizer.Run().
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <functional>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/minimizers/gradient-evaluator.h"
+#include "lossmin/losses/loss-function.h"
+#include "third_party/eigen/Eigen/Core"
+
+namespace lossmin {
+
+// Interface for gradient descent algorithms that minimize a loss function over
+// a labeled dataset. A GradientEvaluator object provides the loss value and
+// gradient for given parameters at each epoch. Derived classes should be
+// thread-compatible, and must provide the methods EpochUpdate() and Setup().
+class LossMinimizer {
+ public:
+  // Constructor sets the l1 and l2 regularization parameters and
+  // 'gradient_evaluator_'.
+  LossMinimizer(
+      float l1, float l2, const GradientEvaluator &gradient_evaluator)
+      : l1_(l1), l2_(l2), gradient_evaluator_(gradient_evaluator),
+        converged_(false) {
+    std::random_device rd;
+    rnd_.seed(rd());
+  }
+  virtual ~LossMinimizer() {}
+
+  // Initializes algorithm-specific parameters such as learning rates, based
+  // on the loss function and the dataset. Called in the constructor of the
+  // derived classes. If the dataset that 'gradient_evaluator' points to
+  // changes, Setup() must be called again by the user.
+  virtual void Setup() = 0;
+
+  // Runs minimization until convergence, or up to 'max_epochs' epochs. Returns
+  // true if the algorithm has converged. Parameters 'weights' should be
+  // initialized by the user, and will contain the parameters produced at the
+  // last epoch. 'loss' is filled with training loss values produced every
+  // 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
+  bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
+           Weights *weights, std::vector<float> *loss);
+
+  // Runs minimization, evaluating loss on both training and validation data.
+  bool Run(
+      int max_epochs, int loss_epochs, int convergence_epochs,
+      const InstanceSet &validation_instances,
+      const LabelSet &validation_labels, Weights *weights,
+      std::vector<float> *training_loss, std::vector<float> *validation_loss);
+
+  // Convenience Run method that evaluates the loss and checks for convergence
+  // at every iteration.
+  bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
+    return Run(max_epochs, 1, 1, weights, loss);
+  }
+
+  // Abstract method that updates the weights in the current iteration.
+  virtual void EpochUpdate(Weights *weights, int epoch,
+                           bool check_convergence) = 0;
+
+  // Returns the total loss for given parameters 'weights', including l1 and l2
+  // regularization.
+  float Loss(const Weights &weights) const {
+    float loss = gradient_evaluator_.Loss(weights);
+    if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
+    if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
+    return loss;
+  }
+
+  // Checks convergence of deterministic methods based on two criteria:
+  // (1) the l2 norm of the gradient, divided by the number of weights, is
+  //     below 'convergence_threshold_';
+  // (2) for zero-valued weights, the corresponding gradient entries are
+  //     replaced by |gradient| - l1 before computing the norm.
+  // If converged, the flag 'converged_' is set to true.
+  void ConvergenceCheck(const Weights &weights, const Weights &gradient);
+
+  // Checks convergence based on the decrease in loss over the last
+  // 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
+  // set to true.
+  void SimpleConvergenceCheck(const std::vector<float> &loss);
+
+  // Setters and getters for convergence criteria parameters.
+  bool converged() const { return converged_; }
+  void set_converged(bool converged) { converged_ = converged; }
+  void set_use_simple_convergence_check(bool use_simple_convergence_check) {
+    use_simple_convergence_check_ = use_simple_convergence_check;
+  }
+  void set_convergence_threshold(float convergence_threshold) {
+    convergence_threshold_ = convergence_threshold;
+  }
+  void set_num_convergence_epochs(int num_convergence_epochs) {
+    num_convergence_epochs_ = num_convergence_epochs;
+  }
+
+  // Returns the best initial learning rate for stochastic methods by evaluating
+  // elements of 'initial_rates_' on a subset of 'num_search_examples_' training
+  // examples.
+  float BestLearningRate();
+
+  // Methods that set parameters related to the initial learning rate in
+  // stochastic methods.
+  virtual void set_initial_learning_rate(float initial_learning_rate) {
+    initial_learning_rate_ = initial_learning_rate;
+    //LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_;
+  }
+  float initial_learning_rate() const { return initial_learning_rate_; }
+  void set_initial_rates(const std::vector<float> &initial_rates) {
+    initial_rates_ = initial_rates;
+  }
+  void set_num_search_examples(int num_search_examples) {
+    num_search_examples_ = num_search_examples;
+  }
+
+  // Sets the seed of the random number generator.
+  void SetSeed(int32_t seed) { rnd_.seed(seed); }
+
+  // Number of threads for stochastic methods.
+  void set_num_threads(int num_threads) { num_threads_ = num_threads; }
+  int num_threads() const { return num_threads_; }
+
+  // Minibatch size for stochastic methods.
+  void set_batch_size(int batch_size);
+  int NumBatches() const { return batch_examples_.size(); }
+
+  // Shuffles the batches with examples.
+  void ShuffleBatches() {
+    std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_);
+  }
+
+  // Shuffles and returns the examples in 'batch_examples_[batch]'.
+  const std::vector<int> &batch_examples(int batch) {
+    //DCHECK(batch < batch_examples_.size());
+    std::shuffle(batch_examples_[batch].begin(),
+        batch_examples_[batch].end(), rnd_);
+    return batch_examples_[batch];
+  }
+
+  // Returns a reference to 'gradient_evaluator_'.
+  const GradientEvaluator &gradient_evaluator() const {
+    return gradient_evaluator_;
+  }
+
+  // Returns the l1 regularization parameter.
+  float l1() const { return l1_; }
+
+  // Returns the l2 regularization parameter.
+  float l2() const { return l2_; }
+
+  // Getter/setter for the 'num_scale_iterations_', used for sparse updates in
+  // stochastic methods.
+  int num_scale_iterations() const { return num_scale_iterations_; }
+  void set_num_scale_iterations(int num_scale_iterations) {
+    num_scale_iterations_ = num_scale_iterations;
+  }
+
+  // Getter/setter for 'num_iterations_per_stage_', used in
+  // StochasticVarianceReducedGradient.
+  int num_iterations_per_stage() const {
+    return num_iterations_per_stage_;
+  }
+  void set_num_iterations_per_stage(int num_iterations_per_stage) {
+    num_iterations_per_stage_ = num_iterations_per_stage;
+  }
+
+  // Getter/setter for projecting weights onto a ball in
+  // StochasticGradientDescent.
+  bool project_weights() const { return project_weights_; }
+  void set_project_weights(bool project_weights) {
+    project_weights_ = project_weights;
+  }
+
+  // Applies L1Prox coefficientwise to 'weights' and 'threshold'.
+  static void L1Prox(float threshold, Weights *weights) {
+    for (int i = 0; i < weights->size(); ++i) {
+      weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
+    }
+  }
+
+  // Applies L1Prox coefficientwise to 'weights' and 'threshold', where
+  // 'threshold' is a vector of per-coordinate thresholds.
+  static void L1Prox(const VectorXf &threshold, Weights *weights) {
+    for (int i = 0; i < weights->size(); ++i) {
+      weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
+    }
+  }
+
+  // Returns sign('x') * max(0.0, abs('x') - 'threshold').
+  static inline float L1Prox(float x, float threshold) {
+    return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
+  }
+
+  // Returns sign('x').
+  static inline float Sign(float x) {
+    if (x > 0.0f) return 1.0f;
+    if (x < 0.0f) return -1.0f;
+    return 0.0f;
+  }
+
+  // Default initial learning rate for stochastic methods.
+  const float kDefaultInitialLearningRate = 0.05;
+
+ private:
+  // Regularization parameters.
+  float l1_;
+  float l2_;
+
+  // GradientEvaluator used to compute the (unregularized) loss and gradient.
+  const GradientEvaluator &gradient_evaluator_;
+
+  // Convergence parameters.
+  bool converged_ = false;  // convergence flag set by convergence checks
+  float convergence_threshold_ = 1e-5;  // threshold for assessing convergence
+  bool use_simple_convergence_check_ = false;  // which convergence check to use
+  int num_convergence_epochs_ = 5;  // used in SimpleConvergenceCheck
+
+  // Initial learning rate, used in stochastic methods.
+  float initial_learning_rate_ = 0.01;
+
+  // Parameters used to search for a good initial learning rate for stochastic
+  // methods in the method BestLearningRate(). 'initial_rates_' are evaluated on
+  // a data subset of size 'num_search_examples_'.
+  std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05};
+  int num_search_examples_ = 500;
+
+  // Multithreading parameters, optionally used in stochastic methods.
+  int num_threads_ = 1;
+  std::vector<std::vector<int>> batch_examples_;
+
+  // Pseudo-random number generator for shuffling 'batch_examples_'.
+  std::mt19937 rnd_;
+
+  // In stochastic methods, weights = scale * Weights. For numerical stability,
+  // this is reset to scale = 1.0 every 'num_scale_iterations_' iterations.
+  int num_scale_iterations_ = 100;
+
+  // Number of iterations per stage; used in StochasticVarianceReducedGradient.
+  int num_iterations_per_stage_ = 100000;
+
+  // For online/stochastic methods, a simple bound on the optimal solution is
+  // |weights|^2 <= 2 / L2. Knowing this, if an intermediate solution is
+  // outside of a ball of radius sqrt(2 / L2), we can project it onto the ball
+  // to potentially speed up convergence. This approach is used in
+  // StochasticGradientDescent, and controlled by the flag 'project_weights_'.
+  bool project_weights_ = true;
+};
+
+}  // namespace lossmin
+
diff --git a/minimizers/parallel-boosting-with-momentum.cc b/minimizers/parallel-boosting-with-momentum.cc
new file mode 100644
index 0000000..7254401
--- /dev/null
+++ b/minimizers/parallel-boosting-with-momentum.cc
@@ -0,0 +1,61 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/minimizers/parallel-boosting-with-momentum.h"
+
+#include <math.h>
+#include <functional>
+
+#include "lossmin/minimizers/gradient-evaluator.h"
+#include "third_party/eigen/Eigen/Core"
+
+namespace lossmin {
+
+void ParallelBoostingWithMomentum::Setup() {
+  // Per-coordinate learning rates are
+  // learning_rates[j] = 1 / (sparsity * (Lj + l2)), where sparsity is the
+  // maximum instance l0 norm and Lj is an upper bound on the loss curvature
+  // along coordinate j.
+  float sparsity = gradient_evaluator().Sparsity();
+  gradient_evaluator().PerCoordinateCurvature(&learning_rates_);
+  learning_rates_ =
+      (learning_rates_.array() + l2()).inverse().matrix() / sparsity;
+
+  // Initialize the approximating function parameters.
+  alpha_ = 0.5f;
+  beta_ = 1.0f - alpha_;
+  phi_center_ = Weights::Zero(gradient_evaluator().NumWeights());
+}
+
+void ParallelBoostingWithMomentum::EpochUpdate(Weights *weights, int epoch,
+                                               bool check_convergence) {
+  // Compute the intermediate weight vector y.
+  Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
+
+  // Compute the gradient of the loss wrt y.
+  Weights gradient_wrt_y = Weights::Zero(y.size());
+  gradient_evaluator().Gradient(y, &gradient_wrt_y);
+  if (l2() > 0.0f) gradient_wrt_y += l2() * y;
+
+  // Gradient step.
+  *weights -= gradient_wrt_y.cwiseProduct(learning_rates_);
+
+  // l1 shrinkage.
+  if (l1() > 0.0f) {
+    L1Prox(l1() * learning_rates_, weights);
+    gradient_wrt_y += l1() * weights->unaryExpr(std::ptr_fun(Sign));
+  }
+
+  // Update the approximation function.
+  phi_center_ -= (1.0 - alpha_) / alpha_ * (y - *weights);
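+  // alpha_ is set to the positive root of alpha^2 = beta * (1 - alpha), i.e.
+  // alpha = -beta / 2 + sqrt(beta + beta^2 / 4).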
+  alpha_ =
+      -beta_ / 2.0  +
+      pow(beta_ + beta_ * beta_ / 4.0, static_cast<float>(0.5));
+  beta_ *= (1.0 - alpha_);
+
+  // Check convergence.
+  if (check_convergence) ConvergenceCheck(*weights, gradient_wrt_y);
+}
+
+}  // namespace lossmin
diff --git a/minimizers/parallel-boosting-with-momentum.h b/minimizers/parallel-boosting-with-momentum.h
new file mode 100644
index 0000000..619dfe8
--- /dev/null
+++ b/minimizers/parallel-boosting-with-momentum.h
@@ -0,0 +1,60 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// An implementation of:
+// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
+// Momentum", ECML PKDD 2013.
+// Variable names follow the notation in the paper.
+
+#pragma once
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/minimizers/gradient-evaluator.h"
+#include "lossmin/minimizers/loss-minimizer.h"
+
+namespace lossmin {
+
+class GradientEvaluator;
+
+class ParallelBoostingWithMomentum : public LossMinimizer {
+ public:
+  ParallelBoostingWithMomentum(
+      float l1, float l2, const GradientEvaluator &gradient_evaluator)
+      : LossMinimizer(l1, l2, gradient_evaluator) {
+    Setup();
+  }
+
+  // Sets learning rates and other parameters.
+  void Setup() override;
+
+ private:
+  // Updates 'weights' and the quadratic approximation function phi(w), such
+  // that at iteration k, loss(weights_k) <= min_w phi_k(w).
+  //     y = (1 - alpha) * weights + alpha * phi_center
+  //     grad_y = loss_grad(y) + l2 * y
+  //     weights[j] = weights[j] - learning_rates[j] * grad_y[j]
+  //     weights[j] =
+  //         sign(weights[j]) * max(0, |weights[j]| - l1 * learning_rates[j])
+  void EpochUpdate(Weights *weights, int epoch,
+                   bool check_convergence) override;
+
+  // Per-coordinate learning rates.
+  VectorXf learning_rates_;
+
+  // Center of the approximating quadratic function phi.
+  VectorXf phi_center_;
+
+  // Parameter for updating the approximation function phi. At each iteration,
+  // 'alpha_' is updated to the solution of the quadratic equation
+  //     alpha_^2 = beta_ * (1.0 - alpha_)
+  float alpha_;
+
+  // Parameter used to update alpha, defined as
+  //     beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
+  float beta_;
+};
+
+}  // namespace lossmin
+
diff --git a/sample.cc b/sample.cc
new file mode 100644
index 0000000..ee6d31c
--- /dev/null
+++ b/sample.cc
@@ -0,0 +1,53 @@
+// Copyright 2017 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <memory>
+#include <random>
+#include <iostream>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/losses/loss-function.h"
+#include "lossmin/losses/inner-product-loss-function.h"
+#include "minimizers/gradient-evaluator.h"
+#include "minimizers/loss-minimizer.h"
+#include "minimizers/parallel-boosting-with-momentum.h"
+
+using namespace lossmin;
+
+int main(int argc, char **argv) {
+  LossFunction *loss_func = new LinearRegressionLossFunction();
+
+  const int rows_num = 10000;
+  const int features_num = 3;
+
+
+  InstanceSet instances(rows_num, features_num);
+
+  std::default_random_engine generator;
+  std::uniform_real_distribution<float> distribution(-1.0,1.0);
+
+  for (int row = 0; row < rows_num; row++) {
+    for (int feature = 0; feature < features_num; feature++) {
+      instances.insert(row, feature) = distribution(generator);
+    }
+  }
+
+  Weights real_weights(features_num, 1);
+  for (int feature = 0; feature < features_num; feature++) {
+    real_weights.coeffRef(feature, 0) = distribution(generator);
+  }
+  std::cout << "Real Weights:" << std::endl << real_weights << std::endl;
+
+  LabelSet labels = instances * real_weights;
+
+  GradientEvaluator gradient_evaluator(instances, labels, loss_func);
+  LossMinimizer *minimizer = new ParallelBoostingWithMomentum(
+      0.0, 0.0, gradient_evaluator);
+  minimizer->Setup();
+
+  Weights weights = Weights::Zero(features_num);
+  std::vector<float> loss;
+  minimizer->Run(5000, &weights, &loss);
+  std::cout << "Learned Weights:" << std::endl << weights << std::endl;
+}