Initial commit.
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f2eaccc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+/out/
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..1ea6393
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of Fuchsia Authors.
+# Names should be added to this file as one of
+# Organization's name
+# Individual's name <submission email address>
+# Individual's name <submission email address> <email2> <emailN>
+Google Inc.
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..2b511db
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+cmake_minimum_required (VERSION 2.8.10)
+
+
+# The only way we found to override the compiler is setting these variables
+# prior to the project definition. Otherwise enable_language() sets these
+# variables and gcc will be picked up instead. Also, new versions of cmake will
+# complain if one compiler is first set (say to gcc via enable_language), and
+# then we later switch it to (say) clang.
+SET (CMAKE_C_COMPILER "clang")
+SET (CMAKE_CXX_COMPILER "clang++")
+
+project(lossmin)
+
+
+SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -std=c++11 -stdlib=libstdc++ -fcolor-diagnostics")
+
+enable_language(C)
+enable_language(CXX)
+
+include_directories(..)
+
+add_executable(sample sample.cc)
+target_link_libraries(sample
+ loss_functions
+ minimizers)
+
+add_subdirectory(losses)
+add_subdirectory(minimizers)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..ac6402f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+// Copyright 2016 The Fuchsia Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PATENTS b/PATENTS
new file mode 100644
index 0000000..2746e78
--- /dev/null
+++ b/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the Fuchsia project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this
+section) patent license to make, have made, use, offer to sell, sell,
+import, transfer, and otherwise run, modify and propagate the contents
+of this implementation of Fuchsia, where such license applies only to
+those patent claims, both currently owned by Google and acquired in
+the future, licensable by Google that are necessarily infringed by
+this implementation. This grant does not include claims that would be
+infringed only as a consequence of further modification of this
+implementation. If you or your agent or exclusive licensee institute
+or order or agree to the institution of patent litigation or any other
+patent enforcement activity against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that this
+implementation of Fuchsia constitutes direct or contributory patent
+infringement, or inducement of patent infringement, then any patent
+rights granted to you under this License for this implementation of
+Fuchsia shall terminate as of the date such litigation is filed.
diff --git a/README b/README
new file mode 100644
index 0000000..ccfc5f2
--- /dev/null
+++ b/README
@@ -0,0 +1,6 @@
+lossmin
+=======
+
+Lossmin is a library to solve some optimization problems.
+
+TODO(azani): Maybe write a bit more than that.
diff --git a/eigen-types.h b/eigen-types.h
new file mode 100644
index 0000000..47df398
--- /dev/null
+++ b/eigen-types.h
@@ -0,0 +1,51 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+
+#include "third_party/eigen/Eigen/Core"
+#include "third_party/eigen/Eigen/SparseCore"
+
+namespace lossmin {
+
+// Arrays.
+typedef Eigen::Array<bool, Eigen::Dynamic, 1> ArrayXb;
+typedef Eigen::ArrayXf ArrayXf;
+typedef Eigen::ArrayXi ArrayXi;
+
+// Vectors.
+typedef Eigen::VectorXf VectorXf;
+typedef Eigen::VectorXi VectorXi;
+
+// Sparse Vectors.
+typedef Eigen::SparseVector<float> SparseVectorXf;
+
+// Matrix.
+typedef Eigen::Matrix<
+ float,
+ Eigen::Dynamic,
+ Eigen::Dynamic,
+ Eigen::ColMajor> MatrixXf;
+typedef Eigen::Matrix<
+ float,
+ Eigen::Dynamic,
+ Eigen::Dynamic,
+ Eigen::RowMajor> RowMatrixXf;
+
+// Sparse Matrix.
+typedef Eigen::SparseMatrix<float> SparseMatrixXf;
+
+// Instances and parameters.
+typedef VectorXf Weights;
+
+typedef VectorXf Label;
+typedef RowMatrixXf LabelSet;
+
+typedef Eigen::SparseVector<float> Instance;
+typedef Eigen::SparseMatrix<float, Eigen::RowMajor> InstanceSet;
+typedef Eigen::SparseMatrix<float, Eigen::ColMajor> SparseMatrixXf;
+typedef Eigen::SparseVector<float> SparseVectorXf;
+
+} // namespace lossmin
+
diff --git a/losses/CMakeLists.txt b/losses/CMakeLists.txt
new file mode 100644
index 0000000..46ae888
--- /dev/null
+++ b/losses/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+add_library(loss_functions
+ loss-function.cc
+ inner-product-loss-function.cc)
diff --git a/losses/inner-product-loss-function.cc b/losses/inner-product-loss-function.cc
new file mode 100644
index 0000000..2d24293
--- /dev/null
+++ b/losses/inner-product-loss-function.cc
@@ -0,0 +1,89 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/losses/inner-product-loss-function.h"
+#include <math.h>
+
+namespace lossmin {
+
+float InnerProductLossFunction::LossCurvature(
+ const InstanceSet &instances) const {
+ float data_curvature = (instances.cwiseProduct(instances) *
+ Weights::Ones(instances.cols())).maxCoeff();
+ return curvature_ * data_curvature;
+}
+
+void InnerProductLossFunction::PerCoordinateCurvature(
+ const InstanceSet &instances, VectorXf *per_coordinate_curvature) const {
+ *per_coordinate_curvature =
+ VectorXf::Ones(instances.rows()).transpose() *
+ instances.cwiseProduct(instances) / instances.rows();
+ *per_coordinate_curvature *= curvature_;
+}
+
+float InnerProductLossFunction::ExampleLoss(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example) const {
+ float inner_product = instances.innerVector(example).dot(weights);
+ return InnerProductExampleLoss(inner_product, labels.coeff(example, 0));
+}
+
+void InnerProductLossFunction::AddExampleGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example, float weights_scale,
+ float example_scale, Weights *gradient) const {
+ float inner_product = instances.innerVector(example).dot(weights);
+ if (weights_scale != 1.0f) inner_product *= weights_scale;
+ float inner_product_gradient =
+ InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
+ if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+
+ if (synchronous_update()) {
+ std::lock_guard<std::mutex> lock(gradient_update_mutex_);
+ for (InstanceSet::InnerIterator it(instances, example); it; ++it) {
+ gradient->coeffRef(it.index()) += inner_product_gradient * it.value();
+ }
+ } else {
+ for (InstanceSet::InnerIterator it(instances, example); it; ++it) {
+ gradient->coeffRef(it.index()) += inner_product_gradient * it.value();
+ }
+ }
+}
+
+void InnerProductLossFunction::ExampleGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example, float weights_scale,
+ float example_scale,
+ std::vector<std::pair<int, float>> *example_gradient) const {
+ float inner_product = instances.innerVector(example).dot(weights);
+ if (weights_scale != 1.0f) inner_product *= weights_scale;
+ float inner_product_gradient =
+ InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
+ if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+
+ example_gradient->resize(instances.row(example).nonZeros());
+ int i = 0;
+ for (InstanceSet::InnerIterator it(instances, example); it; ++it) {
+ example_gradient->at(i++) =
+ std::make_pair(it.index(), inner_product_gradient * it.value());
+ }
+}
+
+// Assigns labels to 'instances' given 'weights'.
+void InnerProductLossFunction::PredictLabels(
+ const Weights &weights, const InstanceSet &instances,
+ LabelSet *labels) const {
+ // Compute inner products.
+ VectorXf inner_products = instances * weights;
+
+ // Assign labels by calling InnerProductAssignLabel coefficientwise on
+ // 'inner_products'.
+ std::function<float(float)> assign_label_ptr =
+ std::bind(&InnerProductLossFunction::InnerProductPredictLabel,
+ this, std::placeholders::_1);
+ *labels = inner_products.unaryExpr(assign_label_ptr);
+}
+
+} // namespace lossmin
diff --git a/losses/inner-product-loss-function.h b/losses/inner-product-loss-function.h
new file mode 100644
index 0000000..a5c8d22
--- /dev/null
+++ b/losses/inner-product-loss-function.h
@@ -0,0 +1,276 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// In InnerProductLossFunction, the loss of a labeled example
+// (instance_i, label_i) is only a function of the inner product
+// <weights, instance_i> and label_i. label_i is a scalar (single element
+// VectorXf).
+//
+// Derived classes need to provide InnerProductExampleLoss,
+// InnerProductExampleGradient, and InnerProductPredictLabel, and set the
+// 'curvature_' parameter.
+//
+// Current implementations include logistic-regression, averaged-logistic,
+// linear-regression, poisson-regression, smooth-hinge.
+
+#pragma once
+
+#include <float.h>
+#include <math.h>
+#include <functional>
+#include <mutex>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/losses/loss-function.h"
+
+namespace lossmin {
+
+class InnerProductLossFunction : public LossFunction {
+ public:
+ // Returns the loss for a single example.
+ float ExampleLoss(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example) const override;
+
+ // Adds the gradient of a single example to 'gradient'.
+ void AddExampleGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example, float weights_scale,
+ float example_scale, Weights *gradient) const override;
+
+ // Returns the gradient of a single example.
+ void ExampleGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example, float weights_scale,
+ float example_scale,
+ std::vector<std::pair<int, float>> *example_gradient) const override;
+
+ // Assigns labels to 'instances' given 'weights'.
+ void PredictLabels(const Weights &weights, const InstanceSet &instances,
+ LabelSet *labels) const override;
+
+ // Returns an upper bound on the loss curvature.
+ float LossCurvature(const InstanceSet &instances) const override;
+
+ // Returns an upper bound on the per-coordinate curvature.
+ void PerCoordinateCurvature(
+ const InstanceSet &instances,
+ VectorXf *per_coordinate_curvature) const override;
+
+ virtual float InnerProductExampleLoss(float inner_product, float label)
+ const = 0;
+
+ virtual float InnerProductExampleGradient(float inner_product, float label)
+ const = 0;
+
+ virtual float InnerProductPredictLabel(float inner_product) const = 0;
+
+ // Returns 'curvature_'.
+ virtual float InnerProductCurvature(float inner_product, float label) const {
+ return curvature_;
+ }
+
+ protected:
+ // Sets the upper bound on the curvature of the loss function.
+ void set_curvature(float curvature) { curvature_ = curvature; }
+
+ private:
+ // Mutex for synchronous updates of the gradient vector.
+ mutable std::mutex gradient_update_mutex_;
+
+ // Upper bound on the absolute value of the second derivative of the loss:
+ // |d^2 loss(x) / dx^2| <= curvature_, where 'x' is the inner product
+ // <instance, weights>. Should be set by derived classes.
+ float curvature_;
+};
+
+// Binary logistic regression with labels in {-1, 1}.
+class LogisticRegressionLossFunction : public InnerProductLossFunction {
+ public:
+ LogisticRegressionLossFunction() { set_curvature(0.25); }
+
+ // Returns the logistic loss.
+ float InnerProductExampleLoss(float inner_product, float label)
+ const override {
+ return log1pf(exp(-label * inner_product));
+ }
+
+ // Returns the gradient of the logistic loss wrt 'inner_product'.
+ float InnerProductExampleGradient(float inner_product, float label)
+ const override {
+ if (label * inner_product < kInputMin) return -label;
+ return -label / (1.0f + exp(label * inner_product));
+ }
+
+ // Assigns a label in {-1, 1} given 'inner_product'.
+ float InnerProductPredictLabel(float inner_product) const override {
+ if (inner_product > 0) return 1.0f;
+ return -1.0f;
+ }
+
+ // Returns the magnitude of the second derivative of the loss, evaluated at
+ // 'inner_product' and 'label'.
+ float InnerProductCurvature(float inner_product, float label)
+ const override {
+ if (-label * inner_product < kInputMin) return 0.0f;
+ float label_prob = 1.0f / (1.0f + exp(-label * inner_product));
+ return label_prob * (1.0f - label_prob);
+ }
+
+ // Min allowed exp input.
+ const float kInputMin = log(FLT_EPSILON);
+};
+
+// Linear regression with squared error loss.
+class LinearRegressionLossFunction : public InnerProductLossFunction {
+ public:
+ LinearRegressionLossFunction() { set_curvature(1.0); }
+
+ // Returns the squared error loss.
+ float InnerProductExampleLoss(float inner_product, float label)
+ const override {
+ return 0.5 * (inner_product - label) * (inner_product - label);
+ }
+
+ // Returns the gradient of the squared error loss wrt 'inner_product'.
+ float InnerProductExampleGradient(float inner_product, float label)
+ const override {
+ return inner_product - label;
+ }
+
+ // Assigns a label given 'inner_product'.
+ float InnerProductPredictLabel(float inner_product) const override {
+ return inner_product;
+ }
+};
+
+// Poisson regression, where the distribution mean is the inner product between
+// an instance and parameters.
+class PoissonRegressionLossFunction : public InnerProductLossFunction {
+ public:
+ PoissonRegressionLossFunction() {}
+
+ // Returns the Poisson regression loss.
+ float InnerProductExampleLoss(float inner_product, float label)
+ const override {
+ return exp(inner_product) - label * inner_product;
+ }
+
+ // Returns the gradient of the Poisson regression loss wrt 'inner_product'.
+ float InnerProductExampleGradient(float inner_product, float label)
+ const override {
+ return exp(inner_product) - label;
+ }
+
+ // Returns the mode of a Poisson distribution with mean 'inner_product'.
+ float InnerProductPredictLabel(float inner_product) const override {
+ return floor(inner_product);
+ }
+
+ // Returns the magnitude of the second derivative of the loss, evaluated at
+ // 'inner_product' and 'label'.
+ float InnerProductCurvature(float inner_product, float label)
+ const override {
+ return exp(inner_product);
+ }
+};
+
+// Binary logistic regression where 'label' = P(true_label = 1) is in [0, 1],
+// and the loss is averaged over labels:
+// label * logloss(inner_product, 1) + (1 - label) * logloss(inner_product, 0).
+class AveragedLogisticLossFunction : public InnerProductLossFunction {
+ public:
+ AveragedLogisticLossFunction() { set_curvature(0.25); }
+
+ // Returns the averaged logistic loss.
+ float InnerProductExampleLoss(float inner_product, float label)
+ const override {
+ return (1.0 - label) * log1pf(exp(inner_product))
+ + label * log1pf(exp(-inner_product));
+ }
+
+ // Returns the gradient of the averaged logistic loss wrt 'inner_product'.
+ float InnerProductExampleGradient(float inner_product, float label)
+ const override {
+ if (-inner_product < kInputMin) return -label + 1.0f;
+ return -label + 1.0f / (1.0f + exp(-inner_product));
+ }
+
+ // Returns the probability that the label is 1 given the inner product.
+ float InnerProductPredictLabel(float inner_product) const override {
+ if (-inner_product < kInputMin) return 1.0f;
+ return 1.0f / (1.0f + exp(-inner_product));
+ }
+
+ // Returns the magnitude of the second derivative of the loss, evaluated at
+ // 'inner_product' and 'label'.
+ float InnerProductCurvature(float inner_product, float label)
+ const override {
+ float label_prob = 1.0f / (1.0f + exp(-inner_product));
+ if (-inner_product < kInputMin) return 0.0f;
+ return label_prob * (1.0f - label_prob);
+ }
+
+ // Min allowed exp input.
+ const float kInputMin = log(FLT_EPSILON);
+};
+
+
+// Smooth hinge loss. If a = inner_product * label, the loss is
+// loss = 0.0 a > threshold
+// 0.5 * (a - threshold)^2 0 < a <= threshold
+// 0.5 * threshold^2 - threshold * a a <= 0
+class SmoothHingeLossFunction : public InnerProductLossFunction {
+ public:
+ SmoothHingeLossFunction() { set_curvature(1.0f); }
+
+ /*
+ void Setup(const LossFunctionConfigContainer &config) override {
+ if (config.has_smooth_hinge_threshold()) {
+ threshold_ = config.smooth_hinge_threshold();
+ }
+ set_curvature(threshold_);
+ }
+ */
+
+ float InnerProductExampleLoss(float inner_product, float label)
+ const override {
+ float hinge_input = inner_product *= label;
+ if (hinge_input >= threshold_) {
+ return 0.0;
+ } else if (hinge_input <= 0.0f) {
+ return (0.5 * threshold_ * threshold_ - threshold_ * hinge_input);
+ } else {
+ return 0.5 * (hinge_input - threshold_) * (hinge_input - threshold_);
+ }
+ }
+
+ float InnerProductExampleGradient(float inner_product, float label)
+ const override {
+ float hinge_input = inner_product * label;
+ if (hinge_input >= threshold_) {
+ return 0.0f;
+ } else if (hinge_input <= 0.0f) {
+ return -label * threshold_;
+ } else {
+ return (hinge_input - threshold_) * label;
+ }
+ }
+
+ // Assigns a label in {-1, 1} given 'inner_product'.
+ float InnerProductPredictLabel(float inner_product) const override {
+ if (inner_product > 0) return 1.0f;
+ return -1.0f;
+ }
+
+ void set_threshold(float threshold) { threshold_ = threshold; }
+
+ private:
+ float threshold_ = 1.0f;
+};
+
+
+} // namespace lossmin
+
diff --git a/losses/loss-function.cc b/losses/loss-function.cc
new file mode 100644
index 0000000..ca11228
--- /dev/null
+++ b/losses/loss-function.cc
@@ -0,0 +1,30 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/losses/loss-function.h"
+#include <float.h>
+#include <math.h>
+
+namespace lossmin {
+
+float LossFunction::BatchLoss(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels) const {
+ float loss = 0.0f;
+ for (int i = 0; i < instances.rows(); ++i) {
+ loss += ExampleLoss(weights, instances, labels, i);
+ }
+ return loss;
+}
+
+void LossFunction::BatchGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, Weights *gradient) const {
+ for (int i = 0; i < instances.rows(); ++i) {
+ AddExampleGradient(weights, instances, labels, i, 1.0f, 1.0f, gradient);
+ }
+}
+
+} // namespace lossmin
diff --git a/losses/loss-function.h b/losses/loss-function.h
new file mode 100644
index 0000000..b02aff0
--- /dev/null
+++ b/losses/loss-function.h
@@ -0,0 +1,112 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// Interface for computing the value and gradient of a loss function given
+// parameters 'weights' and labeled examples {(instance_i, label_i)}
+
+#pragma once
+
+#include <math.h>
+
+#include "lossmin/eigen-types.h"
+
+namespace lossmin {
+
+// Abstract interface for computing the value and gradient of a loss function.
+// Derived classes must provide the following methods: LossCurvature,
+// PerCoordinateCurvature, ExampleLoss, AddExampleGradient, and
+// PredictLabels.
+// If multiple threads concurrently call the AddExampleGradient() method of the
+// same LossFunction object to update the same 'gradient' vector, changes to
+// 'gradient' need to be protected by a mutex in the implementation. If the
+// AddExampleGradient() method of the same LossFunction object is used for
+// different 'gradient' vectors, this will result in unnecessary synchronization
+// costs. 'is_synchronous_' flag indicates whether to use synchronization.
+// If multiple threads concurrently call the AddExampleGradient() method of
+// different LossFunction objects to update the same 'gradient' vector, this
+// may result in race conditions. In this case, changes to 'gradient' should be
+// protected by the caller.
+class LossFunction {
+ public:
+ LossFunction() {}
+ virtual ~LossFunction() {}
+
+ // Returns the loss for a single example (row 'example' of 'instances' and
+ // 'labels').
+ virtual float ExampleLoss(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example) const = 0;
+
+ // Computes the gradient of the loss for a single example (row 'example' of
+ // 'instances' and labels) with respect to 'weights_scale * weights'. Adds
+ // 'example_scale * example_gradient' to 'gradient'.
+ // If multiple threads call the AddExampleGradient method of the same object
+ // to update the same 'gradient' vector, updates to 'gradient' need to be
+ // protected by a mutex in the implementation.
+ virtual void AddExampleGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example, float weights_scale,
+ float example_scale, Weights *gradient) const = 0;
+
+ // Returns the gradient of the loss for a single example. Used in AdaGrad.
+ virtual void ExampleGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, int example, float weights_scale,
+ float example_scale,
+ std::vector<std::pair<int, float>> *example_gradient) const = 0;
+
+ // Predicts 'labels' for 'instances' given 'weights'.
+ virtual void PredictLabels(
+ const Weights &weights, const InstanceSet &instances,
+ LabelSet *labels) const = 0;
+
+ // Returns an upper bound on the loss curvature (max eigenvalue of the loss
+ // Hessian matrix). Required by DeterministicGradientDescent. Optionally
+ // required by StochasticVarianceReducedGradient (for default learning rate)
+ // and StochasticGradientDescent for CURVATURE_BASED learning rate scheduling.
+ virtual float LossCurvature(const InstanceSet &instances) const = 0;
+
+ // Returns an upper bound on the curvature of the loss along each coordinate
+ // (max absolute value of the second derivative) of the data. Required by
+ // ParallelBoostingWithmomentum.
+ virtual void PerCoordinateCurvature(
+ const InstanceSet &instances,
+ VectorXf *per_coordinate_curvature) const = 0;
+
+ // Initializes parameters to a suggested setting for this loss if appropriate.
+ virtual void Init(Weights *weights) const {}
+
+ // Returns the number of model parameters for the given number of features.
+ // Needs to be overriden when #parameters != #features.
+ virtual int NumWeights(int num_features) const { return num_features; }
+
+ // Returns the total loss for a set of examples. Default implementation runs
+ // through the examples and calls ExampleLoss on each.
+ virtual float BatchLoss(const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels) const;
+
+ // Returns the total gradient for a set of examples. Default implementation
+ // runs through the examples and calls AddExampleGradient on each. Should
+ // always be called in a single-threaded context.
+ virtual void BatchGradient(
+ const Weights &weights, const InstanceSet &instances,
+ const LabelSet &labels, Weights *gradient) const;
+
+ // Sets the 'is_synchronous_' flag.
+ void set_synchronous_update(bool is_sync) { synchronous_update_ = is_sync; }
+
+ // Returns the 'is_synchronous_' flag.
+ bool synchronous_update() const { return synchronous_update_; }
+
+ private:
+ // Flag indicating whether to use synchronous or asynchronous updates in
+ // AddExampleGradient();
+ bool synchronous_update_ = false;
+
+ //DISALLOW_COPY_AND_ASSIGN(LossFunction);
+};
+
+
+} // namespace lossmin
diff --git a/lossmin.py b/lossmin.py
new file mode 100755
index 0000000..670db7c
--- /dev/null
+++ b/lossmin.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import os
+import subprocess
+import sys
+
+THIS_DIR = os.path.abspath(os.path.dirname(__file__))
+OUT_DIR = os.path.abspath(os.path.join(THIS_DIR, 'out'))
+
+def ensureDir(dir_path):
+ """Ensures that the directory at |dir_path| exists. If not it is created.
+
+ Args:
+ dir_path{string} The path to a directory. If it does not exist it will be
+ created.
+ """
+ if not os.path.exists(dir_path):
+ os.makedirs(dir_path)
+
+def main():
+ ensureDir(OUT_DIR)
+ savedir = os.getcwd()
+ os.chdir(OUT_DIR)
+ # TODO(azani): Switch to ninja.
+ subprocess.check_call(['cmake', '..']) #'-G', 'Ninja','..'])
+ subprocess.check_call(['make'])
+ subprocess.check_call(['./sample'])
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/make.sh b/make.sh
new file mode 100644
index 0000000..66920c4
--- /dev/null
+++ b/make.sh
@@ -0,0 +1,8 @@
+set -x
+g++ sample.cc \
+ losses/loss-function.cc \
+ losses/inner-product-loss-function.cc \
+ minimizers/gradient-evaluator.cc \
+ minimizers/loss-minimizer.cc \
+ minimizers/parallel-boosting-with-momentum.cc \
+ -std=c++11 -I ..
diff --git a/minimizers/CMakeLists.txt b/minimizers/CMakeLists.txt
new file mode 100644
index 0000000..41b5871
--- /dev/null
+++ b/minimizers/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2017 The Fuchsia Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+add_library(minimizers
+ gradient-evaluator.cc
+ loss-minimizer.cc
+ parallel-boosting-with-momentum.cc)
diff --git a/minimizers/gradient-evaluator.cc b/minimizers/gradient-evaluator.cc
new file mode 100644
index 0000000..270d334
--- /dev/null
+++ b/minimizers/gradient-evaluator.cc
@@ -0,0 +1,84 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/minimizers/gradient-evaluator.h"
+
+#include <functional>
+
+namespace lossmin {
+
+float GradientEvaluator::Loss(const Weights &weights) const {
+ // TODO(azani): Implement multi-threaded version.
+ return loss_function_->BatchLoss(weights, instances_, labels_) /
+ NumExamples();
+}
+
+float GradientEvaluator::Loss(
+ const Weights &weights, const InstanceSet &validation_instances,
+ const LabelSet &validation_labels) const {
+ return loss_function_->BatchLoss(
+ weights, validation_instances, validation_labels) /
+ validation_labels.rows();
+}
+
+float GradientEvaluator::Loss(
+ const Weights &weights, const std::vector<int> &example_indices) const {
+ float loss = 0.0f;
+ for (int example : example_indices) {
+ loss += loss_function_->ExampleLoss(weights, instances_, labels_, example);
+ }
+ return loss / example_indices.size();
+}
+
+void GradientEvaluator::Gradient(const Weights &weights,
+ Weights *gradient) const {
+ //DCHECK(gradient != nullptr);
+ // TODO(azani): Implement multi-threaded version.
+ loss_function_->BatchGradient(weights, instances_, labels_, gradient);
+ *gradient /= NumExamples();
+}
+
+void GradientEvaluator::set_num_threads(int num_threads,
+ int batch_size) {
+ num_threads_ = num_threads;
+ if (num_threads_ == 1) return;
+
+ num_batches_ = (NumExamples() + batch_size - 1) / batch_size;
+
+ // Assign examples to each batch.
+ batch_examples_.assign(num_batches_, {});
+ for (auto &batch : batch_examples_) batch.reserve(batch_size);
+ for (int i = 0; i < NumExamples(); ++i) {
+ batch_examples_[i % num_batches_].push_back(i);
+ }
+}
+
+/*
+void GradientEvaluator::ThreadLoss(
+ const Weights *weights, int example, VectorXf *loss_values,
+ BlockingCounter *num_jobs_remaining) const {
+ // Compute example loss.
+ loss_values->coeffRef(example) =
+ loss_function_->ExampleLoss(*weights, instances_, labels_, example);
+
+ // Decrement thread counter.
+ num_jobs_remaining->DecrementCount();
+}
+
+void GradientEvaluator::ThreadGradient(
+ const Weights *weights, int batch, Weights *gradient,
+ BlockingCounter *num_jobs_remaining) const {
+ // Add the example gradient.
+ for (int example : batch_examples_[batch]) {
+ loss_function_->AddExampleGradient(
+ *weights, instances_, labels_, example, 1.0f, 1.0f, gradient);
+ }
+
+ // Decrement thread counter.
+ num_jobs_remaining->DecrementCount();
+}
+*/
+
+} // namespace lossmin
diff --git a/minimizers/gradient-evaluator.h b/minimizers/gradient-evaluator.h
new file mode 100644
index 0000000..e63b842
--- /dev/null
+++ b/minimizers/gradient-evaluator.h
@@ -0,0 +1,185 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// GradientEvaluator is a class for computing the value and gradient of a loss
+// LossFunction on a labeled dataset {(instance_i, label_i)}, given parameters
+// 'weights'. Its methods are called by gradient descent algorithms implementing
+// the LossMinimizer interface.
+
+#pragma once
+
+#include <algorithm>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/losses/loss-function.h"
+
+class BlockingCounter;
+
+namespace lossmin {
+
+class GradientEvaluator {
+ public:
+ // Constructor sets up the dataset and the loss function.
+ GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
+ const LossFunction *loss_function)
+ : instances_(instances), labels_(labels),
+ loss_function_(loss_function) {}
+
+ virtual ~GradientEvaluator() {}
+
+ // Returns the loss for given parameters 'weights'. Multi-threading is used
+ // if num_threads_ > 1.
+ virtual float Loss(const Weights &weights) const;
+
+ // Returns the loss for given parameters 'weights' and a subset of examples
+ // 'example_indices'.
+ virtual float Loss(const Weights &weights,
+ const std::vector<int> &example_indices) const;
+
+ // Returns the loss for given parameters 'weights' and a different
+ // dataset (typically used for validation).
+ virtual float Loss(
+ const Weights &weights, const InstanceSet &validation_instances,
+ const LabelSet &validation_labels) const;
+
+ // Computes the gradient wrt the given parameters 'weights'. 'gradient' is
+ // owned by the caller and should be initialized to zero.
+ // Multithreading is used if 'num_threads' > 1. The training examples are
+ // divided into 'num_batches' batches; each thread computes the gradient of a
+ // batch, adds it to 'gradient', and takes the next batch. These updates are
+ // asynchronous, and behaviour is non-deterministic.
+ virtual void Gradient(const Weights &weights, Weights *gradient) const;
+
+ // Adds the gradient wrt 'weight_scale * weights' for 'example' to the vector
+ // 'gradient' in place. The gradient is scaled by 'example_scale'.
+ virtual void AddExampleGradient(
+ const Weights &weights, int example, float weights_scale,
+ float example_scale, Weights *gradient) const {
+ loss_function_->AddExampleGradient(
+ weights, instances_, labels_, example, weights_scale, example_scale,
+ gradient);
+ }
+
+ // Returns the gradient wrt 'weights' as a vector<pair<int, float>> rather
+ // than Eigen::SparseVector<float>, since Eigen is very inefficient with
+ // sparse vectors. This is only necessary if running SGDAdaGrad.
+ virtual void ExampleGradient(
+ const Weights &weights, int example, float weights_scale,
+ float example_scale,
+ std::vector<std::pair<int, float>> *example_gradient) const {
+ loss_function_->ExampleGradient(
+ weights, instances_, labels_, example, weights_scale, example_scale,
+ example_gradient);
+ }
+
+ // Sets the number of threads and batch size for multithreaded computation.
+ // Assigns examples to 'batch_examples_'.
+ void set_num_threads(int num_threads, int batch_size);
+
+ // Convenience method that sets the number of batches to a default.
+ void set_num_threads(int num_threads) {
+ int batch_size = static_cast<int>(NumExamples() / num_threads / 2);
+ set_num_threads(num_threads, batch_size);
+ }
+
+ // Returns the number of examples in the dataset.
+ virtual int NumExamples() const { return instances_.rows(); }
+
+ // Returns the number of features.
+ virtual int NumFeatures() const { return instances_.cols(); }
+
+ // Returns the number of weights for the given number of features.
+ virtual int NumWeights() const {
+ return loss_function_->NumWeights(NumFeatures());
+ }
+
+ // Returns the number of batches for multithreaded computation.
+ int num_batches() const { return num_batches_; }
+
+ // Returns the number of threads.
+ int num_threads() const { return num_threads_; }
+
+ // Returns an upper bound on the curvature of the loss function. Used to set
+ // the learning rate of some LossMinimizer algorithms.
+ virtual float LossCurvature() const {
+ return loss_function_->LossCurvature(instances_);
+ }
+
+ // Returns the per-coordinate curvature of the data. Used to set the learning
+ // rates of ParallelBoostingWithMomentum.
+ virtual void PerCoordinateCurvature(
+ VectorXf *per_coordinate_curvature) const {
+ loss_function_->PerCoordinateCurvature(instances_,
+ per_coordinate_curvature);
+ }
+
+ // Returns sparsity, defined as the maximum instance l0 norm. Used to help
+ // set learning rates in ParallelBoostingWithMomentum.
+ float Sparsity() const {
+ typename Instance::Index sparsity = 0;
+ for (int i = 0; i < instances_.rows(); ++i) {
+ sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
+ }
+ return static_cast<float>(sparsity);
+ }
+
+ // Returns the loss function.
+ const LossFunction *loss_function() const { return loss_function_; }
+
+ // Returns the instances.
+ const InstanceSet &instances() const { return instances_; }
+
+ // Returns the labels.
+ const LabelSet &labels() const { return labels_; }
+
+ // Returns a list of examples in batch 'batch'.
+ const std::vector<int> &batch_examples(int batch) const {
+ return batch_examples_[batch];
+ }
+
+ protected:
+ /*
+ // Computes the loss for a single example 'example' and returns it in
+ // 'loss_values[example_index]'.
+ virtual void ThreadLoss(
+ const Weights *weights, int example, VectorXf *loss_values,
+ BlockingCounter *num_jobs_remaining) const;
+
+ // Computes the gradient for examples in batch 'batch', adds it to 'gradient'.
+ virtual void ThreadGradient(
+ const Weights *weights, int batch, Weights *gradient,
+ BlockingCounter *num_jobs_remaining) const;
+ */
+
+ private:
+ // Training instances.
+ const InstanceSet &instances_;
+
+ // Instance labels.
+ const LabelSet &labels_;
+
+ // Function for computing the loss and gradient of a single training example.
+ // Not owned.
+ const LossFunction *loss_function_;
+
+ // Mutex for multithreaded gradient computation.
+ mutable std::mutex gradient_update_mutex_;
+
+ // Number of threads used to parallelize gradient computation.
+ // If num_threads_ = 1, multithreading is not used.
+ int num_threads_ = 1;
+
+ // List of examples assigned to each thread.
+ std::vector<std::vector<int>> batch_examples_;
+
+ // Number of batches.
+ int num_batches_;
+};
+
+} // namespace lossmin
+
diff --git a/minimizers/loss-minimizer.cc b/minimizers/loss-minimizer.cc
new file mode 100644
index 0000000..d6610f8
--- /dev/null
+++ b/minimizers/loss-minimizer.cc
@@ -0,0 +1,161 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/minimizers/loss-minimizer.h"
+
+#include <random>
+#include <algorithm>
+
+namespace lossmin {
+
+bool LossMinimizer::Run(
+ int max_epochs, int loss_epochs, int convergence_epochs,
+ Weights *weights, std::vector<float> *loss) {
+ // Run for up to 'max_epochs' epochs.
+ for (int epoch = 0; epoch < max_epochs; ++epoch) {
+ // Compute the loss.
+ if (epoch % loss_epochs == 0) {
+ loss->push_back(Loss(*weights));
+ //LOG(INFO) << epoch << ": " << loss->at(loss->size() - 1);
+ }
+
+ // Set the 'check_convergence' flag.
+ bool check_convergence = (epoch > 0) && (epoch % convergence_epochs == 0);
+
+ // Run for one epoch to update the parameters.
+ EpochUpdate(weights, epoch, check_convergence);
+
+ // Optionally do a simple convergence check.
+ if (check_convergence && use_simple_convergence_check_) {
+ SimpleConvergenceCheck(*loss);
+ }
+
+ // Check the convergence flag.
+ if (converged_) break;
+ }
+
+ loss->push_back(Loss(*weights));
+ //LOG(INFO) << "final loss: " << loss->at(loss->size() - 1);
+ return converged_;
+}
+
+bool LossMinimizer::Run(
+ int max_epochs, int loss_epochs, int convergence_epochs,
+ const InstanceSet &validation_instances,
+ const LabelSet &validation_labels, Weights *weights,
+ std::vector<float> *training_loss, std::vector<float> *validation_loss) {
+ // Run for up to 'max_epochs' epochs.
+ for (int epoch = 0; epoch < max_epochs; ++epoch) {
+ // Compute the loss.
+ if (epoch % loss_epochs == 0) {
+ training_loss->push_back(Loss(*weights));
+ validation_loss->push_back(
+ gradient_evaluator_.Loss(*weights, validation_instances,
+ validation_labels));
+ //LOG(INFO) << epoch << ": " << training_loss->at(training_loss->size() - 1)
+ //<< " " << validation_loss->at(validation_loss->size() - 1);
+ }
+
+ // Set the 'check_convergence' flag.
+ bool check_convergence = (epoch > 0) && (epoch % convergence_epochs == 0);
+
+ // Run for one epoch to update the parameters.
+ EpochUpdate(weights, epoch, check_convergence);
+
+ // Optionally do a simple convergence check.
+ if (check_convergence && use_simple_convergence_check_) {
+ SimpleConvergenceCheck(*training_loss);
+ }
+
+ // Check the convergence flag.
+ if (converged_) break;
+ }
+
+ // Compute final loss.
+ training_loss->push_back(Loss(*weights));
+ validation_loss->push_back(
+ gradient_evaluator_.Loss(*weights, validation_instances,
+ validation_labels));
+ //LOG(INFO) << "final loss: "
+ //<< training_loss->at(training_loss->size() - 1);
+ //LOG(INFO) << "final validation loss: "
+ //<< validation_loss->at(validation_loss->size() - 1);
+ return converged_;
+}
+
+float LossMinimizer::BestLearningRate() {
+ // Sample a subset of the data.
+ std::vector<int> examples(num_search_examples_);
+ std::mt19937 rnd;
+ std::uniform_int_distribution<> dis(0, gradient_evaluator_.NumExamples());
+ for (int i = 0; i < num_search_examples_; ++i) {
+ examples[i] = dis(rnd);
+ }
+
+ float min_loss_rate = initial_rates_[0];
+ float min_loss = HUGE_VAL;
+ for (int i = 0; i < initial_rates_.size(); ++i) {
+ // Initialize parameters to zero.
+ Weights weights = Weights::Zero(gradient_evaluator_.NumWeights());
+
+ // Make updates with the current learning rate.
+ Weights gradient(weights.size());
+ for (int example : examples) {
+ gradient = l2() * weights;
+ gradient_evaluator_.AddExampleGradient(weights, example, 1.0f, 1.0f,
+ &gradient);
+ weights -= initial_rates_[i] * gradient;
+ }
+
+ // Compute the corresponding loss.
+ float loss = gradient_evaluator_.Loss(weights, examples);
+ if (i == 0) {
+ min_loss = loss;
+ } else if (loss < min_loss) {
+ min_loss = loss;
+ min_loss_rate = initial_rates_[i];
+ }
+ }
+
+ // Return the learning rate corresponding to the smallest loss.
+ //LOG(INFO) << "best initial learning rate: " << min_loss_rate;
+ return min_loss_rate;
+}
+
+void LossMinimizer::ConvergenceCheck(
+ const Weights &weights, const Weights &gradient) {
+ Weights gradient_for_convergence =
+ (weights.array() == 0.0f).select(abs(gradient.array()) - l1_, gradient);
+ if (gradient_for_convergence.norm() / weights.size() <
+ convergence_threshold_) {
+ converged_ = true;
+ }
+}
+
+void LossMinimizer::SimpleConvergenceCheck(const std::vector<float> &loss) {
+ // Check convergence by verifying that the max relative loss decrease
+ // (loss[t-1] - loss[t]) / loss[t-1] is below 'convergence_threshold_'.
+ if (loss.size() > num_convergence_epochs_) {
+ float loss_difference = 0.0f;
+ for (int i = loss.size() - num_convergence_epochs_; i < loss.size(); ++i) {
+ loss_difference = std::max(loss_difference, 1 - loss[i] / loss[i - 1]);
+ }
+ if (loss_difference < convergence_threshold_) set_converged(true);
+ }
+}
+
+void LossMinimizer::set_batch_size(int batch_size) {
+ int num_batches =
+ (gradient_evaluator_.NumExamples() + batch_size - 1) / batch_size;
+
+ // Assign examples to each batch.
+ batch_examples_.assign(num_batches, {});
+ for (auto &batch : batch_examples_) batch.reserve(batch_size);
+ for (int i = 0; i < gradient_evaluator().NumExamples(); ++i) {
+ batch_examples_[i % num_batches].push_back(i);
+ }
+}
+
+} // namespace lossmin
diff --git a/minimizers/loss-minimizer.h b/minimizers/loss-minimizer.h
new file mode 100644
index 0000000..066aba9
--- /dev/null
+++ b/minimizers/loss-minimizer.h
@@ -0,0 +1,288 @@
+// Copyright 2014 Google Inc. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// Interface for gradient descent algorithms. Example usage:
+//
+// 1. Create a GradientEvaluator object that computes the value and gradient of
+// a loss function on a given dataset.
+//
+// InstanceSet instances = ...
+// LabelSet labels = ...
+// LossFunction *loss_function = LossFunction::Create("logistic-regression");
+// GradientEvaluator gradient_evaluator(instances, labels, loss_function);
+//
+// 2. Create a LossMinimizer object from 'gradient_evaluator' and l1 and l2
+// regularization parameters. StochasticGradientDescent requires an
+// additional parameter for learning rate scheduling.
+//
+// float l1 = ...
+// float l2 = ...
+// DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
+//
+// 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
+// loss values across epochs, and 'weights' contains the best parameters.
+//
+// Weights weights = Weights::Zero(num_features); // or other initialization
+// vector<float> loss;
+// int max_epochs = 100;
+// bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
+//
+// Note: algorithm-specific parameters such as learning rates are set based on
+// the data and the loss function. If the dataset or the loss pointed to by
+// 'gradient_evaluator' change, the user must call loss_minimizer.Setup() method
+// before loss_minimizer.Run().
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <functional>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/minimizers/gradient-evaluator.h"
+#include "lossmin/losses/loss-function.h"
+#include "third_party/eigen/Eigen/Core"
+
+namespace lossmin {
+
+// Interface for gradient descent algorithms that minimize a loss function over
+// a labeled dataset. A GradientEvaluator object provides the loss value and
+// gradient for given parameters at each epoch. Derived classes should be
+// thread-compatible, and must provide the methods EpochUpdate() and Setup().
+class LossMinimizer {
+ public:
+ // Constructor sets the l1 and l2 regularization parameters and
+ // 'gradient_evalutor_'.
+ LossMinimizer(
+ float l1, float l2, const GradientEvaluator &gradient_evaluator)
+ : l1_(l1), l2_(l2), gradient_evaluator_(gradient_evaluator),
+ converged_(false) {
+ std::random_device rd;
+ rnd_.seed(rd());
+ }
+ virtual ~LossMinimizer() {}
+
+ // Initializes algorithm-specific parameters such as learning rates, based
+ // on the loss function and the dataset. Called in the constructor of the
+ // derived classes. If the dataset that 'gradient_evaluator' points to
+ // changes, Setup() must be called again by the user.
+ virtual void Setup() = 0;
+
+ // Runs minimization until convergence, or up to 'max_epochs' epochs. Returns
+ // true if the algorithm has converged. Parameters 'weights' should be
+ // initialized by the user, and will contain the parameters produced at the
+ // last epoch. 'loss' is filled with training loss values produced every
+ // 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
+ bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
+ Weights *weights, std::vector<float> *loss);
+
+ // Runs minimization, evaluating loss on both training and validation data.
+ bool Run(
+ int max_epochs, int loss_epochs, int convergence_epochs,
+ const InstanceSet &validation_instances,
+ const LabelSet &validation_labels, Weights *weights,
+ std::vector<float> *training_loss, std::vector<float> *validation_loss);
+
+ // Convenience Run method that evaluates the loss and checks for convergence
+ // at every iteration.
+ bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
+ return Run(max_epochs, 1, 1, weights, loss);
+ }
+
+ // Abstract method that updates the weights in the current iteration.
+ virtual void EpochUpdate(Weights *weights, int epoch,
+ bool check_convergence) = 0;
+
+ // Returns the total loss for given parameters 'weights', including l1 and l2
+ // regularization.
+ float Loss(const Weights &weights) const {
+ float loss = gradient_evaluator_.Loss(weights);
+ if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
+ if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
+ return loss;
+ }
+
+ // Checks convergence of deterministic methods based on two criteria:
+ // (1) the squared l2 norm of the gradient is below 'convergence_threshold_'
+ // (2) for zero-valued weights, the corresponding gradient values are < l1.
+ // If converged, the flag 'converged_' is set to true.
+ void ConvergenceCheck(const Weights &weights, const Weights &gradient);
+
+ // Checks convergence based on the decrease in loss over the last
+ // 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
+ // set to true.
+ void SimpleConvergenceCheck(const std::vector<float> &loss);
+
+ // Setters and getters for convergence criteria parameters.
+ bool converged() const { return converged_; }
+ void set_converged(bool converged) { converged_ = converged; }
+ void set_use_simple_convergence_check(bool use_simple_convergence_check) {
+ use_simple_convergence_check_ = use_simple_convergence_check;
+ }
+ void set_convergence_threshold(float convergence_threshold) {
+ convergence_threshold_ = convergence_threshold;
+ }
+ void set_num_convergence_epochs(int num_convergence_epochs) {
+ num_convergence_epochs_ = num_convergence_epochs;
+ }
+
+ // Returns the best initial learning rate for stochastic methods by evaluating
+ // elements of 'initial_rates_' on a subset of 'num_search_examples_' training
+ // examples.
+ float BestLearningRate();
+
+ // Methods that set parameters related to the initial learning rate in
+ // stochastic methods.
+ virtual void set_initial_learning_rate(float initial_learning_rate) {
+ initial_learning_rate_ = initial_learning_rate;
+ //LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_;
+ }
+ float initial_learning_rate() const { return initial_learning_rate_; }
+ void set_initial_rates(const std::vector<float> &initial_rates) {
+ initial_rates_ = initial_rates;
+ }
+ void set_num_search_examples(int num_search_examples) {
+ num_search_examples_ = num_search_examples;
+ }
+
+ // Sets the seed of the random number generator.
+ void SetSeed(int32_t seed) { rnd_.seed(seed); }
+
+ // Number of threads for stochastic methods.
+ void set_num_threads(int num_threads) { num_threads_ = num_threads; }
+ int num_threads() const { return num_threads_; }
+
+ // Minibatch size for stochastic methods.
+ void set_batch_size(int batch_size);
+ int NumBatches() const { return batch_examples_.size(); }
+
+ // Shuffles the batches with examples.
+ void ShuffleBatches() {
+ std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_);
+ }
+
+ // Shuffles and returns the examples in 'batch_examples_[batch]'.
+ const std::vector<int> &batch_examples(int batch) {
+ //DCHECK(batch < batch_examples_.size());
+ std::shuffle(batch_examples_[batch].begin(),
+ batch_examples_[batch].end(), rnd_);
+ return batch_examples_[batch];
+ }
+
+ // Returns a reference to 'gradient_evaluator_'.
+ const GradientEvaluator &gradient_evaluator() const {
+ return gradient_evaluator_;
+ }
+
+ // Returns the l1 regularization parameter.
+ float l1() const { return l1_; }
+
+ // Returns the l2 regularization parameter.
+ float l2() const { return l2_; }
+
+ // Getter/setter for the 'num_scale_iterations_', used for sparse updates in
+ // stochastic methods.
+ int num_scale_iterations() const { return num_scale_iterations_; }
+ void set_num_scale_iterations(int num_scale_iterations) {
+ num_scale_iterations_ = num_scale_iterations;
+ }
+
+ // Getter/setter for 'num_iterations_per_stage_', used in
+ // StochasticVarianceReducedGradient.
+ int num_iterations_per_stage() const {
+ return num_iterations_per_stage_;
+ }
+ void set_num_iterations_per_stage(int num_iterations_per_stage) {
+ num_iterations_per_stage_ = num_iterations_per_stage;
+ }
+
+ // Getter/setter for projecting weights onto a ball in
+ // StochasticGradientDescent.
+ bool project_weights() const { return project_weights_; }
+ void set_project_weights(bool project_weights) {
+ project_weights_ = project_weights;
+ }
+
+ // Applies L1Prox coefficientwise to 'weights' and 'threshold'.
+ static void L1Prox(float threshold, Weights *weights) {
+ for (int i = 0; i < weights->size(); ++i) {
+ weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
+ }
+ }
+
+ // Applies L1Prox coefficientwise to 'weights' and 'threshold', where
+ // 'threshold' is a vector of per-coordinate thresholds.
+ static void L1Prox(const VectorXf &threshold, Weights *weights) {
+ for (int i = 0; i < weights->size(); ++i) {
+ weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
+ }
+ }
+
+ // Returns sign('x') * max(0.0, abs('x') - 'threshold').
+ static inline float L1Prox(float x, float threshold) {
+ return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
+ }
+
+ // Returns sign('x').
+ static inline float Sign(float x) {
+ if (x > 0.0f) return 1.0f;
+ if (x < 0.0f) return -1.0f;
+ return 0.0f;
+ }
+
+ // Default initial learning rate for stochastic methods.
+ const float kDefaultInitialLearningRate = 0.05;
+
+ private:
+ // Regularization parameters.
+ float l1_;
+ float l2_;
+
+ // GradientEvaluator used to compute the (unregularized) loss and gradient.
+ const GradientEvaluator &gradient_evaluator_;
+
+ // Convergence parameters.
+ bool converged_ = false; // convergence flag set by convergence checks
+ float convergence_threshold_ = 1e-5; // threshold for assessing convergence
+ bool use_simple_convergence_check_ = false; // which convergence check to use
+ int num_convergence_epochs_ = 5; // used in SimpleConvergenceCheck
+
+ // Initial learning rate, used in stochastic methods.
+ float initial_learning_rate_ = 0.01;
+
+ // Parameters used to search for a good initial learning rate for stochastic
+ // methods in the method BestLearningRate(). 'initial_rates_' are evaluated on
+ // a data subset of size 'num_search_examples_'.
+ std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05};
+ int num_search_examples_ = 500;
+
+ // Multithreading parameters, optionally used in stochastic methods.
+ int num_threads_;
+ std::vector<std::vector<int>> batch_examples_;
+
+ // Pseudo-random number generator for shuffling 'batch_examples_'.
+ std::mt19937 rnd_;
+
+ // In stochastic methods, weights = scale * Weights. For numerical stability,
+ // this is reset to scale = 1.0 every 'num_scale_iterations_' iterations.
+ int num_scale_iterations_ = 100;
+
+ // Number of iterations per stage; used in StochasticVarianceReducedGradient.
+ int num_iterations_per_stage_ = 100000;
+
+ // For online/stochastic methods, a simple bound on the optimal solution is
+ // |weights|^2 <= 2 / L2. Knowing this, if an intermediate solution is
+ // outside of a ball of radius sqrt(2 / L2), we can project it onto the ball
+ // to potentially speed up convergence. This approach is used in
+ // StochasticGradientDescent, and controlled by the flag 'project_weights_'.
+ bool project_weights_ = true;
+};
+
+} // namespace lossmin
+
diff --git a/minimizers/parallel-boosting-with-momentum.cc b/minimizers/parallel-boosting-with-momentum.cc
new file mode 100644
index 0000000..7254401
--- /dev/null
+++ b/minimizers/parallel-boosting-with-momentum.cc
@@ -0,0 +1,61 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+
+#include "lossmin/minimizers/parallel-boosting-with-momentum.h"
+
+#include <math.h>
+#include <functional>
+
+#include "lossmin/minimizers/gradient-evaluator.h"
+#include "third_party/eigen/Eigen/Core"
+
+namespace lossmin {
+
+void ParallelBoostingWithMomentum::Setup() {
+ // Per-coordinate learning rates are learning_rates[j] = 1 / sparsity * Lj,
+ // where sparsity is the maximum instance l0 norm and Lj is upper bound on
+ // the loss curvature along coordinate j.
+ float sparsity = gradient_evaluator().Sparsity();
+ gradient_evaluator().PerCoordinateCurvature(&learning_rates_);
+ learning_rates_ =
+ (learning_rates_.array() + l2()).inverse().matrix() / sparsity;
+
+ // Initialize the approximating function parameters.
+ alpha_ = 0.5f;
+ beta_ = 1.0f - alpha_;
+ phi_center_ = Weights::Zero(gradient_evaluator().NumWeights());
+}
+
+void ParallelBoostingWithMomentum::EpochUpdate(Weights *weights, int epoch,
+ bool check_convergence) {
+ // Compute the intermediate weight vector y.
+ Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
+
+ // Compute the gradient of the loss wrt y.
+ Weights gradient_wrt_y = Weights::Zero(y.size());
+ gradient_evaluator().Gradient(y, &gradient_wrt_y);
+ if (l2() > 0.0f) gradient_wrt_y += l2() * y;
+
+ // Gradient step.
+ *weights -= gradient_wrt_y.cwiseProduct(learning_rates_);
+
+ // l1 shrinkage.
+ if (l1() > 0.0f) {
+ L1Prox(l1() * learning_rates_, weights);
+ gradient_wrt_y += l1() * weights->unaryExpr(std::ptr_fun(Sign));
+ }
+
+ // Update the approximation function.
+ phi_center_ -= (1.0 - alpha_) / alpha_ * (y - *weights);
+ alpha_ =
+ -beta_ / 2.0 +
+ pow(beta_ + beta_ * beta_ / 4.0, static_cast<float>(0.5));
+ beta_ *= (1.0 - alpha_);
+
+ // Check convergence.
+ if (check_convergence) ConvergenceCheck(*weights, gradient_wrt_y);
+}
+
+} // namespace lossmin
diff --git a/minimizers/parallel-boosting-with-momentum.h b/minimizers/parallel-boosting-with-momentum.h
new file mode 100644
index 0000000..619dfe8
--- /dev/null
+++ b/minimizers/parallel-boosting-with-momentum.h
@@ -0,0 +1,60 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Author: nevena@google.com (Nevena Lazic)
+//
+// An implementation of:
+// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
+// Momentum", ECML PKDD 2013.
+// Variable names follow the notation in the paper.
+
+#pragma once
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/minimizers/gradient-evaluator.h"
+#include "lossmin/minimizers/loss-minimizer.h"
+
+namespace lossmin {
+
+class GradientEvaluator;
+
+class ParallelBoostingWithMomentum : public LossMinimizer {
+ public:
+ ParallelBoostingWithMomentum(
+ float l1, float l2, const GradientEvaluator &gradient_evaluator)
+ : LossMinimizer(l1, l2, gradient_evaluator) {
+ Setup();
+ }
+
+ // Sets learning rates and other parameters.
+ void Setup() override;
+
+ private:
+ // Updates 'weights' and the quadratic approximation function phi(w), such
+ // that at iteration k, loss(weights_k) <= min_w phi_k(w).
+ // y = (1 - alpha) * weights + alpha * phi_center
+ // grad_y = loss_grad(y) + l2 * y
+ // weights[j] = weights[j] - grad_y[j] / learning_rates[j]
+ // weights[j] =
+ // sign(weights[j]) * max(0, weights[j], l1 / learning_rates[j])
+ void EpochUpdate(Weights *weights, int epoch,
+ bool check_convergence) override;
+
+ // Per-coordinate learning rates.
+ VectorXf learning_rates_;
+
+ // Center of the approximating quadratic function phi.
+ VectorXf phi_center_;
+
+ // Parameter for updating the approximation function phi. At each iteration,
+ // 'alpha_' is updated to the solution of the quadratic equation
+ // alpha_^2 = beta_ * (1.0 - alpha_)
+ float alpha_;
+
+ // Parameter used to update alpha, defined as
+ // beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
+ float beta_;
+};
+
+} // namespace lossmin
+
diff --git a/sample.cc b/sample.cc
new file mode 100644
index 0000000..ee6d31c
--- /dev/null
+++ b/sample.cc
@@ -0,0 +1,53 @@
+// Copyright 2017 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <memory>
+#include <random>
+#include <iostream>
+
+#include "lossmin/eigen-types.h"
+#include "lossmin/losses/loss-function.h"
+#include "lossmin/losses/inner-product-loss-function.h"
+#include "minimizers/gradient-evaluator.h"
+#include "minimizers/loss-minimizer.h"
+#include "minimizers/parallel-boosting-with-momentum.h"
+
+using namespace lossmin;
+
+int main(int argc, char **argv) {
+ LossFunction *loss_func = new LinearRegressionLossFunction();
+
+ const int rows_num = 10000;
+ const int features_num = 3;
+
+
+ InstanceSet instances(rows_num, features_num);
+
+ std::default_random_engine generator;
+ std::uniform_real_distribution<float> distribution(-1.0,1.0);
+
+ for (int row = 0; row < rows_num; row++) {
+ for (int feature = 0; feature < features_num; feature++) {
+ instances.insert(row, feature) = distribution(generator);
+ }
+ }
+
+ Weights real_weights(features_num, 1);
+ for (int feature = 0; feature < features_num; feature++) {
+ real_weights.coeffRef(feature, 0) = distribution(generator);
+ }
+ std::cout << "Real Weights:" << std::endl << real_weights << std::endl;
+
+ LabelSet labels = instances * real_weights;
+
+ GradientEvaluator gradient_evaluator(instances, labels, loss_func);
+ LossMinimizer *minimizer = new ParallelBoostingWithMomentum(
+ 0.0, 0.0, gradient_evaluator);
+ minimizer->Setup();
+
+ Weights weights(features_num, 1);
+ std::vector<float> loss;
+ minimizer->Run(5000, &weights, &loss);
+ std::cout << "Learned Weights:" << std::endl << weights << std::endl;
+}