minimizers/loss-minimizer.h - lossmin - Git at Google

 // Copyright 2014 Google Inc. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 // Author: nevena@google.com (Nevena Lazic)
 //
 // Interface for gradient descent algorithms. Example usage:
 //
 // 1. Create a GradientEvaluator object that computes the value and gradient of
 //    a loss function on a given dataset.
 //
 //    InstanceSet instances = ...
 //    LabelSet labels = ...
 //    LossFunction *loss_function = LossFunction::Create("logistic-regression");
 //    GradientEvaluator gradient_evaluator(instances, labels, loss_function);
 //
 // 2. Create a LossMinimizer object from 'gradient_evaluator' and l1 and l2
 //    regularization parameters. StochasticGradientDescent requires an
 //    additional parameter for learning rate scheduling.
 //
 //    float l1 = ...
 //    float l2 = ...
 //    DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
 //
 // 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
 //    loss values across epochs, and 'weights' contains the best parameters.
 //
 //    Weights weights = Weights::Zero(num_features);  // or other initialization
 //    vector<float> loss;
 //    int max_epochs = 100;
 //    bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
 //
 // Note: algorithm-specific parameters such as learning rates are set based on
 // the data and the loss function. If the dataset or the loss pointed to by
 // 'gradient_evaluator' change, the user must call loss_minimizer.Setup() method
 // before loss_minimizer.Run().

 #pragma once

 #include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <functional>
 #include <random>
 #include <string>
 #include <vector>

 #include "lossmin/eigen-types.h"
 #include "lossmin/minimizers/gradient-evaluator.h"
 #include "lossmin/losses/loss-function.h"
 #include "third_party/eigen/Eigen/Core"

 namespace lossmin {

 // Interface for gradient descent algorithms that minimize a loss function over
 // a labeled dataset. A GradientEvaluator object provides the loss value and
 // gradient for given parameters at each epoch. Derived classes should be
 // thread-compatible, and must provide the methods EpochUpdate() and Setup().
 class LossMinimizer {
  public:
   // Constructor sets the l1 and l2 regularization parameters and
   // 'gradient_evalutor_'.
   LossMinimizer(
       float l1, float l2, const GradientEvaluator &gradient_evaluator)
       : l1_(l1), l2_(l2), gradient_evaluator_(gradient_evaluator),
         converged_(false) {
     std::random_device rd;
     rnd_.seed(rd());
   }
   virtual ~LossMinimizer() {}

   // Initializes algorithm-specific parameters such as learning rates, based
   // on the loss function and the dataset. Called in the constructor of the
   // derived classes. If the dataset that 'gradient_evaluator' points to
   // changes, Setup() must be called again by the user.
   virtual void Setup() = 0;

   // Runs minimization until convergence, or up to 'max_epochs' epochs. Returns
   // true if the algorithm has converged. Parameters 'weights' should be
   // initialized by the user, and will contain the parameters produced at the
   // last epoch. 'loss' is filled with training loss values produced every
   // 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
   bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
            Weights *weights, std::vector<float> *loss);

   // Runs minimization, evaluating loss on both training and validation data.
   bool Run(
       int max_epochs, int loss_epochs, int convergence_epochs,
       const InstanceSet &validation_instances,
       const LabelSet &validation_labels, Weights *weights,
       std::vector<float> *training_loss, std::vector<float> *validation_loss);

   // Convenience Run method that evaluates the loss and checks for convergence
   // at every iteration.
   bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
     return Run(max_epochs, 1, 1, weights, loss);
   }

   // Abstract method that updates the weights in the current iteration.
   virtual void EpochUpdate(Weights *weights, int epoch,
                            bool check_convergence) = 0;

   // Returns the total loss for given parameters 'weights', including l1 and l2
   // regularization.
   float Loss(const Weights &weights) const {
     float loss = gradient_evaluator_.Loss(weights);
     if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
     if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
     return loss;
   }

   // Checks convergence of deterministic methods based on two criteria:
   // (1) the squared l2 norm of the gradient is below 'convergence_threshold_'
   // (2) for zero-valued weights, the corresponding gradient values are < l1.
   // If converged, the flag 'converged_' is set to true.
   void ConvergenceCheck(const Weights &weights, const Weights &gradient);

   // Checks convergence based on the decrease in loss over the last
   // 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
   // set to true.
   void SimpleConvergenceCheck(const std::vector<float> &loss);

   // Setters and getters for convergence criteria parameters.
   bool converged() const { return converged_; }
   void set_converged(bool converged) { converged_ = converged; }
   void set_use_simple_convergence_check(bool use_simple_convergence_check) {
     use_simple_convergence_check_ = use_simple_convergence_check;
   }
   void set_convergence_threshold(float convergence_threshold) {
     convergence_threshold_ = convergence_threshold;
   }
   void set_num_convergence_epochs(int num_convergence_epochs) {
     num_convergence_epochs_ = num_convergence_epochs;
   }

   // Returns the best initial learning rate for stochastic methods by evaluating
   // elements of 'initial_rates_' on a subset of 'num_search_examples_' training
   // examples.
   float BestLearningRate();

   // Methods that set parameters related to the initial learning rate in
   // stochastic methods.
   virtual void set_initial_learning_rate(float initial_learning_rate) {
     initial_learning_rate_ = initial_learning_rate;
     //LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_;
   }
   float initial_learning_rate() const { return initial_learning_rate_; }
   void set_initial_rates(const std::vector<float> &initial_rates) {
     initial_rates_ = initial_rates;
   }
   void set_num_search_examples(int num_search_examples) {
     num_search_examples_ = num_search_examples;
   }

   // Sets the seed of the random number generator.
   void SetSeed(int32_t seed) { rnd_.seed(seed); }

   // Number of threads for stochastic methods.
   void set_num_threads(int num_threads) { num_threads_ = num_threads; }
   int num_threads() const { return num_threads_; }

   // Minibatch size for stochastic methods.
   void set_batch_size(int batch_size);
   int NumBatches() const { return batch_examples_.size(); }

   // Shuffles the batches with examples.
   void ShuffleBatches() {
     std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_);
   }

   // Shuffles and returns the examples in 'batch_examples_[batch]'.
   const std::vector<int> &batch_examples(int batch) {
     //DCHECK(batch < batch_examples_.size());
     std::shuffle(batch_examples_[batch].begin(),
         batch_examples_[batch].end(), rnd_);
     return batch_examples_[batch];
   }

   // Returns a reference to 'gradient_evaluator_'.
   const GradientEvaluator &gradient_evaluator() const {
     return gradient_evaluator_;
   }

   // Returns the l1 regularization parameter.
   float l1() const { return l1_; }

   // Returns the l2 regularization parameter.
   float l2() const { return l2_; }

   // Getter/setter for the 'num_scale_iterations_', used for sparse updates in
   // stochastic methods.
   int num_scale_iterations() const { return num_scale_iterations_; }
   void set_num_scale_iterations(int num_scale_iterations) {
     num_scale_iterations_ = num_scale_iterations;
   }

   // Getter/setter for 'num_iterations_per_stage_', used in
   // StochasticVarianceReducedGradient.
   int num_iterations_per_stage() const {
     return num_iterations_per_stage_;
   }
   void set_num_iterations_per_stage(int num_iterations_per_stage) {
     num_iterations_per_stage_ = num_iterations_per_stage;
   }

   // Getter/setter for projecting weights onto a ball in
   // StochasticGradientDescent.
   bool project_weights() const { return project_weights_; }
   void set_project_weights(bool project_weights) {
     project_weights_ = project_weights;
   }

   // Applies L1Prox coefficientwise to 'weights' and 'threshold'.
   static void L1Prox(float threshold, Weights *weights) {
     for (int i = 0; i < weights->size(); ++i) {
       weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
     }
   }

   // Applies L1Prox coefficientwise to 'weights' and 'threshold', where
   // 'threshold' is a vector of per-coordinate thresholds.
   static void L1Prox(const VectorXf &threshold, Weights *weights) {
     for (int i = 0; i < weights->size(); ++i) {
       weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
     }
   }

   // Returns sign('x') * max(0.0, abs('x') - 'threshold').
   static inline float L1Prox(float x, float threshold) {
     return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
   }

   // Returns sign('x').
   static inline float Sign(float x) {
     if (x > 0.0f) return 1.0f;
     if (x < 0.0f) return -1.0f;
     return 0.0f;
   }

   // Default initial learning rate for stochastic methods.
   const float kDefaultInitialLearningRate = 0.05;

  private:
   // Regularization parameters.
   float l1_;
   float l2_;

   // GradientEvaluator used to compute the (unregularized) loss and gradient.
   const GradientEvaluator &gradient_evaluator_;

   // Convergence parameters.
   bool converged_ = false;  // convergence flag set by convergence checks
   float convergence_threshold_ = 1e-5;  // threshold for assessing convergence
   bool use_simple_convergence_check_ = false;  // which convergence check to use
   int num_convergence_epochs_ = 5;  // used in SimpleConvergenceCheck

   // Initial learning rate, used in stochastic methods.
   float initial_learning_rate_ = 0.01;

   // Parameters used to search for a good initial learning rate for stochastic
   // methods in the method BestLearningRate(). 'initial_rates_' are evaluated on
   // a data subset of size 'num_search_examples_'.
   std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05};
   int num_search_examples_ = 500;

   // Multithreading parameters, optionally used in stochastic methods.
   int num_threads_;
   std::vector<std::vector<int>> batch_examples_;

   // Pseudo-random number generator for shuffling 'batch_examples_'.
   std::mt19937 rnd_;

   // In stochastic methods, weights = scale * Weights. For numerical stability,
   // this is reset to scale = 1.0 every 'num_scale_iterations_' iterations.
   int num_scale_iterations_ = 100;

   // Number of iterations per stage; used in StochasticVarianceReducedGradient.
   int num_iterations_per_stage_ = 100000;

   // For online/stochastic methods, a simple bound on the optimal solution is
   // |weights|^2 <= 2 / L2. Knowing this, if an intermediate solution is
   // outside of a ball of radius sqrt(2 / L2), we can project it onto the ball
   // to potentially speed up convergence. This approach is used in
   // StochasticGradientDescent, and controlled by the flag 'project_weights_'.
   bool project_weights_ = true;
 };

 }  // namespace lossmin
	// Copyright 2014 Google Inc. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.
	// Author: nevena@google.com (Nevena Lazic)
	//
	// Interface for gradient descent algorithms. Example usage:
	//
	// 1. Create a GradientEvaluator object that computes the value and gradient of
	// a loss function on a given dataset.
	//
	// InstanceSet instances = ...
	// LabelSet labels = ...
	// LossFunction *loss_function = LossFunction::Create("logistic-regression");
	// GradientEvaluator gradient_evaluator(instances, labels, loss_function);
	//
	// 2. Create a LossMinimizer object from 'gradient_evaluator' and l1 and l2
	// regularization parameters. StochasticGradientDescent requires an
	// additional parameter for learning rate scheduling.
	//
	// float l1 = ...
	// float l2 = ...
	// DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
	//
	// 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
	// loss values across epochs, and 'weights' contains the best parameters.
	//
	// Weights weights = Weights::Zero(num_features); // or other initialization
	// vector<float> loss;
	// int max_epochs = 100;
	// bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
	//
	// Note: algorithm-specific parameters such as learning rates are set based on
	// the data and the loss function. If the dataset or the loss pointed to by
	// 'gradient_evaluator' change, the user must call loss_minimizer.Setup() method
	// before loss_minimizer.Run().

	#pragma once

	#include <algorithm>
	#include <cmath>
	#include <cstdint>
	#include <functional>
	#include <random>
	#include <string>
	#include <vector>

	#include "lossmin/eigen-types.h"
	#include "lossmin/minimizers/gradient-evaluator.h"
	#include "lossmin/losses/loss-function.h"
	#include "third_party/eigen/Eigen/Core"

	namespace lossmin {

	// Interface for gradient descent algorithms that minimize a loss function over
	// a labeled dataset. A GradientEvaluator object provides the loss value and
	// gradient for given parameters at each epoch. Derived classes should be
	// thread-compatible, and must provide the methods EpochUpdate() and Setup().
	class LossMinimizer {
	public:
	// Constructor sets the l1 and l2 regularization parameters and
	// 'gradient_evalutor_'.
	LossMinimizer(
	float l1, float l2, const GradientEvaluator &gradient_evaluator)
	: l1_(l1), l2_(l2), gradient_evaluator_(gradient_evaluator),
	converged_(false) {
	std::random_device rd;
	rnd_.seed(rd());
	}
	virtual ~LossMinimizer() {}

	// Initializes algorithm-specific parameters such as learning rates, based
	// on the loss function and the dataset. Called in the constructor of the
	// derived classes. If the dataset that 'gradient_evaluator' points to
	// changes, Setup() must be called again by the user.
	virtual void Setup() = 0;

	// Runs minimization until convergence, or up to 'max_epochs' epochs. Returns
	// true if the algorithm has converged. Parameters 'weights' should be
	// initialized by the user, and will contain the parameters produced at the
	// last epoch. 'loss' is filled with training loss values produced every
	// 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
	bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
	Weights weights, std::vector<float> loss);

	// Runs minimization, evaluating loss on both training and validation data.
	bool Run(
	int max_epochs, int loss_epochs, int convergence_epochs,
	const InstanceSet &validation_instances,
	const LabelSet &validation_labels, Weights *weights,
	std::vector<float> training_loss, std::vector<float> validation_loss);

	// Convenience Run method that evaluates the loss and checks for convergence
	// at every iteration.
	bool Run(int max_epochs, Weights weights, std::vector<float> loss) {
	return Run(max_epochs, 1, 1, weights, loss);
	}

	// Abstract method that updates the weights in the current iteration.
	virtual void EpochUpdate(Weights *weights, int epoch,
	bool check_convergence) = 0;

	// Returns the total loss for given parameters 'weights', including l1 and l2
	// regularization.
	float Loss(const Weights &weights) const {
	float loss = gradient_evaluator_.Loss(weights);
	if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
	if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
	return loss;
	}

	// Checks convergence of deterministic methods based on two criteria:
	// (1) the squared l2 norm of the gradient is below 'convergence_threshold_'
	// (2) for zero-valued weights, the corresponding gradient values are < l1.
	// If converged, the flag 'converged_' is set to true.
	void ConvergenceCheck(const Weights &weights, const Weights &gradient);

	// Checks convergence based on the decrease in loss over the last
	// 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
	// set to true.
	void SimpleConvergenceCheck(const std::vector<float> &loss);

	// Setters and getters for convergence criteria parameters.
	bool converged() const { return converged_; }
	void set_converged(bool converged) { converged_ = converged; }
	void set_use_simple_convergence_check(bool use_simple_convergence_check) {
	use_simple_convergence_check_ = use_simple_convergence_check;
	}
	void set_convergence_threshold(float convergence_threshold) {
	convergence_threshold_ = convergence_threshold;
	}
	void set_num_convergence_epochs(int num_convergence_epochs) {
	num_convergence_epochs_ = num_convergence_epochs;
	}

	// Returns the best initial learning rate for stochastic methods by evaluating
	// elements of 'initial_rates_' on a subset of 'num_search_examples_' training
	// examples.
	float BestLearningRate();

	// Methods that set parameters related to the initial learning rate in
	// stochastic methods.
	virtual void set_initial_learning_rate(float initial_learning_rate) {
	initial_learning_rate_ = initial_learning_rate;
	//LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_;
	}
	float initial_learning_rate() const { return initial_learning_rate_; }
	void set_initial_rates(const std::vector<float> &initial_rates) {
	initial_rates_ = initial_rates;
	}
	void set_num_search_examples(int num_search_examples) {
	num_search_examples_ = num_search_examples;
	}

	// Sets the seed of the random number generator.
	void SetSeed(int32_t seed) { rnd_.seed(seed); }

	// Number of threads for stochastic methods.
	void set_num_threads(int num_threads) { num_threads_ = num_threads; }
	int num_threads() const { return num_threads_; }

	// Minibatch size for stochastic methods.
	void set_batch_size(int batch_size);
	int NumBatches() const { return batch_examples_.size(); }

	// Shuffles the batches with examples.
	void ShuffleBatches() {
	std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_);
	}

	// Shuffles and returns the examples in 'batch_examples_[batch]'.
	const std::vector<int> &batch_examples(int batch) {
	//DCHECK(batch < batch_examples_.size());
	std::shuffle(batch_examples_[batch].begin(),
	batch_examples_[batch].end(), rnd_);
	return batch_examples_[batch];
	}

	// Returns a reference to 'gradient_evaluator_'.
	const GradientEvaluator &gradient_evaluator() const {
	return gradient_evaluator_;
	}

	// Returns the l1 regularization parameter.
	float l1() const { return l1_; }

	// Returns the l2 regularization parameter.
	float l2() const { return l2_; }

	// Getter/setter for the 'num_scale_iterations_', used for sparse updates in
	// stochastic methods.
	int num_scale_iterations() const { return num_scale_iterations_; }
	void set_num_scale_iterations(int num_scale_iterations) {
	num_scale_iterations_ = num_scale_iterations;
	}

	// Getter/setter for 'num_iterations_per_stage_', used in
	// StochasticVarianceReducedGradient.
	int num_iterations_per_stage() const {
	return num_iterations_per_stage_;
	}
	void set_num_iterations_per_stage(int num_iterations_per_stage) {
	num_iterations_per_stage_ = num_iterations_per_stage;
	}

	// Getter/setter for projecting weights onto a ball in
	// StochasticGradientDescent.
	bool project_weights() const { return project_weights_; }
	void set_project_weights(bool project_weights) {
	project_weights_ = project_weights;
	}

	// Applies L1Prox coefficientwise to 'weights' and 'threshold'.
	static void L1Prox(float threshold, Weights *weights) {
	for (int i = 0; i < weights->size(); ++i) {
	weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
	}
	}

	// Applies L1Prox coefficientwise to 'weights' and 'threshold', where
	// 'threshold' is a vector of per-coordinate thresholds.
	static void L1Prox(const VectorXf &threshold, Weights *weights) {
	for (int i = 0; i < weights->size(); ++i) {
	weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
	}
	}

	// Returns sign('x') * max(0.0, abs('x') - 'threshold').
	static inline float L1Prox(float x, float threshold) {
	return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
	}

	// Returns sign('x').
	static inline float Sign(float x) {
	if (x > 0.0f) return 1.0f;
	if (x < 0.0f) return -1.0f;
	return 0.0f;
	}

	// Default initial learning rate for stochastic methods.
	const float kDefaultInitialLearningRate = 0.05;

	private:
	// Regularization parameters.
	float l1_;
	float l2_;

	// GradientEvaluator used to compute the (unregularized) loss and gradient.
	const GradientEvaluator &gradient_evaluator_;

	// Convergence parameters.
	bool converged_ = false; // convergence flag set by convergence checks
	float convergence_threshold_ = 1e-5; // threshold for assessing convergence
	bool use_simple_convergence_check_ = false; // which convergence check to use
	int num_convergence_epochs_ = 5; // used in SimpleConvergenceCheck

	// Initial learning rate, used in stochastic methods.
	float initial_learning_rate_ = 0.01;

	// Parameters used to search for a good initial learning rate for stochastic
	// methods in the method BestLearningRate(). 'initial_rates_' are evaluated on
	// a data subset of size 'num_search_examples_'.
	std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05};
	int num_search_examples_ = 500;

	// Multithreading parameters, optionally used in stochastic methods.
	int num_threads_;
	std::vector<std::vector<int>> batch_examples_;

	// Pseudo-random number generator for shuffling 'batch_examples_'.
	std::mt19937 rnd_;

	// In stochastic methods, weights = scale * Weights. For numerical stability,
	// this is reset to scale = 1.0 every 'num_scale_iterations_' iterations.
	int num_scale_iterations_ = 100;

	// Number of iterations per stage; used in StochasticVarianceReducedGradient.
	int num_iterations_per_stage_ = 100000;

	// For online/stochastic methods, a simple bound on the optimal solution is
	// \|weights\|^2 <= 2 / L2. Knowing this, if an intermediate solution is
	// outside of a ball of radius sqrt(2 / L2), we can project it onto the ball
	// to potentially speed up convergence. This approach is used in
	// StochasticGradientDescent, and controlled by the flag 'project_weights_'.
	bool project_weights_ = true;
	};

	} // namespace lossmin