blob: 0f05d10d07830cb32e86de517ebb77389b178df7 [file] [log] [blame]
// Copyright 2014 Google Inc. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// Interface for gradient descent algorithms. Example usage:
//
// 1. Create a GradientEvaluator object that computes the value and gradient of
// a loss function on a given dataset.
//
// InstanceSet instances = ...
// LabelSet labels = ...
// LossFunction *loss_function = LossFunction::Create("logistic-regression");
// GradientEvaluator gradient_evaluator(instances, labels, loss_function);
//
// 2. Create a LossMinimizer object from 'gradient_evaluator' and l1 and l2
// regularization parameters. StochasticGradientDescent requires an
// additional parameter for learning rate scheduling.
//
// float l1 = ...
// float l2 = ...
// DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
//
// 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
// loss values across epochs, and 'weights' contains the best parameters.
//
// Weights weights = Weights::Zero(num_features); // or other initialization
// vector<float> loss;
// int max_epochs = 100;
// bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
//
// Note: algorithm-specific parameters such as learning rates are set based on
// the data and the loss function. If the dataset or the loss pointed to by
// 'gradient_evaluator' change, the user must call loss_minimizer.Setup() method
// before loss_minimizer.Run().
#pragma once
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <random>
#include <string>
#include <vector>
#include "lossmin/eigen-types.h"
#include "lossmin/losses/loss-function.h"
#include "lossmin/minimizers/gradient-evaluator.h"
#include "third_party/eigen/Eigen/Core"
namespace lossmin {
// Interface for gradient descent algorithms that minimize a loss function over
// a labeled dataset. A GradientEvaluator object provides the loss value and
// gradient for given parameters at each epoch. Derived classes should be
// thread-compatible, and must provide the methods EpochUpdate() and Setup().
class LossMinimizer {
public:
// Constructor sets the l1 and l2 regularization parameters and
// 'gradient_evalutor_'.
LossMinimizer(float l1, float l2, const GradientEvaluator &gradient_evaluator)
: l1_(l1),
l2_(l2),
gradient_evaluator_(gradient_evaluator),
converged_(false) {
std::random_device rd;
rnd_.seed(rd());
}
virtual ~LossMinimizer() {}
// Initializes algorithm-specific parameters such as learning rates, based
// on the loss function and the dataset. Called in the constructor of the
// derived classes. If the dataset that 'gradient_evaluator' points to
// changes, Setup() must be called again by the user.
virtual void Setup() = 0;
// Runs minimization until convergence, or up to 'max_epochs' epochs. Returns
// true if the algorithm has converged. Parameters 'weights' should be
// initialized by the user, and will contain the parameters produced at the
// last epoch. 'loss' is filled with training loss values produced every
// 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
Weights *weights, std::vector<float> *loss);
// Runs minimization, evaluating loss on both training and validation data.
bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
const InstanceSet &validation_instances,
const LabelSet &validation_labels, Weights *weights,
std::vector<float> *training_loss,
std::vector<float> *validation_loss);
// Convenience Run method that evaluates the loss and checks for convergence
// at every iteration.
bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
return Run(max_epochs, 1, 1, weights, loss);
}
// Abstract method that updates the weights in the current iteration.
virtual void EpochUpdate(Weights *weights, int epoch,
bool check_convergence) = 0;
// Returns the total loss for given parameters 'weights', including l1 and l2
// regularization.
virtual float Loss(const Weights &weights) const {
float loss = gradient_evaluator_.Loss(weights);
if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
return loss;
}
// Checks convergence of deterministic methods.
// If converged, the flag 'converged_' is set to true.
virtual void ConvergenceCheck(const Weights &weights,
const Weights &gradient);
// Checks convergence based on the decrease in loss over the last
// 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
// set to true.
void SimpleConvergenceCheck(const std::vector<float> &loss);
// Setters and getters for convergence criteria parameters.
bool converged() const { return converged_; }
void set_converged(bool converged) { converged_ = converged; }
bool reached_solution() const { return reached_solution_; }
void set_reached_solution(bool reached_solution) {
reached_solution_ = reached_solution;
}
void set_use_simple_convergence_check(bool use_simple_convergence_check) {
use_simple_convergence_check_ = use_simple_convergence_check;
}
float convergence_threshold() const { return convergence_threshold_; }
void set_convergence_threshold(float convergence_threshold) {
convergence_threshold_ = convergence_threshold;
}
float simple_convergence_threshold() const {
return simple_convergence_threshold_;
}
void set_simple_convergence_threshold(float simple_convergence_threshold) {
simple_convergence_threshold_ = simple_convergence_threshold;
}
void set_num_convergence_epochs(int num_convergence_epochs) {
num_convergence_epochs_ = num_convergence_epochs;
}
float zero_threshold() const { return zero_threshold_; }
void set_zero_threshold(float zero_threshold) {
zero_threshold_ = zero_threshold;
}
// Returns the best initial learning rate for stochastic methods by evaluating
// elements of 'initial_rates_' on a subset of 'num_search_examples_' training
// examples.
float BestLearningRate();
// Methods that set parameters related to the initial learning rate in
// stochastic methods.
virtual void set_initial_learning_rate(float initial_learning_rate) {
initial_learning_rate_ = initial_learning_rate;
// LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_;
}
float initial_learning_rate() const { return initial_learning_rate_; }
void set_initial_rates(const std::vector<float> &initial_rates) {
initial_rates_ = initial_rates;
}
void set_num_search_examples(int num_search_examples) {
num_search_examples_ = num_search_examples;
}
// Sets the seed of the random number generator.
void SetSeed(int32_t seed) { rnd_.seed(seed); }
// Number of threads for stochastic methods.
void set_num_threads(int num_threads) { num_threads_ = num_threads; }
int num_threads() const { return num_threads_; }
// Minibatch size for stochastic methods.
void set_batch_size(int batch_size);
int NumBatches() const { return batch_examples_.size(); }
// Shuffles the batches with examples.
void ShuffleBatches() {
std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_);
}
// Shuffles and returns the examples in 'batch_examples_[batch]'.
const std::vector<int> &batch_examples(int batch) {
// DCHECK(batch < batch_examples_.size());
std::shuffle(batch_examples_[batch].begin(), batch_examples_[batch].end(),
rnd_);
return batch_examples_[batch];
}
// Returns a reference to 'gradient_evaluator_'.
const GradientEvaluator &gradient_evaluator() const {
return gradient_evaluator_;
}
// Getter/setter of the l1 regularization parameter.
float l1() const { return l1_; }
void set_l1(float l1) { l1_ = l1; }
// Getter/setter of the l2 regularization parameter.
float l2() const { return l2_; }
void set_l2(float l2) { l2_ = l2; }
// Getter/setter for the 'num_scale_iterations_', used for sparse updates in
// stochastic methods.
int num_scale_iterations() const { return num_scale_iterations_; }
void set_num_scale_iterations(int num_scale_iterations) {
num_scale_iterations_ = num_scale_iterations;
}
// Getter/setter for 'num_iterations_per_stage_', used in
// StochasticVarianceReducedGradient.
int num_iterations_per_stage() const { return num_iterations_per_stage_; }
void set_num_iterations_per_stage(int num_iterations_per_stage) {
num_iterations_per_stage_ = num_iterations_per_stage;
}
// Getter/setter for projecting weights onto a ball in
// StochasticGradientDescent.
bool project_weights() const { return project_weights_; }
void set_project_weights(bool project_weights) {
project_weights_ = project_weights;
}
// Returns the number of iterations the last time Run() was executed
int num_epochs_run() const { return num_epochs_run_; }
// Applies L1Prox coefficientwise to 'weights' and 'threshold'.
static void L1Prox(float threshold, Weights *weights) {
for (int i = 0; i < weights->size(); ++i) {
weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
}
}
// Applies L1Prox coefficientwise to 'weights' and 'threshold', where
// 'threshold' is a vector of per-coordinate thresholds.
static void L1Prox(const VectorXf &threshold, Weights *weights) {
for (int i = 0; i < weights->size(); ++i) {
weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
}
}
// Returns sign('x') * max(0.0, abs('x') - 'threshold').
static inline float L1Prox(float x, float threshold) {
return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
}
// Returns sign('x').
static inline float Sign(float x) {
if (x > 0.0f) return 1.0f;
if (x < 0.0f) return -1.0f;
return 0.0f;
}
// Default initial learning rate for stochastic methods.
const float kDefaultInitialLearningRate = 0.05;
private:
// Regularization parameters.
float l1_;
float l2_;
// GradientEvaluator used to compute the (unregularized) loss and gradient.
const GradientEvaluator &gradient_evaluator_;
// Convergence parameters.
// Convergence threshold should be strict but not too strict.
// This will depend on precision used. As float gives 1e-8 relative accuracy,
// 1e-6 or 1e-7 is probably strictest one should use (but this also depends
// on the implementation of convergence checks).
// This can also be updated during initialization of the minimizer so the
// default value should be less strict (e.g. 1e-5).
bool converged_ = false; // convergence flag set by convergence checks
bool reached_solution_ =
false; // flag indicating whether the algorithm
// actually reached the solution as determined by ConvergenceCheck
float convergence_threshold_ =
1e-5; // threshold for assessing convergence by ConvergenceCheck
float simple_convergence_threshold_ =
1e-5; // threshold for assesing convergence by SimpleConvergenceCheck
bool use_simple_convergence_check_ = false; // which convergence check to use
int num_convergence_epochs_ = 5; // used in SimpleConvergenceCheck
// zero_threshold_ is the threshold below which we treat the coordinate value
// as zero (in absolute terms). This is used in ConvergenceCheck.
float zero_threshold_ = 1e-6;
// The number of epochs (iterations) when Run() was executed.
// In other words, each epoch is a step towards minimum during minimization.
// This variable gets updated when Run() is called.
int num_epochs_run_ = 0;
// Initial learning rate, used in stochastic methods.
float initial_learning_rate_ = 0.01;
// Parameters used to search for a good initial learning rate for stochastic
// methods in the method BestLearningRate(). 'initial_rates_' are evaluated on
// a data subset of size 'num_search_examples_'.
std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05};
int num_search_examples_ = 500;
// Multithreading parameters, optionally used in stochastic methods.
int num_threads_;
std::vector<std::vector<int>> batch_examples_;
// Pseudo-random number generator for shuffling 'batch_examples_'.
std::mt19937 rnd_;
// In stochastic methods, weights = scale * Weights. For numerical stability,
// this is reset to scale = 1.0 every 'num_scale_iterations_' iterations.
int num_scale_iterations_ = 100;
// Number of iterations per stage; used in StochasticVarianceReducedGradient.
int num_iterations_per_stage_ = 100000;
// For online/stochastic methods, a simple bound on the optimal solution is
// |weights|^2 <= 2 / L2. Knowing this, if an intermediate solution is
// outside of a ball of radius sqrt(2 / L2), we can project it onto the ball
// to potentially speed up convergence. This approach is used in
// StochasticGradientDescent, and controlled by the flag 'project_weights_'.
bool project_weights_ = true;
};
} // namespace lossmin