| // Copyright 2014 Google Inc. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| // Author: nevena@google.com (Nevena Lazic) |
| // |
| // Interface for gradient descent algorithms. Example usage: |
| // |
| // 1. Create a GradientEvaluator object that computes the value and gradient of |
| // a loss function on a given dataset. |
| // |
| // InstanceSet instances = ... |
| // LabelSet labels = ... |
| // LossFunction *loss_function = LossFunction::Create("logistic-regression"); |
| // GradientEvaluator gradient_evaluator(instances, labels, loss_function); |
| // |
| // 2. Create a LossMinimizer object from 'gradient_evaluator' and l1 and l2 |
| // regularization parameters. StochasticGradientDescent requires an |
| // additional parameter for learning rate scheduling. |
| // |
| // float l1 = ... |
| // float l2 = ... |
| // DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator); |
| // |
| // 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with |
| // loss values across epochs, and 'weights' contains the best parameters. |
| // |
| // Weights weights = Weights::Zero(num_features); // or other initialization |
| // vector<float> loss; |
| // int max_epochs = 100; |
| // bool converged = loss_minimizer.Run(max_epochs, &weights, &loss); |
| // |
| // Note: algorithm-specific parameters such as learning rates are set based on |
| // the data and the loss function. If the dataset or the loss pointed to by |
| // 'gradient_evaluator' change, the user must call loss_minimizer.Setup() method |
| // before loss_minimizer.Run(). |
| |
| #pragma once |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <cstdint> |
| #include <functional> |
| #include <random> |
| #include <string> |
| #include <vector> |
| |
| #include "lossmin/eigen-types.h" |
| #include "lossmin/losses/loss-function.h" |
| #include "lossmin/minimizers/gradient-evaluator.h" |
| #include "third_party/eigen/Eigen/Core" |
| |
| namespace lossmin { |
| |
| // Interface for gradient descent algorithms that minimize a loss function over |
| // a labeled dataset. A GradientEvaluator object provides the loss value and |
| // gradient for given parameters at each epoch. Derived classes should be |
| // thread-compatible, and must provide the methods EpochUpdate() and Setup(). |
| class LossMinimizer { |
| public: |
| // Constructor sets the l1 and l2 regularization parameters and |
| // 'gradient_evalutor_'. |
| LossMinimizer(float l1, float l2, const GradientEvaluator &gradient_evaluator) |
| : l1_(l1), |
| l2_(l2), |
| gradient_evaluator_(gradient_evaluator), |
| converged_(false) { |
| std::random_device rd; |
| rnd_.seed(rd()); |
| } |
| virtual ~LossMinimizer() {} |
| |
| // Initializes algorithm-specific parameters such as learning rates, based |
| // on the loss function and the dataset. Called in the constructor of the |
| // derived classes. If the dataset that 'gradient_evaluator' points to |
| // changes, Setup() must be called again by the user. |
| virtual void Setup() = 0; |
| |
| // Runs minimization until convergence, or up to 'max_epochs' epochs. Returns |
| // true if the algorithm has converged. Parameters 'weights' should be |
| // initialized by the user, and will contain the parameters produced at the |
| // last epoch. 'loss' is filled with training loss values produced every |
| // 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'. |
| bool Run(int max_epochs, int loss_epochs, int convergence_epochs, |
| Weights *weights, std::vector<float> *loss); |
| |
| // Runs minimization, evaluating loss on both training and validation data. |
| bool Run(int max_epochs, int loss_epochs, int convergence_epochs, |
| const InstanceSet &validation_instances, |
| const LabelSet &validation_labels, Weights *weights, |
| std::vector<float> *training_loss, |
| std::vector<float> *validation_loss); |
| |
| // Convenience Run method that evaluates the loss and checks for convergence |
| // at every iteration. |
| bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) { |
| return Run(max_epochs, 1, 1, weights, loss); |
| } |
| |
| // Abstract method that updates the weights in the current iteration. |
| virtual void EpochUpdate(Weights *weights, int epoch, |
| bool check_convergence) = 0; |
| |
| // Returns the total loss for given parameters 'weights', including l1 and l2 |
| // regularization. |
| virtual float Loss(const Weights &weights) const { |
| float loss = gradient_evaluator_.Loss(weights); |
| if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm(); |
| if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum(); |
| return loss; |
| } |
| |
| // Checks convergence of deterministic methods. |
| // If converged, the flag 'converged_' is set to true. |
| virtual void ConvergenceCheck(const Weights &weights, |
| const Weights &gradient); |
| |
| // Checks convergence based on the decrease in loss over the last |
| // 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is |
| // set to true. |
| void SimpleConvergenceCheck(const std::vector<float> &loss); |
| |
| // Setters and getters for convergence criteria parameters. |
| bool converged() const { return converged_; } |
| void set_converged(bool converged) { converged_ = converged; } |
| bool reached_solution() const { return reached_solution_; } |
| void set_reached_solution(bool reached_solution) { |
| reached_solution_ = reached_solution; |
| } |
| void set_use_simple_convergence_check(bool use_simple_convergence_check) { |
| use_simple_convergence_check_ = use_simple_convergence_check; |
| } |
| float convergence_threshold() const { return convergence_threshold_; } |
| void set_convergence_threshold(float convergence_threshold) { |
| convergence_threshold_ = convergence_threshold; |
| } |
| float simple_convergence_threshold() const { |
| return simple_convergence_threshold_; |
| } |
| void set_simple_convergence_threshold(float simple_convergence_threshold) { |
| simple_convergence_threshold_ = simple_convergence_threshold; |
| } |
| void set_num_convergence_epochs(int num_convergence_epochs) { |
| num_convergence_epochs_ = num_convergence_epochs; |
| } |
| float zero_threshold() const { return zero_threshold_; } |
| void set_zero_threshold(float zero_threshold) { |
| zero_threshold_ = zero_threshold; |
| } |
| |
| // Returns the best initial learning rate for stochastic methods by evaluating |
| // elements of 'initial_rates_' on a subset of 'num_search_examples_' training |
| // examples. |
| float BestLearningRate(); |
| |
| // Methods that set parameters related to the initial learning rate in |
| // stochastic methods. |
| virtual void set_initial_learning_rate(float initial_learning_rate) { |
| initial_learning_rate_ = initial_learning_rate; |
| // LOG(INFO) << "initial_learning_rate set to " << initial_learning_rate_; |
| } |
| float initial_learning_rate() const { return initial_learning_rate_; } |
| void set_initial_rates(const std::vector<float> &initial_rates) { |
| initial_rates_ = initial_rates; |
| } |
| void set_num_search_examples(int num_search_examples) { |
| num_search_examples_ = num_search_examples; |
| } |
| |
| // Sets the seed of the random number generator. |
| void SetSeed(int32_t seed) { rnd_.seed(seed); } |
| |
| // Number of threads for stochastic methods. |
| void set_num_threads(int num_threads) { num_threads_ = num_threads; } |
| int num_threads() const { return num_threads_; } |
| |
| // Minibatch size for stochastic methods. |
| void set_batch_size(int batch_size); |
| int NumBatches() const { return batch_examples_.size(); } |
| |
| // Shuffles the batches with examples. |
| void ShuffleBatches() { |
| std::shuffle(batch_examples_.begin(), batch_examples_.end(), rnd_); |
| } |
| |
| // Shuffles and returns the examples in 'batch_examples_[batch]'. |
| const std::vector<int> &batch_examples(int batch) { |
| // DCHECK(batch < batch_examples_.size()); |
| std::shuffle(batch_examples_[batch].begin(), batch_examples_[batch].end(), |
| rnd_); |
| return batch_examples_[batch]; |
| } |
| |
| // Returns a reference to 'gradient_evaluator_'. |
| const GradientEvaluator &gradient_evaluator() const { |
| return gradient_evaluator_; |
| } |
| |
| // Getter/setter of the l1 regularization parameter. |
| float l1() const { return l1_; } |
| void set_l1(float l1) { l1_ = l1; } |
| |
| // Getter/setter of the l2 regularization parameter. |
| float l2() const { return l2_; } |
| void set_l2(float l2) { l2_ = l2; } |
| |
| // Getter/setter for the 'num_scale_iterations_', used for sparse updates in |
| // stochastic methods. |
| int num_scale_iterations() const { return num_scale_iterations_; } |
| void set_num_scale_iterations(int num_scale_iterations) { |
| num_scale_iterations_ = num_scale_iterations; |
| } |
| |
| // Getter/setter for 'num_iterations_per_stage_', used in |
| // StochasticVarianceReducedGradient. |
| int num_iterations_per_stage() const { return num_iterations_per_stage_; } |
| void set_num_iterations_per_stage(int num_iterations_per_stage) { |
| num_iterations_per_stage_ = num_iterations_per_stage; |
| } |
| |
| // Getter/setter for projecting weights onto a ball in |
| // StochasticGradientDescent. |
| bool project_weights() const { return project_weights_; } |
| void set_project_weights(bool project_weights) { |
| project_weights_ = project_weights; |
| } |
| |
| // Returns the number of iterations the last time Run() was executed |
| int num_epochs_run() const { return num_epochs_run_; } |
| |
| // Applies L1Prox coefficientwise to 'weights' and 'threshold'. |
| static void L1Prox(float threshold, Weights *weights) { |
| for (int i = 0; i < weights->size(); ++i) { |
| weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold); |
| } |
| } |
| |
| // Applies L1Prox coefficientwise to 'weights' and 'threshold', where |
| // 'threshold' is a vector of per-coordinate thresholds. |
| static void L1Prox(const VectorXf &threshold, Weights *weights) { |
| for (int i = 0; i < weights->size(); ++i) { |
| weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i)); |
| } |
| } |
| |
| // Returns sign('x') * max(0.0, abs('x') - 'threshold'). |
| static inline float L1Prox(float x, float threshold) { |
| return Sign(x) * std::max(std::abs(x) - threshold, 0.0f); |
| } |
| |
| // Returns sign('x'). |
| static inline float Sign(float x) { |
| if (x > 0.0f) return 1.0f; |
| if (x < 0.0f) return -1.0f; |
| return 0.0f; |
| } |
| |
| // Default initial learning rate for stochastic methods. |
| const float kDefaultInitialLearningRate = 0.05; |
| |
| private: |
| // Regularization parameters. |
| float l1_; |
| float l2_; |
| |
| // GradientEvaluator used to compute the (unregularized) loss and gradient. |
| const GradientEvaluator &gradient_evaluator_; |
| |
| // Convergence parameters. |
| // Convergence threshold should be strict but not too strict. |
| // This will depend on precision used. As float gives 1e-8 relative accuracy, |
| // 1e-6 or 1e-7 is probably strictest one should use (but this also depends |
| // on the implementation of convergence checks). |
| // This can also be updated during initialization of the minimizer so the |
| // default value should be less strict (e.g. 1e-5). |
| bool converged_ = false; // convergence flag set by convergence checks |
| bool reached_solution_ = |
| false; // flag indicating whether the algorithm |
| // actually reached the solution as determined by ConvergenceCheck |
| float convergence_threshold_ = |
| 1e-5; // threshold for assessing convergence by ConvergenceCheck |
| float simple_convergence_threshold_ = |
| 1e-5; // threshold for assesing convergence by SimpleConvergenceCheck |
| bool use_simple_convergence_check_ = false; // which convergence check to use |
| int num_convergence_epochs_ = 5; // used in SimpleConvergenceCheck |
| |
| // zero_threshold_ is the threshold below which we treat the coordinate value |
| // as zero (in absolute terms). This is used in ConvergenceCheck. |
| float zero_threshold_ = 1e-6; |
| |
| // The number of epochs (iterations) when Run() was executed. |
| // In other words, each epoch is a step towards minimum during minimization. |
| // This variable gets updated when Run() is called. |
| int num_epochs_run_ = 0; |
| |
| // Initial learning rate, used in stochastic methods. |
| float initial_learning_rate_ = 0.01; |
| |
| // Parameters used to search for a good initial learning rate for stochastic |
| // methods in the method BestLearningRate(). 'initial_rates_' are evaluated on |
| // a data subset of size 'num_search_examples_'. |
| std::vector<float> initial_rates_ = {0.001, 0.005, 0.01, 0.05}; |
| int num_search_examples_ = 500; |
| |
| // Multithreading parameters, optionally used in stochastic methods. |
| int num_threads_; |
| std::vector<std::vector<int>> batch_examples_; |
| |
| // Pseudo-random number generator for shuffling 'batch_examples_'. |
| std::mt19937 rnd_; |
| |
| // In stochastic methods, weights = scale * Weights. For numerical stability, |
| // this is reset to scale = 1.0 every 'num_scale_iterations_' iterations. |
| int num_scale_iterations_ = 100; |
| |
| // Number of iterations per stage; used in StochasticVarianceReducedGradient. |
| int num_iterations_per_stage_ = 100000; |
| |
| // For online/stochastic methods, a simple bound on the optimal solution is |
| // |weights|^2 <= 2 / L2. Knowing this, if an intermediate solution is |
| // outside of a ball of radius sqrt(2 / L2), we can project it onto the ball |
| // to potentially speed up convergence. This approach is used in |
| // StochasticGradientDescent, and controlled by the flag 'project_weights_'. |
| bool project_weights_ = true; |
| }; |
| |
| } // namespace lossmin |