// Copyright 2014 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// An implementation of:
// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
// Momentum", ECML PKDD 2013.
// Variable names follow the notation in the paper.

#pragma once

#include "lossmin/eigen-types.h"
#include "lossmin/minimizers/gradient-evaluator.h"
#include "lossmin/minimizers/loss-minimizer.h"

namespace lossmin {

class GradientEvaluator;

class ParallelBoostingWithMomentum : public LossMinimizer {
 public:
  ParallelBoostingWithMomentum(float l1, float l2,
                               const GradientEvaluator &gradient_evaluator)
      : LossMinimizer(l1, l2, gradient_evaluator),
        instances_transpose_(gradient_evaluator.instances().transpose()) {
    Setup();
  }

  // Sets learning rates and other parameters.
  void Setup() override;

  // Checks convergence by verifying the KKT conditions directly.
  // |gradient| is the gradient of the objective at |weights|, not including
  // the contribution of the l1 penalty part of the objective.
  //
  // The function checks whether the mean norm of violations of the KKT
  // conditions is below the convergence threshold. The KKT conditions are
  // necessary and sufficient for |weights| to be a minimizer:
  //   If weights_i < 0 then gradient_i == l1()
  //   If weights_i > 0 then gradient_i == -l1()
  //   If weights_i == 0 then -l1() <= gradient_i <= l1().
  //
  // The squared violations at each coordinate are summed; the square root of
  // the sum, divided by weights.size(), is compared to
  // convergence_threshold().
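  //
  // As an illustration, a per-coordinate violation consistent with the
  // conditions above could be computed as in this sketch (standalone, using
  // <cmath> and <algorithm>; not necessarily the member implementation):
  //
  //   float KktViolation(float weight, float gradient, float l1) {
  //     if (weight > 0.0f) return gradient + l1;  // want gradient == -l1
  //     if (weight < 0.0f) return gradient - l1;  // want gradient == l1
  //     return std::max(0.0f, std::abs(gradient) - l1);
  //   }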
  void ConvergenceCheck(const Weights &weights,
                        const Weights &gradient) override;

  // Returns the total loss at |weights|, including the l1 and l2
  // regularization terms. Uses Eigen's sparse matrix multiply. This performs
  // fewer operations than calling gradient_evaluator().Loss(weights), but it
  // is not intended for parallelization.
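  //
  // For intuition, with a squared-error evaluator over instances X and
  // labels y this amounts to, schematically (a sketch; the normalization and
  // the exact penalty scaling are the GradientEvaluator's conventions):
  //
  //   Eigen::VectorXf residual = X * weights - y;
  //   float loss = residual.squaredNorm() / (2.0f * X.rows()) +
  //                l1() * weights.cwiseAbs().sum() +
  //                0.5f * l2() * weights.squaredNorm();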
  float Loss(const Weights &weights) const override;

  // Computes the inner product gradient at |weights|, written to |*gradient|;
  // |*gradient| must be of the same size as |weights|. Uses Eigen's sparse
  // matrix multiply. This performs far fewer operations than calling
  // gradient_evaluator().gradient(weights), but it is not intended for
  // parallelization.
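  //
  // Schematically, for the same squared-error example as above (the l2 term
  // is added separately by the caller; see EpochUpdate):
  //
  //   Eigen::VectorXf residual = X * weights - y;
  //   *gradient = instances_transpose_ * (residual / X.rows());
  //
  // Caching instances_transpose_ at construction avoids re-transposing the
  // instance matrix on every call.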
  void SparseInnerProductGradient(const Weights &weights,
                                  Weights *gradient) const;

  // Sets phi_center_ (the same as v_0 in the paper).
  // Following the paper exactly, phi_center_ should equal the initial guess
  // for the weights when Run() is executed (in practice, however, this does
  // not appear to be necessary for convergence).
  void set_phi_center(const VectorXf &phi) { phi_center_ = phi; }

  // Computes and sets the per-coordinate learning rates. Exposed as a
  // separate method so that the learning rates can be recomputed if the l2
  // penalty changes.
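  //
  // In the paper, the per-coordinate rate scales with the maximum number of
  // nonzero features in any instance and with the per-coordinate curvature,
  // plus the l2 term; schematically (descriptive names, not accessors):
  //
  //   learning_rates_[j] = max_instance_sparsity * curvature_j + l2()
  //
  // EpochUpdate divides by these rates, so they act as per-coordinate
  // Lipschitz bounds rather than multiplicative step sizes.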
  void compute_and_set_learning_rates();

  // Sets alpha_; alpha_ may need to be reset before Run().
  void set_alpha(const float alpha) { alpha_ = alpha; }

  // Sets beta_; beta_ may need to be reset before Run().
  void set_beta(const float beta) { beta_ = beta; }

 private:
  // Updates |weights| and the quadratic approximation function phi(w), such
  // that at iteration k, loss(weights_k) <= min_w phi_k(w).
  //   y = (1 - alpha) * weights + alpha * phi_center
  //   grad_y = loss_grad(y) + l2 * y
  //   weights[j] = weights[j] - grad_y[j] / learning_rates[j]
  //   weights[j] =
  //       sign(weights[j]) * max(0, |weights[j]| - l1 / learning_rates[j])
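  //
  // A sketch of the update in Eigen terms (illustrative; apart from the
  // members and methods of this class, names and details may differ from the
  // actual implementation):
  //
  //   Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
  //   Weights grad_y = Weights::Zero(y.size());
  //   SparseInnerProductGradient(y, &grad_y);
  //   grad_y += l2() * y;
  //   *weights -= grad_y.cwiseQuotient(learning_rates_);
  //   for (int j = 0; j < weights->size(); ++j) {  // l1 soft-thresholding
  //     float w = (*weights)(j);
  //     float shrink = l1() / learning_rates_(j);
  //     (*weights)(j) =
  //         (w > 0.0f ? 1.0f : -1.0f) * std::max(0.0f, std::abs(w) - shrink);
  //   }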
  void EpochUpdate(Weights *weights, int epoch,
                   bool check_convergence) override;

  // Per-coordinate learning rates.
  VectorXf learning_rates_;

  // Center of the approximating quadratic function phi.
  VectorXf phi_center_;

  // Parameter for updating the approximation function phi. At each iteration,
  // alpha_ is updated to the solution of the quadratic equation
  //   alpha_^2 = beta_ * (1.0 - alpha_)
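  // The positive root of this equation, in closed form, is
  //   alpha_ = 0.5 * (sqrt(beta_^2 + 4 * beta_) - beta_).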
  float alpha_;

  // Parameter used to update alpha, defined as
  //   beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
  float beta_;

  // The transpose of the instance matrix, used for faster gradient
  // computations. It needs to be recomputed only when the instances change,
  // so it is computed once, at construction of the minimizer.
  const InstanceSet instances_transpose_;
};
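
// Example usage (a sketch: the GradientEvaluator constructor arguments and
// the exact Run() signature are defined elsewhere in the library, so the
// lines below are hypothetical):
//
//   GradientEvaluator evaluator(instances, labels);
//   ParallelBoostingWithMomentum minimizer(/*l1=*/0.01f, /*l2=*/0.001f,
//                                          evaluator);
//   Weights weights = Weights::Zero(num_weights);
//   minimizer.set_phi_center(weights);  // match the initial guess, per above
//   minimizer.Run(&weights);  // hypothetical signature; see loss-minimizer.h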

}  // namespace lossmin