// Copyright 2014 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// An implementation of:
// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
// Momentum", ECML PKDD 2013.
// Variable names follow the notation in the paper.

#pragma once

#include "lossmin/eigen-types.h"
#include "lossmin/minimizers/gradient-evaluator.h"
#include "lossmin/minimizers/loss-minimizer.h"

| namespace lossmin { |
| |
| class GradientEvaluator; |
| |
| class ParallelBoostingWithMomentum : public LossMinimizer { |
| public: |
| ParallelBoostingWithMomentum(float l1, float l2, |
| const GradientEvaluator &gradient_evaluator) |
| : LossMinimizer(l1, l2, gradient_evaluator) { |
| Setup(); |
| } |
| |
| // Sets learning rates and other parameters. |
| void Setup() override; |
| |
| // Checks convergence by verifying the KKT conditions directly. |
| // |gradient| is the gradient of the objective at |weights|, not including |
| // the contribution of the l1 penalty part of the objective. |
| // |
| // The function checks whether the mean norm of violations of the KKT |
| // condition is below convergence threshold. The KKT condition is necessary |
| // and sufficient for |weights| to be a minimizer: |
| // If weights_i < 0 then gradient_i == l1() |
| // If weights_i > 0 then gradient_i == -l1() |
| // If weights_i == 0 then -l1() <= gradient_i <= l1(). |
| // |
| // The squared violations at each coordinate are summed and the square |
| // root divided by weights.size() is compared to convergence_thresold() |
| void ConvergenceCheck(const Weights &weights, |
| const Weights &gradient) override; |
| |
| private: |
| // Updates 'weights' and the quadratic approximation function phi(w), such |
| // that at iteration k, loss(weights_k) <= min_w phi_k(w). |
| // y = (1 - alpha) * weights + alpha * phi_center |
| // grad_y = loss_grad(y) + l2 * y |
| // weights[j] = weights[j] - grad_y[j] / learning_rates[j] |
| // weights[j] = |
| // sign(weights[j]) * max(0, weights[j], l1 / learning_rates[j]) |
| void EpochUpdate(Weights *weights, int epoch, |
| bool check_convergence) override; |
| |
| // Per-coordinate learning rates. |
| VectorXf learning_rates_; |
| |
| // Center of the approximating quadratic function phi. |
| VectorXf phi_center_; |
| |
| // Parameter for updating the approximation function phi. At each iteration, |
| // 'alpha_' is updated to the solution of the quadratic equation |
| // alpha_^2 = beta_ * (1.0 - alpha_) |
| float alpha_; |
| |
| // Parameter used to update alpha, defined as |
| // beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i). |
| float beta_; |
| }; |
| |
| } // namespace lossmin |