// Copyright 2014 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// An implementation of:
// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
// Momentum", ECML PKDD 2013.
// Variable names follow the notation in the paper.

#pragma once

#include "lossmin/eigen-types.h"
#include "lossmin/minimizers/gradient-evaluator.h"
#include "lossmin/minimizers/loss-minimizer.h"

namespace lossmin {

class GradientEvaluator;

class ParallelBoostingWithMomentum : public LossMinimizer {
 public:
  ParallelBoostingWithMomentum(float l1, float l2,
                               const GradientEvaluator &gradient_evaluator)
      : LossMinimizer(l1, l2, gradient_evaluator),
        instances_transpose_(gradient_evaluator.instances().transpose()) {
    Setup();
  }
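
  // A minimal usage sketch (illustration only; the GradientEvaluator
  // construction and the Run() arguments shown here are assumptions about the
  // surrounding library, not its actual API):
  //
  //   GradientEvaluator evaluator(instances, labels, loss_function);
  //   ParallelBoostingWithMomentum minimizer(/*l1=*/0.1f, /*l2=*/0.01f,
  //                                          evaluator);
  //   Weights weights = Weights::Zero(num_features);
  //   minimizer.set_phi_center(weights);
  //   minimizer.Run(..., &weights);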

  // Sets learning rates and other parameters.
  void Setup() override;

  // Checks convergence by verifying the KKT conditions directly.
  // |gradient| is the gradient of the objective at |weights|, not including
  // the contribution of the l1 penalty part of the objective.
  //
  // The function checks whether the mean norm of the violations of the KKT
  // conditions is below the convergence threshold. The KKT conditions are
  // necessary and sufficient for |weights| to be a minimizer:
  //   If weights_i < 0 then gradient_i == l1()
  //   If weights_i > 0 then gradient_i == -l1()
  //   If weights_i == 0 then -l1() <= gradient_i <= l1().
  //
  // The squared violations at each coordinate are summed, and the square
  // root divided by weights.size() is compared to convergence_threshold().
  void ConvergenceCheck(const Weights &weights,
                        const Weights &gradient) override;
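
  // For illustration, the per-coordinate violations described above could be
  // accumulated roughly as follows (a sketch, not the actual implementation):
  //
  //   float violation_sq = 0.0f;
  //   for (int i = 0; i < weights.size(); ++i) {
  //     float v;
  //     if (weights(i) > 0.0f) {
  //       v = gradient(i) + l1();
  //     } else if (weights(i) < 0.0f) {
  //       v = gradient(i) - l1();
  //     } else {
  //       v = std::max(0.0f, std::abs(gradient(i)) - l1());
  //     }
  //     violation_sq += v * v;
  //   }
  //   bool converged =
  //       std::sqrt(violation_sq) / weights.size() < convergence_threshold();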

  // Returns the total loss for the given parameters |weights|, including the
  // l1 and l2 regularization terms. Uses sparse matrix multiplication from
  // Eigen. This performs fewer operations than calling
  // gradient_evaluator().Loss(weights) but is not intended for
  // parallelization.
  float Loss(const Weights &weights) const override;
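
  // For reference, the objective has the form
  //
  //   Loss(w) = data_loss(w) + l1() * ||w||_1 + (l2() / 2) * ||w||_2^2
  //
  // where the normalization of data_loss(w) and whether the l2 term carries
  // the 1/2 factor follow the conventions of the GradientEvaluator; both are
  // assumptions here, not guarantees of this header.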

  // Computes the inner product gradient at |weights|, written to |*gradient|;
  // |*gradient| must be of the same size as |weights|. Uses sparse matrix
  // multiplication from Eigen. This performs far fewer operations than calling
  // gradient_evaluator().gradient(weights) but is not intended for
  // parallelization.
  void SparseInnerProductGradient(const Weights &weights,
                                  Weights *gradient) const;
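
  // Conceptually the computation is two sparse products (a sketch only;
  // loss_derivative() below is a hypothetical stand-in for the per-example
  // derivative supplied by the loss function, and any normalization is
  // omitted):
  //
  //   VectorXf predictions = gradient_evaluator().instances() * weights;
  //   VectorXf residual = loss_derivative(predictions);
  //   *gradient = instances_transpose_ * residual;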

  // Sets phi_center_ (this is the same as v_0 in the paper).
  // To follow the paper exactly, phi_center_ should equal the initial guess
  // for the weights when Run() is executed (in practice, however, this does
  // not seem to be necessary for convergence).
  void set_phi_center(const VectorXf &phi) { phi_center_ = phi; }

  // Computes the learning rates. This is exposed so that the learning rates
  // can be recomputed if the l2 penalty changes.
  void compute_and_set_learning_rates();
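
  // As a rough sketch of why this depends on the l2 penalty: each
  // per-coordinate rate is the inverse of a per-coordinate curvature bound on
  // the smooth part of the objective, which includes the l2 term. The bound
  // kappa_j below is a hypothetical stand-in for the actual quantity (defined
  // in the paper and in the implementation):
  //
  //   learning_rates_[j] = 1.0f / (kappa_j + l2());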

  // Sets alpha_; we may need to reset alpha before Run().
  void set_alpha(const float alpha) { alpha_ = alpha; }

  // Sets beta_; we may need to reset beta before Run().
  void set_beta(const float beta) { beta_ = beta; }

 private:
  // Updates 'weights' and the quadratic approximation function phi(w), such
  // that at iteration k, loss(weights_k) <= min_w phi_k(w).
  //     y = (1 - alpha) * weights + alpha * phi_center
  //     grad_y = loss_grad(y) + l2 * y
  //     weights[j] = weights[j] - grad_y[j] / learning_rates[j]
  //     weights[j] =
  //         sign(weights[j]) * max(0, abs(weights[j]) - l1 / learning_rates[j])
  void EpochUpdate(Weights *weights, int epoch,
                   bool check_convergence) override;
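
  // A sketch of the update above in Eigen-style code (illustrative only;
  // the updates to alpha_, beta_, and phi_center_ are omitted):
  //
  //   Weights y = (1.0f - alpha_) * (*weights) + alpha_ * phi_center_;
  //   Weights grad_y(weights->size());
  //   SparseInnerProductGradient(y, &grad_y);
  //   grad_y += l2() * y;
  //   for (int j = 0; j < weights->size(); ++j) {
  //     float w = (*weights)(j) - grad_y(j) / learning_rates_(j);
  //     float threshold = l1() / learning_rates_(j);
  //     (*weights)(j) =
  //         (w > 0.0f ? 1.0f : -1.0f) * std::max(0.0f, std::abs(w) - threshold);
  //   }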

  // Per-coordinate learning rates.
  VectorXf learning_rates_;

  // Center of the approximating quadratic function phi.
  VectorXf phi_center_;

  // Parameter for updating the approximation function phi. At each iteration,
  // 'alpha_' is updated to the solution of the quadratic equation
  //     alpha_^2 = beta_ * (1.0 - alpha_)
  float alpha_;

  // Parameter used to update alpha, defined as
  //     beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
  float beta_;
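
  // For illustration: since alpha_ solves alpha^2 = beta * (1 - alpha), i.e.
  // alpha^2 + beta * alpha - beta = 0, its positive root is
  //
  //   alpha_ = (-beta_ + std::sqrt(beta_ * beta_ + 4.0f * beta_)) / 2.0f;
  //
  // which is how it can be recomputed each epoch (a derivation from the
  // equation above, not a statement about the actual implementation).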

  // The transpose of the instance matrix, needed for faster gradient
  // computations. It must be recomputed only when the instances change, so it
  // is computed once, at construction of the minimizer.
  const InstanceSet instances_transpose_;
};

}  // namespace lossmin