// Copyright 2014 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// An implementation of:
// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
// Momentum", ECML PKDD 2013.
// Variable names follow the notation in the paper.

#pragma once

#include "lossmin/eigen-types.h"
#include "lossmin/minimizers/gradient-evaluator.h"
#include "lossmin/minimizers/loss-minimizer.h"

namespace lossmin {

class GradientEvaluator;

class ParallelBoostingWithMomentum : public LossMinimizer {
 public:
  ParallelBoostingWithMomentum(float l1, float l2,
                               const GradientEvaluator &gradient_evaluator)
      : LossMinimizer(l1, l2, gradient_evaluator),
        instances_transpose_(gradient_evaluator.instances().transpose()) {
    Setup();
  }

  // Sets learning rates and other parameters.
  void Setup() override;

  // Checks convergence by verifying the KKT conditions directly.
  // |gradient| is the gradient of the objective at |weights|, not including
  // the contribution of the l1 penalty part of the objective.
  //
  // The function checks whether the mean norm of violations of the KKT
  // conditions is below the convergence threshold. The KKT conditions are
  // necessary and sufficient for |weights| to be a minimizer:
  //   If weights_i < 0 then gradient_i == l1()
  //   If weights_i > 0 then gradient_i == -l1()
  //   If weights_i == 0 then -l1() <= gradient_i <= l1().
  //
  // The squared violations at each coordinate are summed; the square root of
  // the sum, divided by weights.size(), is compared to
  // convergence_threshold().
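  //
  // As an illustration, a per-coordinate violation consistent with the
  // conditions above could be computed as in this sketch (standalone, using
  // <cmath> and <algorithm>; not necessarily the member implementation):
  //
  //   float KktViolation(float weight, float gradient, float l1) {
  //     if (weight > 0.0f) return gradient + l1;  // want gradient == -l1
  //     if (weight < 0.0f) return gradient - l1;  // want gradient == l1
  //     return std::max(0.0f, std::abs(gradient) - l1);
  //   }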
  void ConvergenceCheck(const Weights &weights,
                        const Weights &gradient) override;

  // Returns the total loss at |weights|, including the l1 and l2
  // regularization terms. Uses Eigen's sparse matrix multiply. This performs
  // fewer operations than calling gradient_evaluator().Loss(weights), but it
  // is not intended for parallelization.
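  //
  // For intuition, with a squared-error evaluator over instances X and
  // labels y this amounts to, schematically (a sketch; the normalization and
  // the exact penalty scaling are the GradientEvaluator's conventions):
  //
  //   Eigen::VectorXf residual = X * weights - y;
  //   float loss = residual.squaredNorm() / (2.0f * X.rows()) +
  //                l1() * weights.cwiseAbs().sum() +
  //                0.5f * l2() * weights.squaredNorm();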
  float Loss(const Weights &weights) const override;

  // Computes the inner product gradient at |weights|, written to |*gradient|;
  // |*gradient| must be of the same size as |weights|. Uses Eigen's sparse
  // matrix multiply. This performs far fewer operations than calling
  // gradient_evaluator().gradient(weights), but it is not intended for
  // parallelization.
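  //
  // Schematically, for the same squared-error example as above (the l2 term
  // is added separately by the caller; see EpochUpdate):
  //
  //   Eigen::VectorXf residual = X * weights - y;
  //   *gradient = instances_transpose_ * (residual / X.rows());
  //
  // Caching instances_transpose_ at construction avoids re-transposing the
  // instance matrix on every call.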
  void SparseInnerProductGradient(const Weights &weights,
                                  Weights *gradient) const;

  // Sets phi_center_ (the same as v_0 in the paper).
  // Following the paper exactly, phi_center_ should equal the initial guess
  // for the weights when Run() is executed (in practice, however, this does
  // not appear to be necessary for convergence).
  void set_phi_center(const VectorXf &phi) { phi_center_ = phi; }

  // Computes and sets the per-coordinate learning rates. Exposed as a
  // separate method so that the learning rates can be recomputed if the l2
  // penalty changes.
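  //
  // In the paper, the per-coordinate rate scales with the maximum number of
  // nonzero features in any instance and with the per-coordinate curvature,
  // plus the l2 term; schematically (descriptive names, not accessors):
  //
  //   learning_rates_[j] = max_instance_sparsity * curvature_j + l2()
  //
  // EpochUpdate divides by these rates, so they act as per-coordinate
  // Lipschitz bounds rather than multiplicative step sizes.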
  void compute_and_set_learning_rates();

  // Sets alpha_; alpha_ may need to be reset before Run().
  void set_alpha(const float alpha) { alpha_ = alpha; }

  // Sets beta_; beta_ may need to be reset before Run().
  void set_beta(const float beta) { beta_ = beta; }

 private:
  // Updates |weights| and the quadratic approximation function phi(w), such
  // that at iteration k, loss(weights_k) <= min_w phi_k(w).
  //   y = (1 - alpha) * weights + alpha * phi_center
  //   grad_y = loss_grad(y) + l2 * y
  //   weights[j] = weights[j] - grad_y[j] / learning_rates[j]
  //   weights[j] =
  //       sign(weights[j]) * max(0, |weights[j]| - l1 / learning_rates[j])
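  //
  // A sketch of the update in Eigen terms (illustrative; apart from the
  // members and methods of this class, names and details may differ from the
  // actual implementation):
  //
  //   Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
  //   Weights grad_y = Weights::Zero(y.size());
  //   SparseInnerProductGradient(y, &grad_y);
  //   grad_y += l2() * y;
  //   *weights -= grad_y.cwiseQuotient(learning_rates_);
  //   for (int j = 0; j < weights->size(); ++j) {  // l1 soft-thresholding
  //     float w = (*weights)(j);
  //     float shrink = l1() / learning_rates_(j);
  //     (*weights)(j) =
  //         (w > 0.0f ? 1.0f : -1.0f) * std::max(0.0f, std::abs(w) - shrink);
  //   }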
  void EpochUpdate(Weights *weights, int epoch,
                   bool check_convergence) override;

  // Per-coordinate learning rates.
  VectorXf learning_rates_;

  // Center of the approximating quadratic function phi.
  VectorXf phi_center_;

  // Parameter for updating the approximation function phi. At each iteration,
  // alpha_ is updated to the solution of the quadratic equation
  //   alpha_^2 = beta_ * (1.0 - alpha_)
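  // The positive root of this equation, in closed form, is
  //   alpha_ = 0.5 * (sqrt(beta_^2 + 4 * beta_) - beta_).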
  float alpha_;

  // Parameter used to update alpha, defined as
  //   beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
  float beta_;

  // The transpose of the instance matrix, used for faster gradient
  // computations. It needs to be recomputed only when the instances change,
  // so it is computed once, at construction of the minimizer.
  const InstanceSet instances_transpose_;
};
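
// Example usage (a sketch: the GradientEvaluator constructor arguments and
// the exact Run() signature are defined elsewhere in the library, so the
// lines below are hypothetical):
//
//   GradientEvaluator evaluator(instances, labels);
//   ParallelBoostingWithMomentum minimizer(/*l1=*/0.01f, /*l2=*/0.001f,
//                                          evaluator);
//   Weights weights = Weights::Zero(num_weights);
//   minimizer.set_phi_center(weights);  // match the initial guess, per above
//   minimizer.Run(&weights);  // hypothetical signature; see loss-minimizer.h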

}  // namespace lossmin