// Copyright 2014 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// An implementation of:
// I. Mukherjee, K. Canini, R. Frongillo, and Y. Singer, "Parallel Boosting with
// Momentum", ECML PKDD 2013.
// Variable names follow the notation in the paper.

#pragma once

#include "lossmin/eigen-types.h"
#include "lossmin/minimizers/gradient-evaluator.h"
#include "lossmin/minimizers/loss-minimizer.h"

| namespace lossmin { |
| |
| class GradientEvaluator; |
| |
| class ParallelBoostingWithMomentum : public LossMinimizer { |
| public: |
| ParallelBoostingWithMomentum(float l1, float l2, |
| const GradientEvaluator &gradient_evaluator) |
| : LossMinimizer(l1, l2, gradient_evaluator) { |
| Setup(); |
| } |
| |
| // Sets learning rates and other parameters. |
| void Setup() override; |
| |
| // Checks convergence by verifying the KKT conditions directly. |
| // |gradient| is the gradient of the objective at |weights|, not including |
| // the contribution of the l1 penalty part of the objective. |
| // |
| // The function checks whether the mean norm of violations of the KKT |
| // condition is below convergence threshold. The KKT condition is necessary |
| // and sufficient for |weights| to be a minimizer: |
| // If weights_i < 0 then gradient_i == l1() |
| // If weights_i > 0 then gradient_i == -l1() |
| // If weights_i == 0 then -l1() <= gradient_i <= l1(). |
| // |
| // The squared violations at each coordinate are summed and the square |
| // root divided by weights.size() is compared to convergence_thresold() |
| void ConvergenceCheck(const Weights &weights, |
| const Weights &gradient) override; |
| |
| private: |
| // Updates 'weights' and the quadratic approximation function phi(w), such |
| // that at iteration k, loss(weights_k) <= min_w phi_k(w). |
| // y = (1 - alpha) * weights + alpha * phi_center |
| // grad_y = loss_grad(y) + l2 * y |
| // weights[j] = weights[j] - grad_y[j] / learning_rates[j] |
| // weights[j] = |
| // sign(weights[j]) * max(0, weights[j], l1 / learning_rates[j]) |
| void EpochUpdate(Weights *weights, int epoch, |
| bool check_convergence) override; |
| |
| // Per-coordinate learning rates. |
| VectorXf learning_rates_; |
| |
| // Center of the approximating quadratic function phi. |
| VectorXf phi_center_; |
| |
| // Parameter for updating the approximation function phi. At each iteration, |
| // 'alpha_' is updated to the solution of the quadratic equation |
| // alpha_^2 = beta_ * (1.0 - alpha_) |
| float alpha_; |
| |
| // Parameter used to update alpha, defined as |
| // beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i). |
| float beta_; |
| }; |
| |
| } // namespace lossmin |