// Copyright 2014 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
// TODO(bazyli) update license header
#include "lossmin/minimizers/parallel-boosting-with-momentum.h"
#include <algorithm>
#include <cmath>
#include <functional>
#include "lossmin/minimizers/gradient-evaluator.h"
#include "third_party/eigen/Eigen/Core"
namespace lossmin {
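// Initializes the minimizer: computes the per-coordinate learning rates and
// sets the initial momentum parameters alpha_ and beta_ and the approximation
// center phi_center_.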
void ParallelBoostingWithMomentum::Setup() {
compute_and_set_learning_rates();
alpha_ = 0.5;
beta_ = 1.0 - alpha_;
phi_center_ = Weights::Zero(gradient_evaluator().NumWeights());
}
void ParallelBoostingWithMomentum::compute_and_set_learning_rates() {
  // Per-coordinate learning rates are
  // learning_rates[j] = 1 / (sparsity * (Lj + l2)), where sparsity is the
  // maximum instance l0 norm, Lj is an upper bound on the loss curvature
  // along coordinate j, and l2 is the l2 regularization strength.
double sparsity = gradient_evaluator().Sparsity();
gradient_evaluator().PerCoordinateCurvature(&learning_rates_);
learning_rates_ =
(learning_rates_.array() + l2()).inverse().matrix() / sparsity;
}
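// Computes the gradient of the (unregularized) least-squares loss at
// |weights|: gradient = instances^T * (instances * weights - labels) /
// NumExamples().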
void ParallelBoostingWithMomentum::SparseInnerProductGradient(
const Weights &weights, Weights *gradient) const {
  // The Eigen library recommends performing computations step-by-step for the
  // best performance.
// TODO(bazyli) parallel matrix vector multiply with open mp
Weights residual = gradient_evaluator().instances() * weights;
residual -= gradient_evaluator().labels();
*gradient = gradient_evaluator().instances_transposed() * residual;
*gradient /= gradient_evaluator().NumExamples();
}
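// Returns the total objective at |weights|:
// 0.5 * ||instances * weights - labels||^2 / NumExamples()
// + 0.5 * l2 * ||weights||^2 + l1 * ||weights||_1.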
double ParallelBoostingWithMomentum::Loss(const Weights &weights) const {
  // The Eigen library recommends performing computations step-by-step for the
  // best performance.
// TODO(bazyli) parallel matrix vector multiply with open mp
Weights residual = gradient_evaluator().instances() * weights;
residual -= gradient_evaluator().labels();
double loss =
0.5 * residual.squaredNorm() / gradient_evaluator().NumExamples();
if (l2() > 0.0) loss += 0.5 * l2() * weights.squaredNorm();
if (l1() > 0.0) loss += l1() * weights.cwiseAbs().sum();
return loss;
}
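// Checks the optimality (KKT) conditions of the l1-regularized objective,
// given |gradient|, the gradient of the objective without the l1 term: at
// nonzero weights the gradient should cancel the l1 subgradient, and at zero
// weights it should lie in [-l1, l1]. The converged flags are set when the
// accumulated violation, measured as sqrt(error_squared) / weights.size(),
// drops below convergence_threshold().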
void ParallelBoostingWithMomentum::ConvergenceCheck(const Weights &weights,
const Weights &gradient) {
double error_squared = 0.0;
for (int i = 0; i < gradient.size(); i++) {
    if (weights(i) > zero_threshold()) {
      // For weights > 0 the gradient should be == -l1.
      error_squared += (gradient(i) + l1()) * (gradient(i) + l1());
    } else if (weights(i) < -zero_threshold()) {
      // For weights < 0 the gradient should be == l1.
      error_squared += (gradient(i) - l1()) * (gradient(i) - l1());
    } else {
      // For weights == 0 the gradient should be between -l1 and l1.
      double err = std::max(std::abs(gradient(i)) - l1(), 0.0);
      error_squared += err * err;
    }
}
if (std::sqrt(error_squared) / weights.size() < convergence_threshold()) {
set_reached_solution(true);
set_converged(true);
}
}
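// Performs one iteration of parallel boosting with momentum: takes a gradient
// step from the intermediate point y = (1 - alpha_) * weights + alpha_ *
// phi_center_, applies the l1 proximal (shrinkage) step, and then updates
// phi_center_ and the momentum parameters alpha_ and beta_. Optionally checks
// convergence.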
void ParallelBoostingWithMomentum::EpochUpdate(Weights *weights, int epoch,
bool check_convergence) {
// Compute the intermediate weight vector y.
Weights y = (1.0 - alpha_) * *weights + alpha_ * phi_center_;
// Compute the gradient of the loss (except l1 penalty) wrt y.
Weights gradient_wrt_y = Weights::Zero(y.size());
SparseInnerProductGradient(y, &gradient_wrt_y);
if (l2() > 0.0) gradient_wrt_y += l2() * y;
// Gradient step.
*weights -= gradient_wrt_y.cwiseProduct(learning_rates_);
// l1 shrinkage.
if (l1() > 0.0) {
L1Prox(l1() * learning_rates_, weights);
}
// Update the approximation function.
phi_center_ -= (1.0 - alpha_) / alpha_ * (y - *weights);
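  // Update the momentum parameters: alpha_ is set to the positive root of
  // alpha^2 = beta_ * (1 - alpha), and beta_ is rescaled accordingly.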
  alpha_ = -beta_ / 2.0 + std::sqrt(beta_ + beta_ * beta_ / 4.0);
beta_ *= (1.0 - alpha_);
  // Compute the gradient of the objective excluding the l1 part and check
  // for convergence.
if (check_convergence) {
Weights gradient_wrt_weights = Weights::Zero(weights->size());
SparseInnerProductGradient(*weights, &gradient_wrt_weights);
    if (l2() > 0.0) {
gradient_wrt_weights += l2() * *weights;
}
ConvergenceCheck(*weights, gradient_wrt_weights);
}
}
} // namespace lossmin