// Copyright 2014 Google Inc. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Author: nevena@google.com (Nevena Lazic)
//
// GradientEvaluator is a class for computing the value and gradient of a
// LossFunction on a labeled dataset {(instance_i, label_i)}, given parameters
// 'weights'. Its methods are called by gradient descent algorithms
// implementing the LossMinimizer interface.
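//
// Example usage (an illustrative sketch: 'MyLoss' stands in for a concrete
// LossFunction subclass and is hypothetical; InstanceSet, LabelSet, and
// Weights are the Eigen typedefs from lossmin/eigen-types.h):
//
//   MyLoss loss;
//   GradientEvaluator evaluator(instances, labels, &loss);
//   Weights weights = Weights::Zero(evaluator.NumWeights());
//   double initial_loss = evaluator.Loss(weights);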
#pragma once
#include <algorithm>
#include <mutex>
#include <string>
#include <vector>
#include "lossmin/eigen-types.h"
#include "lossmin/losses/loss-function.h"
class BlockingCounter;
namespace lossmin {
class GradientEvaluator {
public:
// Constructor sets up the dataset and the loss function.
GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
const LossFunction *loss_function)
: instances_(instances),
instances_transposed_(instances.transpose()),
labels_(labels),
loss_function_(loss_function) {}
virtual ~GradientEvaluator() {}
// Returns the loss for the given parameters 'weights' over the full dataset.
virtual double Loss(const Weights &weights) const;
// Returns the loss for given parameters 'weights' and a subset of examples
// 'example_indices'.
virtual double Loss(const Weights &weights,
const std::vector<int> &example_indices) const;
// Returns the loss for given parameters 'weights' and a different
// dataset (typically used for validation).
virtual double Loss(const Weights &weights,
const InstanceSet &validation_instances,
const LabelSet &validation_labels) const;
// Computes the gradient wrt the given parameters 'weights' and accumulates it
// into 'gradient'. 'gradient' is owned by the caller and should be
// initialized to zero before the call.
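//
// Example (an illustrative sketch, not part of this library: one batch
// gradient-descent step, where 'evaluator' is a GradientEvaluator and 'eta'
// is a hypothetical learning rate; Weights is assumed to be a dense Eigen
// vector type per eigen-types.h):
//
//   Weights gradient = Weights::Zero(evaluator.NumWeights());
//   evaluator.Gradient(weights, &gradient);
//   weights -= eta * gradient;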
virtual void Gradient(const Weights &weights, Weights *gradient) const;
// Adds the gradient wrt 'weights_scale * weights' for 'example' to the vector
// 'gradient' in place. The added gradient is scaled by 'example_scale'.
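//
// Example (an illustrative sketch: a single-example SGD step on example 'i'
// with a hypothetical learning rate 'eta'):
//
//   Weights gradient = Weights::Zero(evaluator.NumWeights());
//   evaluator.AddExampleGradient(weights, i, /*weights_scale=*/1.0,
//                                /*example_scale=*/1.0, &gradient);
//   weights -= eta * gradient;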
virtual void AddExampleGradient(const Weights &weights, int example,
double weights_scale, double example_scale,
Weights *gradient) const {
loss_function_->AddExampleGradient(weights, instances_, labels_, example,
weights_scale, example_scale, gradient);
}
// Computes the gradient wrt 'weights' for 'example', writing it to
// 'example_gradient' as a vector<pair<int, double>> rather than an
// Eigen::SparseVector<double>, since Eigen is very inefficient with sparse
// vectors. This is only necessary when running SGDAdaGrad.
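//
// Example (an illustrative sketch of an AdaGrad-style per-coordinate update;
// 'g_sum_sq', a VectorXd of accumulated squared gradients, and the learning
// rate 'eta' are hypothetical):
//
//   std::vector<std::pair<int, double>> example_gradient;
//   evaluator.ExampleGradient(weights, i, 1.0, 1.0, &example_gradient);
//   for (const auto &entry : example_gradient) {
//     g_sum_sq(entry.first) += entry.second * entry.second;
//     weights(entry.first) -=
//         eta * entry.second / std::sqrt(g_sum_sq(entry.first));
//   }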
virtual void ExampleGradient(
const Weights &weights, int example, double weights_scale,
double example_scale,
std::vector<std::pair<int, double>> *example_gradient) const {
loss_function_->ExampleGradient(weights, instances_, labels_, example,
weights_scale, example_scale,
example_gradient);
}
// Returns the number of examples in the dataset.
virtual int NumExamples() const { return instances_.rows(); }
// Returns the number of features.
virtual int NumFeatures() const { return instances_.cols(); }
// Returns the number of weights, which the loss function determines from the
// number of features.
virtual int NumWeights() const {
return loss_function_->NumWeights(NumFeatures());
}
// Returns an upper bound on the curvature of the loss function. Used to set
// the learning rate of some LossMinimizer algorithms.
virtual double LossCurvature() const {
return loss_function_->LossCurvature(instances_);
}
// Returns the per-coordinate curvature of the data. Used to set the learning
// rates of ParallelBoostingWithMomentum.
virtual void PerCoordinateCurvature(
VectorXd *per_coordinate_curvature) const {
loss_function_->PerCoordinateCurvature(instances_,
per_coordinate_curvature);
}
// Returns sparsity, defined as the maximum instance l0 norm. Used to help
// set learning rates in ParallelBoostingWithMomentum.
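//
// Example (an illustrative sketch of combining Sparsity() with
// PerCoordinateCurvature() to derive per-coordinate learning rates, in the
// spirit of ParallelBoostingWithMomentum; the exact formula that minimizer
// uses is not shown here, and PerCoordinateCurvature() is assumed to size
// its output):
//
//   VectorXd curvature;
//   evaluator.PerCoordinateCurvature(&curvature);
//   VectorXd learning_rates =
//       (evaluator.Sparsity() * curvature.array()).inverse().matrix();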
double Sparsity() const {
Instance::Index sparsity = 0;
for (int i = 0; i < instances_.rows(); ++i) {
sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
}
return static_cast<double>(sparsity);
}
// Returns the loss function.
const LossFunction *loss_function() const { return loss_function_; }
// Returns the instances.
const InstanceSet &instances() const { return instances_; }
// Returns the transpose of the instances.
const InstanceSet &instances_transposed() const {
return instances_transposed_;
}
// Returns the labels.
const LabelSet &labels() const { return labels_; }
private:
// Training instances.
const InstanceSet &instances_;
// The transpose of 'instances_'. It is needed for fast gradient computations,
// so it is computed once at construction rather than each time the gradient
// is computed.
const InstanceSet instances_transposed_;
// Instance labels.
const LabelSet &labels_;
// Function for computing the loss and gradient of a single training example.
// Not owned.
const LossFunction *loss_function_;
};
} // namespace lossmin