minimizers/gradient-evaluator.h - lossmin - Git at Google

 // Copyright 2014 Google Inc. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 // Author: nevena@google.com (Nevena Lazic)
 //
 // GradientEvaluator is a class for computing the value and gradient of a loss
 // LossFunction on a labeled dataset {(instance_i, label_i)}, given parameters
 // 'weights'. Its methods are called by gradient descent algorithms implementing
 // the LossMinimizer interface.

 #pragma once

 #include <algorithm>
 #include <mutex>
 #include <string>
 #include <vector>

 #include "lossmin/eigen-types.h"
 #include "lossmin/losses/loss-function.h"

 class BlockingCounter;

 namespace lossmin {

 class GradientEvaluator {
  public:
   // Constructor sets up the dataset and the loss function.
   GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
                     const LossFunction *loss_function)
       : instances_(instances), labels_(labels),
         loss_function_(loss_function) {}

   virtual ~GradientEvaluator() {}

   // Returns the loss for given parameters 'weights'. Multi-threading is used
   // if num_threads_ > 1.
   virtual float Loss(const Weights &weights) const;

   // Returns the loss for given parameters 'weights' and a subset of examples
   // 'example_indices'.
   virtual float Loss(const Weights &weights,
                      const std::vector<int> &example_indices) const;

   // Returns the loss for given parameters 'weights' and a different
   // dataset (typically used for validation).
   virtual float Loss(
       const Weights &weights, const InstanceSet &validation_instances,
       const LabelSet &validation_labels) const;

   // Computes the gradient wrt the given parameters 'weights'. 'gradient' is
   // owned by the caller and should be initialized to zero.
   // Multithreading is used if 'num_threads' > 1. The training examples are
   // divided into 'num_batches' batches; each thread computes the gradient of a
   // batch, adds it to 'gradient', and takes the next batch. These updates are
   // asynchronous, and behaviour is non-deterministic.
   virtual void Gradient(const Weights &weights, Weights *gradient) const;

   // Adds the gradient wrt 'weight_scale * weights' for 'example' to the vector
   // 'gradient' in place. The gradient is scaled by 'example_scale'.
   virtual void AddExampleGradient(
       const Weights &weights, int example, float weights_scale,
       float example_scale, Weights *gradient) const {
     loss_function_->AddExampleGradient(
         weights, instances_, labels_, example, weights_scale, example_scale,
         gradient);
   }

   // Returns the gradient wrt 'weights' as a vector<pair<int, float>> rather
   // than Eigen::SparseVector<float>, since Eigen is very inefficient with
   // sparse vectors. This is only necessary if running SGDAdaGrad.
   virtual void ExampleGradient(
       const Weights &weights, int example, float weights_scale,
       float example_scale,
       std::vector<std::pair<int, float>> *example_gradient) const {
     loss_function_->ExampleGradient(
         weights, instances_, labels_, example, weights_scale, example_scale,
         example_gradient);
   }

   // Sets the number of threads and batch size for multithreaded computation.
   // Assigns examples to 'batch_examples_'.
   void set_num_threads(int num_threads, int batch_size);

   // Convenience method that sets the number of batches to a default.
   void set_num_threads(int num_threads) {
     int batch_size = static_cast<int>(NumExamples() / num_threads / 2);
     set_num_threads(num_threads, batch_size);
   }

   // Returns the number of examples in the dataset.
   virtual int NumExamples() const { return instances_.rows(); }

   // Returns the number of features.
   virtual int NumFeatures() const { return instances_.cols(); }

   // Returns the number of weights for the given number of features.
   virtual int NumWeights() const {
     return loss_function_->NumWeights(NumFeatures());
   }

   // Returns the number of batches for multithreaded computation.
   int num_batches() const { return num_batches_; }

   // Returns the number of threads.
   int num_threads() const { return num_threads_; }

   // Returns an upper bound on the curvature of the loss function. Used to set
   // the learning rate of some LossMinimizer algorithms.
   virtual float LossCurvature() const {
     return loss_function_->LossCurvature(instances_);
   }

   // Returns the per-coordinate curvature of the data. Used to set the learning
   // rates of ParallelBoostingWithMomentum.
   virtual void PerCoordinateCurvature(
       VectorXf *per_coordinate_curvature) const {
     loss_function_->PerCoordinateCurvature(instances_,
                                            per_coordinate_curvature);
   }

   // Returns sparsity, defined as the maximum instance l0 norm. Used to help
   // set learning rates in ParallelBoostingWithMomentum.
   float Sparsity() const {
     typename Instance::Index sparsity = 0;
     for (int i = 0; i < instances_.rows(); ++i) {
       sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
     }
     return static_cast<float>(sparsity);
   }

   // Returns the loss function.
   const LossFunction *loss_function() const { return loss_function_; }

   // Returns the instances.
   const InstanceSet &instances() const { return instances_; }

   // Returns the labels.
   const LabelSet &labels() const { return labels_; }

   // Returns a list of examples in batch 'batch'.
   const std::vector<int> &batch_examples(int batch) const {
     return batch_examples_[batch];
   }

  protected:
   /*
   // Computes the loss for a single example 'example' and returns it in
   // 'loss_values[example_index]'.
   virtual void ThreadLoss(
       const Weights *weights, int example, VectorXf *loss_values,
       BlockingCounter *num_jobs_remaining) const;

   // Computes the gradient for examples in batch 'batch', adds it to 'gradient'.
   virtual void ThreadGradient(
       const Weights *weights, int batch, Weights *gradient,
       BlockingCounter *num_jobs_remaining) const;
   */

  private:
   // Training instances.
   const InstanceSet &instances_;

   // Instance labels.
   const LabelSet &labels_;

   // Function for computing the loss and gradient of a single training example.
   // Not owned.
   const LossFunction *loss_function_;

   // Mutex for multithreaded gradient computation.
   mutable std::mutex gradient_update_mutex_;

   // Number of threads used to parallelize gradient computation.
   // If num_threads_ = 1, multithreading is not used.
   int num_threads_ = 1;

   // List of examples assigned to each thread.
   std::vector<std::vector<int>> batch_examples_;

   // Number of batches.
   int num_batches_;
 };

 }  // namespace lossmin
	// Copyright 2014 Google Inc. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.
	// Author: nevena@google.com (Nevena Lazic)
	//
	// GradientEvaluator is a class for computing the value and gradient of a loss
	// LossFunction on a labeled dataset {(instance_i, label_i)}, given parameters
	// 'weights'. Its methods are called by gradient descent algorithms implementing
	// the LossMinimizer interface.

	#pragma once

	#include <algorithm>
	#include <mutex>
	#include <string>
	#include <vector>

	#include "lossmin/eigen-types.h"
	#include "lossmin/losses/loss-function.h"

	class BlockingCounter;

	namespace lossmin {

	class GradientEvaluator {
	public:
	// Constructor sets up the dataset and the loss function.
	GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
	const LossFunction *loss_function)
	: instances_(instances), labels_(labels),
	loss_function_(loss_function) {}

	virtual ~GradientEvaluator() {}

	// Returns the loss for given parameters 'weights'. Multi-threading is used
	// if num_threads_ > 1.
	virtual float Loss(const Weights &weights) const;

	// Returns the loss for given parameters 'weights' and a subset of examples
	// 'example_indices'.
	virtual float Loss(const Weights &weights,
	const std::vector<int> &example_indices) const;

	// Returns the loss for given parameters 'weights' and a different
	// dataset (typically used for validation).
	virtual float Loss(
	const Weights &weights, const InstanceSet &validation_instances,
	const LabelSet &validation_labels) const;

	// Computes the gradient wrt the given parameters 'weights'. 'gradient' is
	// owned by the caller and should be initialized to zero.
	// Multithreading is used if 'num_threads' > 1. The training examples are
	// divided into 'num_batches' batches; each thread computes the gradient of a
	// batch, adds it to 'gradient', and takes the next batch. These updates are
	// asynchronous, and behaviour is non-deterministic.
	virtual void Gradient(const Weights &weights, Weights *gradient) const;

	// Adds the gradient wrt 'weight_scale * weights' for 'example' to the vector
	// 'gradient' in place. The gradient is scaled by 'example_scale'.
	virtual void AddExampleGradient(
	const Weights &weights, int example, float weights_scale,
	float example_scale, Weights *gradient) const {
	loss_function_->AddExampleGradient(
	weights, instances_, labels_, example, weights_scale, example_scale,
	gradient);
	}

	// Returns the gradient wrt 'weights' as a vector<pair<int, float>> rather
	// than Eigen::SparseVector<float>, since Eigen is very inefficient with
	// sparse vectors. This is only necessary if running SGDAdaGrad.
	virtual void ExampleGradient(
	const Weights &weights, int example, float weights_scale,
	float example_scale,
	std::vector<std::pair<int, float>> *example_gradient) const {
	loss_function_->ExampleGradient(
	weights, instances_, labels_, example, weights_scale, example_scale,
	example_gradient);
	}

	// Sets the number of threads and batch size for multithreaded computation.
	// Assigns examples to 'batch_examples_'.
	void set_num_threads(int num_threads, int batch_size);

	// Convenience method that sets the number of batches to a default.
	void set_num_threads(int num_threads) {
	int batch_size = static_cast<int>(NumExamples() / num_threads / 2);
	set_num_threads(num_threads, batch_size);
	}

	// Returns the number of examples in the dataset.
	virtual int NumExamples() const { return instances_.rows(); }

	// Returns the number of features.
	virtual int NumFeatures() const { return instances_.cols(); }

	// Returns the number of weights for the given number of features.
	virtual int NumWeights() const {
	return loss_function_->NumWeights(NumFeatures());
	}

	// Returns the number of batches for multithreaded computation.
	int num_batches() const { return num_batches_; }

	// Returns the number of threads.
	int num_threads() const { return num_threads_; }

	// Returns an upper bound on the curvature of the loss function. Used to set
	// the learning rate of some LossMinimizer algorithms.
	virtual float LossCurvature() const {
	return loss_function_->LossCurvature(instances_);
	}

	// Returns the per-coordinate curvature of the data. Used to set the learning
	// rates of ParallelBoostingWithMomentum.
	virtual void PerCoordinateCurvature(
	VectorXf *per_coordinate_curvature) const {
	loss_function_->PerCoordinateCurvature(instances_,
	per_coordinate_curvature);
	}

	// Returns sparsity, defined as the maximum instance l0 norm. Used to help
	// set learning rates in ParallelBoostingWithMomentum.
	float Sparsity() const {
	typename Instance::Index sparsity = 0;
	for (int i = 0; i < instances_.rows(); ++i) {
	sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
	}
	return static_cast<float>(sparsity);
	}

	// Returns the loss function.
	const LossFunction *loss_function() const { return loss_function_; }

	// Returns the instances.
	const InstanceSet &instances() const { return instances_; }

	// Returns the labels.
	const LabelSet &labels() const { return labels_; }

	// Returns a list of examples in batch 'batch'.
	const std::vector<int> &batch_examples(int batch) const {
	return batch_examples_[batch];
	}

	protected:
	/*
	// Computes the loss for a single example 'example' and returns it in
	// 'loss_values[example_index]'.
	virtual void ThreadLoss(
	const Weights weights, int example, VectorXf loss_values,
	BlockingCounter *num_jobs_remaining) const;

	// Computes the gradient for examples in batch 'batch', adds it to 'gradient'.
	virtual void ThreadGradient(
	const Weights weights, int batch, Weights gradient,
	BlockingCounter *num_jobs_remaining) const;
	*/

	private:
	// Training instances.
	const InstanceSet &instances_;

	// Instance labels.
	const LabelSet &labels_;

	// Function for computing the loss and gradient of a single training example.
	// Not owned.
	const LossFunction *loss_function_;

	// Mutex for multithreaded gradient computation.
	mutable std::mutex gradient_update_mutex_;

	// Number of threads used to parallelize gradient computation.
	// If num_threads_ = 1, multithreading is not used.
	int num_threads_ = 1;

	// List of examples assigned to each thread.
	std::vector<std::vector<int>> batch_examples_;

	// Number of batches.
	int num_batches_;
	};

	} // namespace lossmin