Change the precision of the linear algebra library from float to double.
(Also move the instance transpose from ParallelBoostingWithMomentum into
GradientEvaluator, so it is computed once at construction.)
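
A minimal sketch of the caller-visible effect (num_features is a placeholder):

    #include <utility>
    #include <vector>
    #include "lossmin/eigen-types.h"

    // Weights is now VectorXd rather than VectorXf, and per-example
    // gradients are vector<pair<int, double>> rather than <int, float>.
    const int num_features = 10;
    lossmin::Weights weights = lossmin::Weights::Zero(num_features);
    std::vector<std::pair<int, double>> example_gradient;
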
Change-Id: I99a71e41f682ac6033d25a038f78190588d00684
diff --git a/lossmin/eigen-types.h b/lossmin/eigen-types.h
index 47df398..6cfd72a 100644
--- a/lossmin/eigen-types.h
+++ b/lossmin/eigen-types.h
@@ -12,14 +12,17 @@
// Arrays.
typedef Eigen::Array<bool, Eigen::Dynamic, 1> ArrayXb;
typedef Eigen::ArrayXf ArrayXf;
+typedef Eigen::ArrayXd ArrayXd;
typedef Eigen::ArrayXi ArrayXi;
// Vectors.
typedef Eigen::VectorXf VectorXf;
+typedef Eigen::VectorXd VectorXd;
typedef Eigen::VectorXi VectorXi;
// Sparse Vectors.
typedef Eigen::SparseVector<float> SparseVectorXf;
+typedef Eigen::SparseVector<double> SparseVectorXd;
// Matrix.
typedef Eigen::Matrix<
@@ -32,20 +35,25 @@
Eigen::Dynamic,
Eigen::Dynamic,
Eigen::RowMajor> RowMatrixXf;
-
-// Sparse Matrix.
-typedef Eigen::SparseMatrix<float> SparseMatrixXf;
+typedef Eigen::Matrix<
+ double,
+ Eigen::Dynamic,
+ Eigen::Dynamic,
+ Eigen::ColMajor> MatrixXd;
+typedef Eigen::Matrix<
+ double,
+ Eigen::Dynamic,
+ Eigen::Dynamic,
+ Eigen::RowMajor> RowMatrixXd;
// Instances and parameters.
-typedef VectorXf Weights;
+typedef VectorXd Weights;
-typedef VectorXf Label;
-typedef RowMatrixXf LabelSet;
+typedef VectorXd Label;
+typedef RowMatrixXd LabelSet;
-typedef Eigen::SparseVector<float> Instance;
-typedef Eigen::SparseMatrix<float, Eigen::RowMajor> InstanceSet;
-typedef Eigen::SparseMatrix<float, Eigen::ColMajor> SparseMatrixXf;
-typedef Eigen::SparseVector<float> SparseVectorXf;
+typedef Eigen::SparseVector<double> Instance;
+typedef Eigen::SparseMatrix<double, Eigen::RowMajor> InstanceSet;
+typedef Eigen::SparseMatrix<double, Eigen::ColMajor> SparseMatrixXd;
} // namespace lossmin
-
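
For orientation, a minimal sketch of filling the new double-precision
containers (the sizes and values are illustrative, not from this change):

    #include <vector>
    #include "lossmin/eigen-types.h"

    // Builds a tiny double-precision dataset: 2 examples, 3 features.
    // InstanceSet is SparseMatrix<double, RowMajor>, one row per example;
    // LabelSet is RowMatrixXd, one label per example.
    void BuildToyData(lossmin::InstanceSet *instances,
                      lossmin::LabelSet *labels) {
      instances->resize(2, 3);
      std::vector<Eigen::Triplet<double>> entries = {
          {0, 0, 1.0}, {0, 2, -2.5}, {1, 1, 0.5}};
      instances->setFromTriplets(entries.begin(), entries.end());
      labels->resize(2, 1);
      *labels << 1.0, -1.0;
    }
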
diff --git a/lossmin/losses/inner-product-loss-function.cc b/lossmin/losses/inner-product-loss-function.cc
index 2d24293..7011143 100644
--- a/lossmin/losses/inner-product-loss-function.cc
+++ b/lossmin/losses/inner-product-loss-function.cc
@@ -8,37 +8,37 @@
namespace lossmin {
-float InnerProductLossFunction::LossCurvature(
+double InnerProductLossFunction::LossCurvature(
const InstanceSet &instances) const {
- float data_curvature = (instances.cwiseProduct(instances) *
+ double data_curvature = (instances.cwiseProduct(instances) *
Weights::Ones(instances.cols())).maxCoeff();
return curvature_ * data_curvature;
}
void InnerProductLossFunction::PerCoordinateCurvature(
- const InstanceSet &instances, VectorXf *per_coordinate_curvature) const {
+ const InstanceSet &instances, VectorXd *per_coordinate_curvature) const {
*per_coordinate_curvature =
- VectorXf::Ones(instances.rows()).transpose() *
+ VectorXd::Ones(instances.rows()).transpose() *
instances.cwiseProduct(instances) / instances.rows();
*per_coordinate_curvature *= curvature_;
}
-float InnerProductLossFunction::ExampleLoss(
+double InnerProductLossFunction::ExampleLoss(
const Weights &weights, const InstanceSet &instances,
const LabelSet &labels, int example) const {
- float inner_product = instances.innerVector(example).dot(weights);
+ double inner_product = instances.innerVector(example).dot(weights);
return InnerProductExampleLoss(inner_product, labels.coeff(example, 0));
}
void InnerProductLossFunction::AddExampleGradient(
const Weights &weights, const InstanceSet &instances,
- const LabelSet &labels, int example, float weights_scale,
- float example_scale, Weights *gradient) const {
- float inner_product = instances.innerVector(example).dot(weights);
- if (weights_scale != 1.0f) inner_product *= weights_scale;
- float inner_product_gradient =
+ const LabelSet &labels, int example, double weights_scale,
+ double example_scale, Weights *gradient) const {
+ double inner_product = instances.innerVector(example).dot(weights);
+ if (weights_scale != 1.0) inner_product *= weights_scale;
+ double inner_product_gradient =
InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
- if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+ if (example_scale != 1.0) inner_product_gradient *= example_scale;
if (synchronous_update()) {
std::lock_guard<std::mutex> lock(gradient_update_mutex_);
@@ -54,14 +54,14 @@
void InnerProductLossFunction::ExampleGradient(
const Weights &weights, const InstanceSet &instances,
- const LabelSet &labels, int example, float weights_scale,
- float example_scale,
- std::vector<std::pair<int, float>> *example_gradient) const {
- float inner_product = instances.innerVector(example).dot(weights);
- if (weights_scale != 1.0f) inner_product *= weights_scale;
- float inner_product_gradient =
+ const LabelSet &labels, int example, double weights_scale,
+ double example_scale,
+ std::vector<std::pair<int, double>> *example_gradient) const {
+ double inner_product = instances.innerVector(example).dot(weights);
+ if (weights_scale != 1.0) inner_product *= weights_scale;
+ double inner_product_gradient =
InnerProductExampleGradient(inner_product, labels.coeff(example, 0));
- if (example_scale != 1.0f) inner_product_gradient *= example_scale;
+ if (example_scale != 1.0) inner_product_gradient *= example_scale;
example_gradient->resize(instances.row(example).nonZeros());
int i = 0;
@@ -76,11 +76,11 @@
const Weights &weights, const InstanceSet &instances,
LabelSet *labels) const {
// Compute inner products.
- VectorXf inner_products = instances * weights;
+ VectorXd inner_products = instances * weights;
// Assign labels by calling InnerProductPredictLabel coefficientwise on
// 'inner_products'.
- std::function<float(float)> assign_label_ptr =
+ std::function<double(double)> assign_label_ptr =
std::bind(&InnerProductLossFunction::InnerProductPredictLabel,
this, std::placeholders::_1);
*labels = inner_products.unaryExpr(assign_label_ptr);
diff --git a/lossmin/losses/inner-product-loss-function.h b/lossmin/losses/inner-product-loss-function.h
index d00cef2..284125d 100644
--- a/lossmin/losses/inner-product-loss-function.h
+++ b/lossmin/losses/inner-product-loss-function.h
@@ -30,51 +30,51 @@
class InnerProductLossFunction : public LossFunction {
public:
// Returns the loss for a single example.
- float ExampleLoss(
+ double ExampleLoss(
const Weights &weights, const InstanceSet &instances,
const LabelSet &labels, int example) const override;
// Adds the gradient of a single example to 'gradient'.
void AddExampleGradient(
const Weights &weights, const InstanceSet &instances,
- const LabelSet &labels, int example, float weights_scale,
- float example_scale, Weights *gradient) const override;
+ const LabelSet &labels, int example, double weights_scale,
+ double example_scale, Weights *gradient) const override;
// Returns the gradient of a single example.
void ExampleGradient(
const Weights &weights, const InstanceSet &instances,
- const LabelSet &labels, int example, float weights_scale,
- float example_scale,
- std::vector<std::pair<int, float>> *example_gradient) const override;
+ const LabelSet &labels, int example, double weights_scale,
+ double example_scale,
+ std::vector<std::pair<int, double>> *example_gradient) const override;
// Assigns labels to 'instances' given 'weights'.
void PredictLabels(const Weights &weights, const InstanceSet &instances,
LabelSet *labels) const override;
// Returns an upper bound on the loss curvature.
- float LossCurvature(const InstanceSet &instances) const override;
+ double LossCurvature(const InstanceSet &instances) const override;
// Returns an upper bound on the per-coordinate curvature.
void PerCoordinateCurvature(
const InstanceSet &instances,
- VectorXf *per_coordinate_curvature) const override;
+ VectorXd *per_coordinate_curvature) const override;
- virtual float InnerProductExampleLoss(float inner_product, float label)
+ virtual double InnerProductExampleLoss(double inner_product, double label)
const = 0;
- virtual float InnerProductExampleGradient(float inner_product, float label)
+ virtual double InnerProductExampleGradient(double inner_product, double label)
const = 0;
- virtual float InnerProductPredictLabel(float inner_product) const = 0;
+ virtual double InnerProductPredictLabel(double inner_product) const = 0;
// Returns 'curvature_'.
- virtual float InnerProductCurvature(float inner_product, float label) const {
+ virtual double InnerProductCurvature(double inner_product, double label) const {
return curvature_;
}
protected:
// Sets the upper bound on the curvature of the loss function.
- void set_curvature(float curvature) { curvature_ = curvature; }
+ void set_curvature(double curvature) { curvature_ = curvature; }
private:
// Mutex for synchronous updates of the gradient vector.
@@ -83,7 +83,7 @@
// Upper bound on the absolute value of the second derivative of the loss:
// |d^2 loss(x) / dx^2| <= curvature_, where 'x' is the inner product
// <instance, weights>. Should be set by derived classes.
- float curvature_;
+ double curvature_;
};
// Linear regression with squared error loss.
@@ -92,21 +92,21 @@
LinearRegressionLossFunction() { set_curvature(1.0); }
// Returns the squared error loss.
- float InnerProductExampleLoss(float inner_product, float label)
+ double InnerProductExampleLoss(double inner_product, double label)
const override {
return 0.5 * (inner_product - label) * (inner_product - label);
}
// Returns the gradient of the squared error loss wrt 'inner_product'.
- float InnerProductExampleGradient(float inner_product, float label)
+ double InnerProductExampleGradient(double inner_product, double label)
const override {
return inner_product - label;
}
// Assigns a label given 'inner_product'.
- float InnerProductPredictLabel(float inner_product) const override {
+ double InnerProductPredictLabel(double inner_product) const override {
return inner_product;
}
};
-} // namespace lossmin
\ No newline at end of file
+} // namespace lossmin
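
The double-valued hooks above are all a derived loss needs to implement. A
hypothetical subclass, shown only to illustrate the contract (a squared hinge
loss for labels in {-1, +1}; not part of this change):

    #include <algorithm>
    #include "lossmin/losses/inner-product-loss-function.h"

    // Loss 0.5 * max(0, 1 - label * x)^2; its second derivative wrt the
    // inner product x is bounded by 1.0, hence set_curvature(1.0).
    class SquaredHingeLossFunction : public lossmin::InnerProductLossFunction {
     public:
      SquaredHingeLossFunction() { set_curvature(1.0); }

      double InnerProductExampleLoss(double x, double label) const override {
        const double margin = std::max(0.0, 1.0 - label * x);
        return 0.5 * margin * margin;
      }

      double InnerProductExampleGradient(double x, double label) const override {
        return -label * std::max(0.0, 1.0 - label * x);
      }

      double InnerProductPredictLabel(double x) const override {
        return x >= 0.0 ? 1.0 : -1.0;
      }
    };
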
diff --git a/lossmin/losses/loss-function.cc b/lossmin/losses/loss-function.cc
index ca11228..f5bb3bf 100644
--- a/lossmin/losses/loss-function.cc
+++ b/lossmin/losses/loss-function.cc
@@ -9,10 +9,10 @@
namespace lossmin {
-float LossFunction::BatchLoss(
+double LossFunction::BatchLoss(
const Weights &weights, const InstanceSet &instances,
const LabelSet &labels) const {
- float loss = 0.0f;
+ double loss = 0.0;
for (int i = 0; i < instances.rows(); ++i) {
loss += ExampleLoss(weights, instances, labels, i);
}
@@ -23,7 +23,7 @@
const Weights &weights, const InstanceSet &instances,
const LabelSet &labels, Weights *gradient) const {
for (int i = 0; i < instances.rows(); ++i) {
- AddExampleGradient(weights, instances, labels, i, 1.0f, 1.0f, gradient);
+ AddExampleGradient(weights, instances, labels, i, 1.0, 1.0, gradient);
}
}
diff --git a/lossmin/losses/loss-function.h b/lossmin/losses/loss-function.h
index b02aff0..42a28d5 100644
--- a/lossmin/losses/loss-function.h
+++ b/lossmin/losses/loss-function.h
@@ -35,7 +35,7 @@
// Returns the loss for a single example (row 'example' of 'instances' and
// 'labels').
- virtual float ExampleLoss(
+ virtual double ExampleLoss(
const Weights &weights, const InstanceSet &instances,
const LabelSet &labels, int example) const = 0;
@@ -47,15 +47,15 @@
// protected by a mutex in the implementation.
virtual void AddExampleGradient(
const Weights &weights, const InstanceSet &instances,
- const LabelSet &labels, int example, float weights_scale,
- float example_scale, Weights *gradient) const = 0;
+ const LabelSet &labels, int example, double weights_scale,
+ double example_scale, Weights *gradient) const = 0;
// Returns the gradient of the loss for a single example. Used in AdaGrad.
virtual void ExampleGradient(
const Weights &weights, const InstanceSet &instances,
- const LabelSet &labels, int example, float weights_scale,
- float example_scale,
- std::vector<std::pair<int, float>> *example_gradient) const = 0;
+ const LabelSet &labels, int example, double weights_scale,
+ double example_scale,
+ std::vector<std::pair<int, double>> *example_gradient) const = 0;
// Predicts 'labels' for 'instances' given 'weights'.
virtual void PredictLabels(
@@ -66,14 +66,14 @@
// Hessian matrix). Required by DeterministicGradientDescent. Optionally
// required by StochasticVarianceReducedGradient (for default learning rate)
// and StochasticGradientDescent for CURVATURE_BASED learning rate scheduling.
- virtual float LossCurvature(const InstanceSet &instances) const = 0;
+ virtual double LossCurvature(const InstanceSet &instances) const = 0;
// Returns an upper bound on the curvature of the loss along each coordinate
// (max absolute value of the second derivative) of the data. Required by
// ParallelBoostingWithMomentum.
virtual void PerCoordinateCurvature(
const InstanceSet &instances,
- VectorXf *per_coordinate_curvature) const = 0;
+ VectorXd *per_coordinate_curvature) const = 0;
// Initializes parameters to a suggested setting for this loss if appropriate.
virtual void Init(Weights *weights) const {}
@@ -84,7 +84,7 @@
// Returns the total loss for a set of examples. Default implementation runs
// through the examples and calls ExampleLoss on each.
- virtual float BatchLoss(const Weights &weights, const InstanceSet &instances,
+ virtual double BatchLoss(const Weights &weights, const InstanceSet &instances,
const LabelSet &labels) const;
// Returns the total gradient for a set of examples. Default implementation
@@ -108,5 +108,4 @@
//DISALLOW_COPY_AND_ASSIGN(LossFunction);
};
-
} // namespace lossmin
diff --git a/lossmin/minimizers/gradient-evaluator.cc b/lossmin/minimizers/gradient-evaluator.cc
index 32fafbc..0261b76 100644
--- a/lossmin/minimizers/gradient-evaluator.cc
+++ b/lossmin/minimizers/gradient-evaluator.cc
@@ -9,23 +9,23 @@
namespace lossmin {
-float GradientEvaluator::Loss(const Weights &weights) const {
+double GradientEvaluator::Loss(const Weights &weights) const {
// TODO(azani): Implement multi-threaded version.
return loss_function_->BatchLoss(weights, instances_, labels_) /
- NumExamples();
+ NumExamples();
}
-float GradientEvaluator::Loss(
- const Weights &weights, const InstanceSet &validation_instances,
- const LabelSet &validation_labels) const {
- return loss_function_->BatchLoss(
- weights, validation_instances, validation_labels) /
- validation_labels.rows();
+double GradientEvaluator::Loss(const Weights &weights,
+ const InstanceSet &validation_instances,
+ const LabelSet &validation_labels) const {
+ return loss_function_->BatchLoss(weights, validation_instances,
+ validation_labels) /
+ validation_labels.rows();
}
-float GradientEvaluator::Loss(
- const Weights &weights, const std::vector<int> &example_indices) const {
- float loss = 0.0f;
+double GradientEvaluator::Loss(const Weights &weights,
+ const std::vector<int> &example_indices) const {
+ double loss = 0.0;
for (int example : example_indices) {
loss += loss_function_->ExampleLoss(weights, instances_, labels_, example);
}
@@ -34,7 +34,7 @@
void GradientEvaluator::Gradient(const Weights &weights,
Weights *gradient) const {
- //DCHECK(gradient != nullptr);
+ // DCHECK(gradient != nullptr);
// TODO(azani): Implement multi-threaded version.
loss_function_->BatchGradient(weights, instances_, labels_, gradient);
*gradient /= NumExamples();
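
Per the header comment, 'gradient' is owned by the caller and must start at
zero, since BatchGradient accumulates into it. A minimal usage sketch (the
'evaluator' and 'weights' objects are assumed to exist):

    // Mean gradient over all examples; must be zero-initialized first.
    lossmin::Weights gradient = lossmin::Weights::Zero(weights.size());
    evaluator.Gradient(weights, &gradient);
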
diff --git a/lossmin/minimizers/gradient-evaluator.h b/lossmin/minimizers/gradient-evaluator.h
index d756759..4ae1748 100644
--- a/lossmin/minimizers/gradient-evaluator.h
+++ b/lossmin/minimizers/gradient-evaluator.h
@@ -27,25 +27,27 @@
// Constructor sets up the dataset and the loss function.
GradientEvaluator(const InstanceSet &instances, const LabelSet &labels,
const LossFunction *loss_function)
- : instances_(instances), labels_(labels),
+ : instances_(instances),
+ instances_transposed_(instances.transpose()),
+ labels_(labels),
loss_function_(loss_function) {}
virtual ~GradientEvaluator() {}
// Returns the loss for given parameters 'weights'. Multi-threading is used
// if num_threads_ > 1.
- virtual float Loss(const Weights &weights) const;
+ virtual double Loss(const Weights &weights) const;
// Returns the loss for given parameters 'weights' and a subset of examples
// 'example_indices'.
- virtual float Loss(const Weights &weights,
- const std::vector<int> &example_indices) const;
+ virtual double Loss(const Weights &weights,
+ const std::vector<int> &example_indices) const;
// Returns the loss for given parameters 'weights' and a different
// dataset (typically used for validation).
- virtual float Loss(
- const Weights &weights, const InstanceSet &validation_instances,
- const LabelSet &validation_labels) const;
+ virtual double Loss(const Weights &weights,
+ const InstanceSet &validation_instances,
+ const LabelSet &validation_labels) const;
// Computes the gradient wrt the given parameters 'weights'. 'gradient' is
// owned by the caller and should be initialized to zero.
@@ -57,24 +59,23 @@
// Adds the gradient wrt 'weight_scale * weights' for 'example' to the vector
// 'gradient' in place. The gradient is scaled by 'example_scale'.
- virtual void AddExampleGradient(
- const Weights &weights, int example, float weights_scale,
- float example_scale, Weights *gradient) const {
- loss_function_->AddExampleGradient(
- weights, instances_, labels_, example, weights_scale, example_scale,
- gradient);
+ virtual void AddExampleGradient(const Weights &weights, int example,
+ double weights_scale, double example_scale,
+ Weights *gradient) const {
+ loss_function_->AddExampleGradient(weights, instances_, labels_, example,
+ weights_scale, example_scale, gradient);
}
- // Returns the gradient wrt 'weights' as a vector<pair<int, float>> rather
- // than Eigen::SparseVector<float>, since Eigen is very inefficient with
+ // Returns the gradient wrt 'weights' as a vector<pair<int, double>> rather
+ // than Eigen::SparseVector<double>, since Eigen is very inefficient with
// sparse vectors. This is only necessary if running SGDAdaGrad.
virtual void ExampleGradient(
- const Weights &weights, int example, float weights_scale,
- float example_scale,
- std::vector<std::pair<int, float>> *example_gradient) const {
- loss_function_->ExampleGradient(
- weights, instances_, labels_, example, weights_scale, example_scale,
- example_gradient);
+ const Weights &weights, int example, double weights_scale,
+ double example_scale,
+ std::vector<std::pair<int, double>> *example_gradient) const {
+ loss_function_->ExampleGradient(weights, instances_, labels_, example,
+ weights_scale, example_scale,
+ example_gradient);
}
// Returns the number of examples in the dataset.
@@ -90,26 +91,26 @@
// Returns an upper bound on the curvature of the loss function. Used to set
// the learning rate of some LossMinimizer algorithms.
- virtual float LossCurvature() const {
+ virtual double LossCurvature() const {
return loss_function_->LossCurvature(instances_);
}
// Returns the per-coordinate curvature of the data. Used to set the learning
// rates of ParallelBoostingWithMomentum.
virtual void PerCoordinateCurvature(
- VectorXf *per_coordinate_curvature) const {
+ VectorXd *per_coordinate_curvature) const {
loss_function_->PerCoordinateCurvature(instances_,
per_coordinate_curvature);
}
// Returns sparsity, defined as the maximum instance l0 norm. Used to help
// set learning rates in ParallelBoostingWithMomentum.
- float Sparsity() const {
+ double Sparsity() const {
typename Instance::Index sparsity = 0;
for (int i = 0; i < instances_.rows(); ++i) {
sparsity = std::max(sparsity, instances_.innerVector(i).nonZeros());
}
- return static_cast<float>(sparsity);
+ return static_cast<double>(sparsity);
}
// Returns the loss function.
@@ -118,6 +119,11 @@
// Returns the instances.
const InstanceSet &instances() const { return instances_; }
+ // Returns the transpose of instances.
+ const InstanceSet &instances_transposed() const {
+ return instances_transposed_;
+ }
+
// Returns the labels.
const LabelSet &labels() const { return labels_; }
@@ -125,6 +131,11 @@
// Training instances.
const InstanceSet &instances_;
+ // The transpose of instances. It is needed for fast gradient computations
+ // and should be computed only once, so it is computed at construction (not
+ // each time the gradient is computed).
+ const InstanceSet instances_transposed_;
+
// Instance labels.
const LabelSet &labels_;
@@ -134,4 +145,3 @@
};
} // namespace lossmin
-
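
Because the transpose is now cached at construction, a full least-squares
gradient takes just two sparse products, as
ParallelBoostingWithMomentum::SparseInnerProductGradient does below. A sketch
(assuming 'evaluator' and 'weights' as above):

    // gradient = X^T * (X * w - y) / num_examples, with X^T precomputed.
    lossmin::Weights residual = evaluator.instances() * weights;
    residual -= evaluator.labels();
    lossmin::Weights gradient = evaluator.instances_transposed() * residual;
    gradient /= evaluator.NumExamples();
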
diff --git a/lossmin/minimizers/loss-minimizer.cc b/lossmin/minimizers/loss-minimizer.cc
index 45ea3d6..a4dc58f 100644
--- a/lossmin/minimizers/loss-minimizer.cc
+++ b/lossmin/minimizers/loss-minimizer.cc
@@ -11,7 +11,7 @@
namespace lossmin {
bool LossMinimizer::Run(int max_epochs, int loss_epochs, int convergence_epochs,
- Weights *weights, std::vector<float> *loss) {
+ Weights *weights, std::vector<double> *loss) {
// Run for up to 'max_epochs' epochs.
int epoch;
for (epoch = 0; epoch < max_epochs; ++epoch) {
@@ -57,11 +57,11 @@
}
}
-void LossMinimizer::SimpleConvergenceCheck(const std::vector<float> &loss) {
+void LossMinimizer::SimpleConvergenceCheck(const std::vector<double> &loss) {
// Check convergence by verifying that the max relative loss decrease
// (loss[t-1] - loss[t]) / loss[t-1] is below 'simple_convergence_threshold_'.
if (loss.size() > num_convergence_epochs_) {
- float loss_difference = 0.0f;
+ double loss_difference = 0.0;
for (int i = loss.size() - num_convergence_epochs_; i < loss.size(); ++i) {
if (loss[i - 1] > 0) {
loss_difference = std::max(loss_difference, 1 - loss[i] / loss[i - 1]);
diff --git a/lossmin/minimizers/loss-minimizer.h b/lossmin/minimizers/loss-minimizer.h
index eb87947..4347b48 100644
--- a/lossmin/minimizers/loss-minimizer.h
+++ b/lossmin/minimizers/loss-minimizer.h
@@ -17,15 +17,15 @@
// regularization parameters. StochasticGradientDescent requires an
// additional parameter for learning rate scheduling.
//
-// float l1 = ...
-// float l2 = ...
+// double l1 = ...
+// double l2 = ...
// DeterministicGradientDescent loss_minimizer(l1, l2, gradient_evaluator);
//
// 3. Run optimization for up to 'max_epochs' epochs. 'loss' is filled with
// loss values across epochs, and 'weights' contains the best parameters.
//
// Weights weights = Weights::Zero(num_features); // or other initialization
-// vector<float> loss;
+// vector<double> loss;
// int max_epochs = 100;
// bool converged = loss_minimizer.Run(max_epochs, &weights, &loss);
//
@@ -60,7 +60,7 @@
public:
// Constructor sets the l1 and l2 regularization parameters and
// 'gradient_evalutor_'.
- LossMinimizer(float l1, float l2, const GradientEvaluator &gradient_evaluator)
+ LossMinimizer(double l1, double l2, const GradientEvaluator &gradient_evaluator)
: l1_(l1),
l2_(l2),
gradient_evaluator_(gradient_evaluator),
@@ -79,11 +79,11 @@
// last epoch. 'loss' is filled with training loss values produced every
// 'loss_epochs' epochs. Convergence is checked every 'convergence_epochs'.
bool Run(int max_epochs, int loss_epochs, int convergence_epochs,
- Weights *weights, std::vector<float> *loss);
+ Weights *weights, std::vector<double> *loss);
// Convenience Run method that evaluates the loss and checks for convergence
// at every iteration.
- bool Run(int max_epochs, Weights *weights, std::vector<float> *loss) {
+ bool Run(int max_epochs, Weights *weights, std::vector<double> *loss) {
return Run(max_epochs, 1, 1, weights, loss);
}
@@ -93,8 +93,8 @@
// Returns the total loss for given parameters 'weights', including l1 and l2
// regularization.
- virtual float Loss(const Weights &weights) const {
- float loss = gradient_evaluator_.Loss(weights);
+ virtual double Loss(const Weights &weights) const {
+ double loss = gradient_evaluator_.Loss(weights);
if (l2_ > 0.0f) loss += 0.5 * l2_ * weights.squaredNorm();
if (l1_ > 0.0f) loss += l1_ * weights.cwiseAbs().sum();
return loss;
@@ -109,7 +109,7 @@
// Checks convergence based on the decrease in loss over the last
// 'num_convergence_epochs_' epochs. If converged, the flag 'converged_' is
// set to true.
- void SimpleConvergenceCheck(const std::vector<float> &loss);
+ void SimpleConvergenceCheck(const std::vector<double> &loss);
// Setters and getters for convergence criteria parameters.
bool converged() const { return converged_; }
@@ -121,21 +121,21 @@
void set_use_simple_convergence_check(bool use_simple_convergence_check) {
use_simple_convergence_check_ = use_simple_convergence_check;
}
- float convergence_threshold() const { return convergence_threshold_; }
- void set_convergence_threshold(float convergence_threshold) {
+ double convergence_threshold() const { return convergence_threshold_; }
+ void set_convergence_threshold(double convergence_threshold) {
convergence_threshold_ = convergence_threshold;
}
- float simple_convergence_threshold() const {
+ double simple_convergence_threshold() const {
return simple_convergence_threshold_;
}
- void set_simple_convergence_threshold(float simple_convergence_threshold) {
+ void set_simple_convergence_threshold(double simple_convergence_threshold) {
simple_convergence_threshold_ = simple_convergence_threshold;
}
void set_num_convergence_epochs(int num_convergence_epochs) {
num_convergence_epochs_ = num_convergence_epochs;
}
- float zero_threshold() const { return zero_threshold_; }
- void set_zero_threshold(float zero_threshold) {
+ double zero_threshold() const { return zero_threshold_; }
+ void set_zero_threshold(double zero_threshold) {
zero_threshold_ = zero_threshold;
}
@@ -145,18 +145,18 @@
}
// Getter/setter of the l1 regularization parameter.
- float l1() const { return l1_; }
- void set_l1(float l1) { l1_ = l1; }
+ double l1() const { return l1_; }
+ void set_l1(double l1) { l1_ = l1; }
// Getter/setter of the l2 regularization parameter.
- float l2() const { return l2_; }
- void set_l2(float l2) { l2_ = l2; }
+ double l2() const { return l2_; }
+ void set_l2(double l2) { l2_ = l2; }
// Returns the number of iterations the last time Run() was executed
int num_epochs_run() const { return num_epochs_run_; }
// Applies L1Prox coefficientwise to 'weights' and 'threshold'.
- static void L1Prox(float threshold, Weights *weights) {
+ static void L1Prox(double threshold, Weights *weights) {
for (int i = 0; i < weights->size(); ++i) {
weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold);
}
@@ -164,35 +164,35 @@
// Applies L1Prox coefficientwise to 'weights' and 'threshold', where
// 'threshold' is a vector of per-coordinate thresholds.
- static void L1Prox(const VectorXf &threshold, Weights *weights) {
+ static void L1Prox(const VectorXd &threshold, Weights *weights) {
for (int i = 0; i < weights->size(); ++i) {
weights->coeffRef(i) = L1Prox(weights->coeff(i), threshold.coeff(i));
}
}
// Returns sign('x') * max(0.0, abs('x') - 'threshold').
- static inline float L1Prox(float x, float threshold) {
- return Sign(x) * std::max(std::abs(x) - threshold, 0.0f);
+ static inline double L1Prox(double x, double threshold) {
+ return Sign(x) * std::max(std::abs(x) - threshold, 0.0);
}
// Returns sign('x').
- static inline float Sign(float x) {
- if (x > 0.0f) return 1.0f;
- if (x < 0.0f) return -1.0f;
- return 0.0f;
+ static inline double Sign(double x) {
+ if (x > 0.0) return 1.0;
+ if (x < 0.0) return -1.0;
+ return 0.0;
}
private:
// Regularization parameters.
- float l1_;
- float l2_;
+ double l1_;
+ double l2_;
// GradientEvaluator used to compute the (unregularized) loss and gradient.
const GradientEvaluator &gradient_evaluator_;
// Convergence parameters.
// Convergence threshold should be strict but not too strict.
- // This will depend on precision used. As float gives 1e-8 relative accuracy,
+ // This will depend on precision used. Although double gives ~1e-16 relative accuracy,
// 1e-6 or 1e-7 is probably strictest one should use (but this also depends
// on the implementation of convergence checks).
// This can also be updated during initialization of the minimizer so the
@@ -201,16 +201,16 @@
bool reached_solution_ =
false; // flag indicating whether the algorithm
// actually reached the solution as determined by ConvergenceCheck
- float convergence_threshold_ =
- 1e-5; // threshold for assessing convergence by ConvergenceCheck
- float simple_convergence_threshold_ =
+ double convergence_threshold_ =
+ 1e-7; // threshold for assessing convergence by ConvergenceCheck
+ double simple_convergence_threshold_ =
1e-5; // threshold for assessing convergence by SimpleConvergenceCheck
bool use_simple_convergence_check_ = false; // which convergence check to use
int num_convergence_epochs_ = 5; // used in SimpleConvergenceCheck
// zero_threshold_ is the threshold below which we treat the coordinate value
// as zero (in absolute terms). This is used in ConvergenceCheck.
- float zero_threshold_ = 1e-6;
+ double zero_threshold_ = 1e-6;
// The number of epochs (iterations) run the last time Run() was executed.
// In other words, each epoch is one step of the minimization.
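
For reference, the soft-thresholding applied by L1Prox,
sign(x) * max(0, |x| - threshold), in a tiny sketch (values illustrative):

    lossmin::Weights w(3);
    w << 0.8, -0.3, 1.5;
    lossmin::LossMinimizer::L1Prox(0.5, &w);  // w is now (0.3, 0.0, 1.0)
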
diff --git a/lossmin/minimizers/parallel-boosting-with-momentum.cc b/lossmin/minimizers/parallel-boosting-with-momentum.cc
index e3f25f6..3eb1848 100644
--- a/lossmin/minimizers/parallel-boosting-with-momentum.cc
+++ b/lossmin/minimizers/parallel-boosting-with-momentum.cc
@@ -16,8 +16,8 @@
void ParallelBoostingWithMomentum::Setup() {
compute_and_set_learning_rates();
- alpha_ = 0.5f;
- beta_ = 1.0f - alpha_;
+ alpha_ = 0.5;
+ beta_ = 1.0 - alpha_;
phi_center_ = Weights::Zero(gradient_evaluator().NumWeights());
}
@@ -25,7 +25,7 @@
// Per-coordinate learning rates are learning_rates[j] = 1 / (sparsity * Lj),
// where sparsity is the maximum instance l0 norm and Lj is upper bound on
// the loss curvature along coordinate j.
- float sparsity = gradient_evaluator().Sparsity();
+ double sparsity = gradient_evaluator().Sparsity();
gradient_evaluator().PerCoordinateCurvature(&learning_rates_);
learning_rates_ =
(learning_rates_.array() + l2()).inverse().matrix() / sparsity;
@@ -37,25 +37,25 @@
// TODO(bazyli): parallel matrix-vector multiply with OpenMP
Weights residual = gradient_evaluator().instances() * weights;
residual -= gradient_evaluator().labels();
- *gradient = instances_transpose_ * residual;
+ *gradient = gradient_evaluator().instances_transposed() * residual;
*gradient /= gradient_evaluator().NumExamples();
}
-float ParallelBoostingWithMomentum::Loss(const Weights &weights) const {
+double ParallelBoostingWithMomentum::Loss(const Weights &weights) const {
// The Eigen library recommends step-by-step computations for best performance.
// TODO(bazyli): parallel matrix-vector multiply with OpenMP
Weights residual = gradient_evaluator().instances() * weights;
residual -= gradient_evaluator().labels();
- float loss =
+ double loss =
0.5 * residual.squaredNorm() / gradient_evaluator().NumExamples();
- if (l2() > 0.0f) loss += 0.5 * l2() * weights.squaredNorm();
- if (l1() > 0.0f) loss += l1() * weights.cwiseAbs().sum();
+ if (l2() > 0.0) loss += 0.5 * l2() * weights.squaredNorm();
+ if (l1() > 0.0) loss += l1() * weights.cwiseAbs().sum();
return loss;
}
void ParallelBoostingWithMomentum::ConvergenceCheck(const Weights &weights,
const Weights &gradient) {
- float error_squared = 0.0f;
+ double error_squared = 0.0;
for (int i = 0; i < gradient.size(); i++) {
// for weights > 0 the gradient should be == -l1
if (weights(i) > zero_threshold()) {
@@ -67,7 +67,7 @@
}
// for weights == 0 the gradient should be between -l1 and l1
else {
- float err = std::max(std::abs(gradient(i)) - l1(), 0.0f);
+ double err = std::max(std::abs(gradient(i)) - l1(), 0.0);
error_squared += err * err;
}
}
@@ -81,25 +81,25 @@
void ParallelBoostingWithMomentum::EpochUpdate(Weights *weights, int epoch,
bool check_convergence) {
// Compute the intermediate weight vector y.
- Weights y = (1.0f - alpha_) * *weights + alpha_ * phi_center_;
+ Weights y = (1.0 - alpha_) * *weights + alpha_ * phi_center_;
// Compute the gradient of the loss (except l1 penalty) wrt y.
Weights gradient_wrt_y = Weights::Zero(y.size());
SparseInnerProductGradient(y, &gradient_wrt_y);
- if (l2() > 0.0f) gradient_wrt_y += l2() * y;
+ if (l2() > 0.0) gradient_wrt_y += l2() * y;
// Gradient step.
*weights -= gradient_wrt_y.cwiseProduct(learning_rates_);
// l1 shrinkage.
- if (l1() > 0.0f) {
+ if (l1() > 0.0) {
L1Prox(l1() * learning_rates_, weights);
}
// Update the approximation function.
phi_center_ -= (1.0 - alpha_) / alpha_ * (y - *weights);
alpha_ =
- -beta_ / 2.0 + pow(beta_ + beta_ * beta_ / 4.0, static_cast<float>(0.5));
+ -beta_ / 2.0 + pow(beta_ + beta_ * beta_ / 4.0, 0.5);
beta_ *= (1.0 - alpha_);
// Compute the gradient of the objective except the l1 part and check
diff --git a/lossmin/minimizers/parallel-boosting-with-momentum.h b/lossmin/minimizers/parallel-boosting-with-momentum.h
index 62e75e6..c86c11f 100644
--- a/lossmin/minimizers/parallel-boosting-with-momentum.h
+++ b/lossmin/minimizers/parallel-boosting-with-momentum.h
@@ -21,10 +21,9 @@
class ParallelBoostingWithMomentum : public LossMinimizer {
public:
- ParallelBoostingWithMomentum(float l1, float l2,
+ ParallelBoostingWithMomentum(double l1, double l2,
const GradientEvaluator &gradient_evaluator)
- : LossMinimizer(l1, l2, gradient_evaluator),
- instances_transpose_(gradient_evaluator.instances().transpose()) {
+ : LossMinimizer(l1, l2, gradient_evaluator) {
Setup();
}
@@ -53,7 +52,7 @@
// regularization. Uses sparse matrix multiply from Eigen.
// This is more efficient in terms of performed operations than calling
// gradient_evaluator().Loss(weights).
- float Loss(const Weights &weights) const override;
+ double Loss(const Weights &weights) const override;
// Computes the inner product gradient at |weights|, written to |*gradient|;
// |*gradient| must be of the same size as |weights|. Uses sparse matrix
@@ -69,17 +68,17 @@
// Following the paper exactly, phi_center_ should be equal to the
// initial guess for weights when Run() is executed (however, this requirement
// does not seem to be necessary for convergence in practice).
- void set_phi_center(const VectorXf &phi) { phi_center_ = phi; }
+ void set_phi_center(const VectorXd &phi) { phi_center_ = phi; }
// Computes the learning rates. This is introduced to enable recomputing
// learning rates in case l2 penalty changes.
void compute_and_set_learning_rates();
// Set alpha_; we may need to reset alpha before Run().
- void set_alpha(const float alpha) { alpha_ = alpha; }
+ void set_alpha(const double alpha) { alpha_ = alpha; }
// Set beta_; we may need to reset beta before Run().
- void set_beta(const float beta) { beta_ = beta; }
+ void set_beta(const double beta) { beta_ = beta; }
private:
// Run epoch (iteration) of the algorithm.
@@ -94,24 +93,19 @@
bool check_convergence) override;
// Per-coordinate learning rates.
- VectorXf learning_rates_;
+ VectorXd learning_rates_;
// Center of the approximating quadratic function phi.
- VectorXf phi_center_;
+ VectorXd phi_center_;
// Parameter for updating the approximation function phi. At each iteration,
// 'alpha_' is updated to the solution of the quadratic equation
// alpha_^2 = beta_ * (1.0 - alpha_)
- float alpha_;
+ double alpha_;
// Parameter used to update alpha, defined as
// beta_{epoch} = \prod_{i=1}^{epoch} (1 - alpha_i).
- float beta_;
-
- // The transpose of instances. It is needed for faster gradient computations.
- // It should be computed when (and only when) instances changes,
- // so it is computed at the construction of the minimizer.
- const InstanceSet instances_transpose_;
+ double beta_;
};
} // namespace lossmin
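
Putting it together under the new double-precision API, a hedged end-to-end
sketch (toy data; the l1/l2 values are arbitrary):

    #include <vector>
    #include "lossmin/eigen-types.h"
    #include "lossmin/losses/inner-product-loss-function.h"
    #include "lossmin/minimizers/gradient-evaluator.h"
    #include "lossmin/minimizers/parallel-boosting-with-momentum.h"

    int main() {
      // Illustrative data: 2 examples, 3 features (see the eigen-types.h note).
      lossmin::InstanceSet instances(2, 3);
      instances.insert(0, 0) = 1.0;
      instances.insert(1, 2) = -2.0;
      lossmin::LabelSet labels(2, 1);
      labels << 1.0, -1.0;

      lossmin::LinearRegressionLossFunction loss_function;
      lossmin::GradientEvaluator evaluator(instances, labels, &loss_function);
      lossmin::ParallelBoostingWithMomentum minimizer(/*l1=*/0.0, /*l2=*/0.1,
                                                      evaluator);

      lossmin::Weights weights = lossmin::Weights::Zero(instances.cols());
      std::vector<double> loss;
      bool converged = minimizer.Run(/*max_epochs=*/100, &weights, &loss);
      return converged ? 0 : 1;
    }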