// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/algorithms/rappor/lasso_runner.h"
#include <algorithm>
#include <glog/logging.h>
using cobalt_lossmin::GradientEvaluator;
using cobalt_lossmin::InstanceSet;
using cobalt_lossmin::LabelSet;
using cobalt_lossmin::ParallelBoostingWithMomentum;
using cobalt_lossmin::Weights;
namespace cobalt::rappor {
namespace {
// ***************************************************************************
// Constants used by the cobalt_lossmin minimizers in both steps of RAPPOR
// (LassoRunner::RunFirstRapporStep and LassoRunner::GetExactValuesAndStdErrs).
// They are meant to be generic so modify them with caution.
//
// kZeroThreshold is the (relative) proportion below which we assume a
// candidate count to be effectively zero; that is, we assume a candidate to
// be zero if its estimated count is below kZeroThreshold times
// bit_counter_.num_observations(). It cannot be exactly zero for
// performance reasons (an exact zero also makes little sense numerically),
// and smaller values can make individual runs of the minimizer harder.
// You can change kZeroThreshold to whatever value you think is best (smaller
// than what you would deem negligible); however, do not get close to double
// precision accuracy (1e-16). A reasonable value could be between 1e-4 and
// 1e-8.
constexpr double kZeroThreshold = 1e-6;
// kL2toL1Ratio is the ratio between the l2 and l1 penalties.
// Although pure lasso (i.e. l1-penalized linear regression) does not include
// any l2 penalty, a tiny bit can improve stability. A value of kL2toL1Ratio
// less than or equal to 1e-2 should not affect the interpretation of the
// solution.
constexpr double kL2toL1Ratio = 1e-3;
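// As a sketch (assuming the usual elastic-net form of the cobalt_lossmin
// objective, up to the library's exact scaling of the l2 term), the
// minimizers below solve problems of the form
//   min_w  loss(w) + l1 * ||w||_1 + l2 * ||w||_2^2,
// where l2 is derived from the l1 penalty via kL2toL1Ratio (see the l2
// computations in RunFirstRapporStep and GetExactValuesAndStdErrs).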
// kLossEpochs denotes how often the current objective value is recorded in
// the solver (it will be recorded every kLossEpochs epochs).
// kConvergenceMeasures denotes how often the minimizer checks convergence:
// the algorithm will check convergence every kConvergenceMeasures epochs and
// terminate if at least one of the following is true:
// 1. The minimizer reached the solution up to the accuracy defined by
// minimizer.convergence_threshold(). In this case,
// minimizer.reached_solution() == true and minimizer.converged() == true.
// 2. The algorithm stopped improving, as measured by the simple convergence
// check of the minimizer. In this case, minimizer.converged() == true.
// kLossEpochs and kConvergenceMeasures must be positive (and probably both
// not larger than 10). Also, kLossEpochs <= kConvergenceMeasures makes more
// sense.
constexpr int kLossEpochs = 5;
constexpr int kConvergenceMeasures = 5;
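// Both constants are passed directly to ParallelBoostingWithMomentum::Run as
// its loss-recording and convergence-check intervals (see the minimizer.Run
// calls in RunFirstRapporStep and GetExactValuesAndStdErrs below).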
// kAlpha is a constant from the parallel boosting with momentum paper and
// must satisfy 0 < kAlpha < 1; the cobalt_lossmin library's default initial
// choice is 0.5, but we need to be able to reset this value in the minimizer
// if needed.
constexpr double kAlpha = 0.5;
// kMinConvergenceThreshold is an absolute lower bound for the convergence
// thresholds in the algorithm; it should be within a couple of orders of
// magnitude of double precision accuracy (reasonable values lie between
// 1e-12 and 1e-14). This is introduced because convergence thresholds
// are computed relative to the initial gradient norm and so they might be
// too low if the initial guess is very good.
constexpr double kMinConvergenceThreshold = 1e-12;
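// Concretely, both steps below compute their absolute thresholds as
//   threshold = max(kMinConvergenceThreshold, kRelative... * ||g|| / n),
// where ||g|| is the 2-norm of the initial gradient, n is the number of
// candidates, and kRelative... is the relevant relative threshold (see the
// kConvergenceThreshold computations in both steps).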
//
// ***************************************************************************
// Constants of the lasso_runner that will be used in RunFirstRapporStep.
//
// Note(bazyli) about modifying constants:
// 1. To improve the accuracy of the solution you may want to
// set the convergence thresholds: kRelativeConvergenceThreshold,
// kRelativeInLassoPathConvergenceThreshold, and kSimpleConvergenceThreshold
// to smaller values, at the expense of more computations (epochs) performed.
// See their descriptions below.
// 2. You may want to modify kMaxEpochs if it is too strict (which may be the
// case if you set very small convergence thresholds), but kNumLassoSteps and
// kL1maxToL1minRatio should be quite generic so modify them with caution.
// 3. You may test whether a linear path (kUseLinearPath == true) or a
// logarithmic path (kUseLinearPath == false) is better for your case. The run
// times and result quality may differ slightly (see below).
//
// Define minimizer and lasso-path-specific parameters.
// kRelativeConvergenceThreshold is the improvement that we expect to achieve
// at convergence, relative to the initial gradient. That is, if ||g|| is the
// initial 2-norm of the gradient, we expect the final (i.e. after the
// last lasso subproblem) total measure of KKT violation to be equal to
// kRelativeConvergenceThreshold * ||g||. A value smaller than 1e-8 gives full
// (single precision) convergence and is probably overkill because the
// purpose of the first step is mostly to identify potential nonzero
// coefficients. Something between 1e-5 and 1e-7 should be enough.
// An even larger value can be chosen for performance reasons. This accuracy
// will be used only in the final lasso subproblem.
// kRelativeInLassoPathConvergenceThreshold has the same interpretation but
// will be used inside the lasso path (before the last subproblem). We
// probably want to set this to be slightly (e.g. ten times) less restrictive
// (larger) than kRelativeConvergenceThreshold for efficiency because the
// solutions inside the lasso path serve as a "warm-up" before the last
// subproblem; this subproblem, on the other hand, can be more accurate and
// should benefit more from "momentum" computations in the algorithm.
// kSimpleConvergenceThreshold == d means that the algorithm will stop if the
// best relative improvement between two consecutive measures of the objective
// within the last kConvergenceMeasures measures is below d. In other words,
// relative improvements smaller than d will cause the minimizer to return.
// See the descriptions of kConvergenceMeasures and kLossEpochs above. A
// rule-of-thumb value for kSimpleConvergenceThreshold is a small fraction of
// the inverse of kMaxEpochs below.
//
// Note: The actual absolute values of the convergence thresholds used in the
// algorithm are bounded below by min_convergence_threshold_ in case ||g|| is
// almost zero in the first place (see below).
// Note: If you are bumping into the "The lasso path did not reach the last
// subproblem." error in Analyze, it might mean that the convergence
// thresholds are too strict for the specified kMaxEpochs limit, given the
// difficulty of the problem.
constexpr double kRelativeConvergenceThreshold = 1e-5;
constexpr double kRelativeInLassoPathConvergenceThreshold = 1e-4;
constexpr double kSimpleConvergenceThreshold = 1e-5;
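// For example, with kMaxEpochs == 20000 below, 1.0 / kMaxEpochs == 5e-5, so
// kSimpleConvergenceThreshold == 1e-5 is one fifth of that inverse.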
// kMaxEpochs denotes the limit on the total number of epochs (iterations)
// run; the actual number can be up to two times larger because every
// individual lasso subproblem (run of the minimizer) has the same bound and
// the total epoch count is updated only after the subproblem is run.
// Note: If you are bumping into the "The lasso path did not reach the last
// subproblem." error in Analyze, it is possible that this number is too
// strict (low) for the convergence thresholds above.
constexpr int kMaxEpochs = 20000;
// kNumLassoSteps is the number of subproblems solved in the lasso path.
// More steps do not necessarily take more time: there should be a "sweet
// spot", and this number should definitely not be too small (probably
// something between 50 and 500).
constexpr int kNumLassoSteps = 100;
// kL1maxToL1minRatio is the ratio between the first and the last l1 penalty
// value in the lasso path; should probably be something between 1e-6 and
// 1e-3 (the R library glmnet uses 1e-3 by default).
constexpr double kL1maxToL1minRatio = 1e-3;
// kUseLinearPath specifies whether the lasso path should be linear (true) or
// logarithmic (false). A linear path means that the l1 penalties form an
// arithmetic sequence from largest to smallest; a logarithmic path means that
// the penalties form a geometric sequence. Note(bazyli): the linear path is
// more conservative in decreasing the penalties in the initial part of the
// lasso path. Since we are interested in the heavy hitters, we may prefer to
// add them slowly in this phase. The logarithmic path, on the other hand,
// decreases the penalty fast in the initial phase but more slowly towards the
// end of the lasso path. This is numerically more stable and may be faster,
// but it may introduce a lot of nonzero coefficients in the initial phase.
constexpr bool kUseLinearPath = true;
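// For illustration (matching the l1delta and l1mult updates in
// RunFirstRapporStep below), with N == kNumLassoSteps the k-th subproblem
// uses approximately the penalty
//   linear:      l1_k = l1max - k * (l1max - l1min) / N,
//   logarithmic: l1_k = l1max * kL1maxToL1minRatio^(k / N),
// so both schedules start near l1max and end at l1min.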
//
// ***************************************************************************
// Constants related to GetExactValuesAndStdErrs
//
// The convergence thresholds and
// kMaxEpochsSingleRun have the same definitions as the corresponding
// constants in RunFirstRapporStep. However, we may want the convergence
// thresholds to be more strict, because here we are interested in more exact
// values, and the problem to solve should be easier.
//
// Initialize the minimizer constants.
// Relative convergence thresholds have the same interpretation as the ones
// used in RunFirstRapporStep. Consult their description if you plan to modify
// the values.
constexpr double kRelativeConvergenceThreshold2Step = 1e-6;
constexpr double kSimpleConvergenceThreshold2Step = 1e-6;
// kNumRuns is the number of runs used to estimate the standard deviations of
// the coefficients. The problem will be solved kNumRuns times, each time
// with a slightly different right hand side.
// This number probably should not be smaller than 5, but it should not be too
// large either: a larger value gives a better approximation of the standard
// errors, but kNumRuns == n means that the whole problem will be solved n
// times, which affects the runtime.
constexpr int kNumRuns = 20;
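// The runs amount to a simple parametric-bootstrap-style estimate: each run
// re-solves the problem with the right hand side perturbed by Gaussian noise
// scaled by est_standard_errs, and the spread of the converged solutions
// across runs estimates the standard errors of the coefficients.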
// kMaxEpochsSingleRun is the maximum number of epochs in a single
// run of the algorithm, so the total number of epochs run will be bounded by
// kNumRuns * kMaxEpochsSingleRun.
constexpr int kMaxEpochsSingleRun = 5000;
// Note(bazyli): If the convergence thresholds are too small relative to
// kMaxEpochsSingleRun, then some of the runs of the algorithm may not
// converge. If fewer than 5 runs converge, the standard errors are set to 0.
// If none converges, then exact_est_candidate_weights is set to be the same
// as est_candidate_weights. This is a safeguard, and if it triggers we may
// want to adjust the constants so that it doesn't happen again (e.g.
// increase kMaxEpochsSingleRun).
} // namespace
LassoRunner::LassoRunner(const InstanceSet* matrix) : matrix_(matrix) {
zero_threshold_ = kZeroThreshold;
l2_to_l1_ratio_ = kL2toL1Ratio;
alpha_ = kAlpha;
min_convergence_threshold_ = kMinConvergenceThreshold;
num_lasso_steps_ = kNumLassoSteps;
l1_max_to_l1_min_ratio_ = kL1maxToL1minRatio;
use_linear_path_ = kUseLinearPath;
}
void LassoRunner::RunFirstRapporStep(const int max_nonzero_coeffs, const double max_solution_1_norm,
const LabelSet& as_label_set, Weights* est_candidate_weights,
std::vector<int>* second_step_cols) {
// Construct the lossmin minimizer objects.
// TODO(bazyli): remove this copy
InstanceSet candidate_matrix = *matrix_;
GradientEvaluator grad_eval(candidate_matrix, as_label_set);
ParallelBoostingWithMomentum minimizer(0.0, 0.0, grad_eval); // penalties will be set later.
// Initialize the solution vector to zero vector for the lasso path and
// compute the initial gradient (this will be used to get initial l1 penalty
// and convergence thresholds).
const int num_candidates = candidate_matrix.cols();
Weights initial_gradient = Weights::Zero(num_candidates);
grad_eval.SparseGradient(*est_candidate_weights, &initial_gradient);
// Set the minimizer absolute convergence constants.
const double initial_mean_gradient_norm = initial_gradient.norm() / num_candidates;
const double kConvergenceThreshold = std::max(
min_convergence_threshold_, kRelativeConvergenceThreshold * initial_mean_gradient_norm);
const double kInLassoPathConvergenceThreshold =
std::max(min_convergence_threshold_,
kRelativeInLassoPathConvergenceThreshold * initial_mean_gradient_norm);
// Set the constants for lasso path computations.
const double l1max = initial_gradient.array().abs().maxCoeff();
const double l1min = l1_max_to_l1_min_ratio_ * l1max;
const double l2 = l2_to_l1_ratio_ * l1min;
const double l1delta = (l1max - l1min) / num_lasso_steps_;
const double l1mult = std::exp(std::log(l1_max_to_l1_min_ratio_) / num_lasso_steps_);
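  // Note that l1mult^num_lasso_steps_ == l1_max_to_l1_min_ratio_, so repeated
  // multiplication by l1mult takes the penalty from l1max down to l1min.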
// Set up the minimizer.
minimizer.set_zero_threshold(zero_threshold_);
minimizer.set_convergence_threshold(kInLassoPathConvergenceThreshold);
minimizer.set_simple_convergence_threshold(kSimpleConvergenceThreshold);
minimizer.set_l2(l2);
  // The learning rates must be re-computed whenever the l2 penalty changes.
  minimizer.compute_and_set_learning_rates();
VLOG(4) << "Lasso in-path convergence threshold ==" << kInLassoPathConvergenceThreshold;
VLOG(4) << "Lasso final convergence threshold ==" << kConvergenceThreshold;
// Initialize variables to track the lasso path computations.
std::vector<double> loss_history;
double solution_1_norm = 0;
int total_epochs_run = 0;
int how_many_nonzero_coeffs = 0;
// Perform lasso path computations.
int i = 0;
double l1_this_step = use_linear_path_ ? l1max - l1delta : l1max * l1mult;
for (; i < num_lasso_steps_ && total_epochs_run < kMaxEpochs; i++) {
VLOG(4) << "Minimizing " << i << "-th Lasso subproblem";
if (how_many_nonzero_coeffs >= max_nonzero_coeffs || i == num_lasso_steps_ - 1 ||
solution_1_norm >= max_solution_1_norm) {
// Enter the final lasso subproblem.
minimizer.set_convergence_threshold(kConvergenceThreshold);
if (i < num_lasso_steps_ - 1) {
// If stopping criteria are met before reaching maximum number of steps,
// use l1 from previous run.
l1_this_step = use_linear_path_ ? l1_this_step + l1delta : l1_this_step / l1mult;
i = num_lasso_steps_ - 1;
}
VLOG(4) << "Entered last Run";
}
minimizer.set_l1(std::max(l1min, l1_this_step));
minimizer.set_reached_solution(false);
minimizer.set_converged(false);
// Set minimizer input as in the parallel boosting with momentum paper.
minimizer.set_phi_center(*est_candidate_weights);
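    // Re-setting alpha and beta == 1 - alpha gives every lasso subproblem the
    // same initial momentum parameters (see the note at kAlpha above).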
minimizer.set_alpha(alpha_);
minimizer.set_beta(1 - alpha_);
VLOG(4) << "The l1 penalty used == " << l1_this_step;
minimizer.Run(kMaxEpochs, kLossEpochs, kConvergenceMeasures, est_candidate_weights,
&loss_history);
// Compute the 1-norm of the current solution and the number of nonzero
// coefficients.
solution_1_norm = est_candidate_weights->lpNorm<1>();
how_many_nonzero_coeffs = (est_candidate_weights->array() > zero_threshold_).count();
VLOG(4) << "Ran " << minimizer.num_epochs_run() << " epochs in this step.";
VLOG(4) << "Num of nonzero coefficients found: " << how_many_nonzero_coeffs;
VLOG(4) << "Solution 1-norm == " << solution_1_norm;
total_epochs_run += minimizer.num_epochs_run();
l1_this_step = use_linear_path_ ? l1_this_step - l1delta : l1_this_step * l1mult;
}
VLOG(4) << "Ran " << total_epochs_run << " epochs in total.";
// Record minimizer info.
minimizer_data_.reached_last_lasso_subproblem = (i == num_lasso_steps_);
minimizer_data_.num_epochs_run = total_epochs_run;
minimizer_data_.converged = minimizer.converged();
minimizer_data_.reached_solution = minimizer.reached_solution();
minimizer_data_.l1 = minimizer.l1();
minimizer_data_.l2 = minimizer.l2();
minimizer_data_.convergence_threshold = kConvergenceThreshold;
minimizer_data_.zero_threshold = kZeroThreshold;
// Prepare the vector of columns for the second step of RAPPOR.
second_step_cols->clear();
second_step_cols->reserve(how_many_nonzero_coeffs);
for (int i = 0; i < est_candidate_weights->size(); i++) {
if (est_candidate_weights->coeff(i) > zero_threshold_) {
second_step_cols->push_back(i);
}
}
}
void LassoRunner::GetExactValuesAndStdErrs(const double l1, const Weights& est_candidate_weights,
const std::vector<double>& est_standard_errs,
const InstanceSet& instances,
const LabelSet& as_label_set,
Weights* exact_est_candidate_weights,
Weights* est_candidate_errors) {
  // Note(bazyli): see the note about convergence and kMaxEpochsSingleRun
  // above the constant definitions in the anonymous namespace.
const double l2 = l2_to_l1_ratio_ * l1;
const int num_candidates = est_candidate_weights.size();
const int num_labels = as_label_set.size();
const int min_num_converged_for_standard_errors = 5;
  // We record the number of runs in which the minimizer actually converged
  // (although if everything is fine, i.e. all constants have appropriate
  // values for the given case, this should be equal to kNumRuns at
  // completion).
  int num_converged = 0;
// We will need the solutions from all runs to compute the mean solution and
// standard errors.
std::vector<Weights> est_weights_runs; // vector for storing solutions from different runs
Weights mean_est_weights = Weights::Zero(num_candidates);
// In each run create a new_label_set by adding Gaussian noise to the original
// as_label_set.
for (int i = 0; i < kNumRuns; i++) {
LabelSet new_label_set = as_label_set;
for (int j = 0; j < num_labels; j++) {
std::normal_distribution<double> nrm_distr(0, static_cast<double>(est_standard_errs[j]));
double noise = nrm_distr(random_dev_);
new_label_set(j) += noise;
}
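    // At this point each entry of the new right hand side satisfies, in
    // distribution, new_label_set(j) ~ N(as_label_set(j), est_standard_errs[j]^2).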
// We will run the minimizer for the right hand side with noise
// (new_label_set). We always use the est_candidate_weights as the initial
// guess.
// Construct the minimizer and compute initial gradient.
Weights new_candidate_weights = est_candidate_weights;
std::vector<double> loss_history_not_used;
GradientEvaluator grad_eval(instances, new_label_set);
ParallelBoostingWithMomentum minimizer(l1, l2, grad_eval);
    Weights initial_gradient = Weights::Zero(num_candidates);
grad_eval.SparseGradient(new_candidate_weights, &initial_gradient);
double initial_mean_gradient_norm = initial_gradient.norm() / num_candidates;
double convergence_threshold =
std::max(min_convergence_threshold_,
kRelativeConvergenceThreshold2Step * initial_mean_gradient_norm);
// Set up and run the minimizer.
minimizer.set_converged(false);
minimizer.set_reached_solution(false);
minimizer.set_phi_center(new_candidate_weights);
minimizer.set_convergence_threshold(convergence_threshold);
minimizer.set_zero_threshold(zero_threshold_);
minimizer.set_simple_convergence_threshold(kSimpleConvergenceThreshold2Step);
minimizer.set_alpha(alpha_);
minimizer.set_beta(1 - alpha_);
minimizer.Run(kMaxEpochsSingleRun, kLossEpochs, kConvergenceMeasures, &new_candidate_weights,
&loss_history_not_used);
if (minimizer.converged()) {
// Update the mean and store the current solution.
mean_est_weights += new_candidate_weights;
est_weights_runs.push_back(new_candidate_weights);
num_converged++;
}
}
if (num_converged > 0) {
mean_est_weights /= num_converged;
} else {
mean_est_weights = est_candidate_weights;
}
  // Compute the sample standard deviations (standard errors) across the runs.
Weights sample_stds = Weights::Zero(num_candidates);
if (num_converged >= min_num_converged_for_standard_errors) {
for (auto& est_weight : est_weights_runs) {
sample_stds += (est_weight - mean_est_weights).array().pow(2).matrix();
}
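    // Divide by (num_converged - 1), i.e. Bessel's correction, to obtain the
    // unbiased sample variance before taking the element-wise square root.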
sample_stds = sample_stds / (num_converged - 1);
sample_stds = sample_stds.array().sqrt();
}
*est_candidate_errors = sample_stds;
*exact_est_candidate_weights = mean_est_weights;
}
} // namespace cobalt::rappor