| // Copyright 2018 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
#include "src/algorithms/rappor/lasso_runner.h"

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

#include <glog/logging.h>
| |
| using cobalt_lossmin::GradientEvaluator; |
| using cobalt_lossmin::InstanceSet; |
| using cobalt_lossmin::LabelSet; |
| using cobalt_lossmin::ParallelBoostingWithMomentum; |
| using cobalt_lossmin::Weights; |
| |
| namespace cobalt::rappor { |
| |
namespace {
// ***************************************************************************
// Constants used by the cobalt_lossmin minimizers in both steps of RAPPOR
// (lasso_runner.RunFirstRapporStep and lasso_runner.GetExactValuesAndStdErrs).
// They are meant to be generic so modify them with caution.
//
// kZeroThreshold is the (relative) proportion below which we assume a
// candidate count to be effectively zero; that is, we assume a candidate to
// be zero if its estimated count is below kZeroThreshold of the
// bit_counter_.num_observations(). It cannot be exactly zero for
// performance reasons (also, exact zero makes little if any sense
// numerically). Also, smaller values might be more difficult for individual
// runs of the minimizer.
// You can change kZeroThreshold to the value you think is best (smaller
// than what you would deem negligible); however, do not get close to double
// precision accuracy (1e-16). A reasonable value could be between 1e-4 and
// 1e-8.
constexpr double kZeroThreshold = 1e-6;
// kL2toL1Ratio is the ratio between the l2 and l1 penalty (l2 is computed as
// kL2toL1Ratio times the l1 penalty).
// Although pure lasso (or linear regression) does not include any l2 penalty,
// a tiny bit can improve stability. A value of kL2toL1Ratio less or equal to
// 1e-2 should not affect the interpretation of the solution.
constexpr double kL2toL1Ratio = 1e-3;
// kLossEpochs denotes how often the current objective value is recorded in
// the solver (it will be recorded every kLossEpochs epochs).
// kConvergenceMeasures denotes how often the minimizer checks convergence:
// the algorithm will check convergence
// every kConvergenceMeasures epochs and terminate if at least
// one of the following is true.
// 1. The minimizer reached the solution up to the accuracy defined by
// minimizer.convergence_threshold(). In this case,
// minimizer.reached_solution() == true and minimizer.converged() == true.
// 2. The algorithm stopped improving as measured by the
// simple convergence check of the minimizer.
// In this case, minimizer.converged() == true.
// kLossEpochs and kConvergenceMeasures must be positive (and
// probably both not larger than 10). Also, kLossEpochs <=
// kConvergenceMeasures makes more sense.
constexpr int kLossEpochs = 5;
constexpr int kConvergenceMeasures = 5;
// kAlpha is a constant from parallel boosting with momentum paper,
// must be 0 < kAlpha < 1; cobalt_lossmin library default initial choice is 0.5,
// but we need to be able to reset this value in minimizer if needed.
constexpr double kAlpha = 0.5;
// kMinConvergenceThreshold is an absolute lower bound for the convergence
// thresholds in the algorithm; It should be something within a couple of
// orders of magnitude of the double precision (reasonable values may be
// between 1e-12 and 1e-14). This is introduced because convergence thresholds
// are computed relatively to the initial gradient norm and so they might be
// too low if the initial guess is very good.
constexpr double kMinConvergenceThreshold = 1e-12;
//
// ***************************************************************************
// Constants of the lasso_runner that will be used in RunFirstRapporStep.
//
// Note(bazyli) about modifying constants:
// 1. To improve the accuracy of the solution you may want to
// set the convergence thresholds: kRelativeConvergenceThreshold,
// kRelativeInLassoPathConvergenceThreshold, and kSimpleConvergenceThreshold
// to smaller values, at the expense of more computations (epochs) performed.
// See their descriptions below.
// 2. You may want to modify kMaxEpochs if it is too strict (which may be the
// case if you set very small convergence thresholds), but kNumLassoSteps and
// kL1maxToL1minRatio should be quite generic so modify them with caution.
// 3. You may test whether linear path (kUseLinearPath == true) or
// (kUseLinearPath == false) is better for your case. The run times and result
// quality may slightly differ (see below).
//
// Define minimizer and lasso-path-specific parameters.
// kRelativeConvergenceThreshold is the improvement that we expect to achieve
// at convergence, relative to initial gradient. That is, if ||g|| is the
// initial 2-norm of the gradient, we expect the final (i.e. after the
// last lasso problem) total measure of KKT violation to be equal to
// kRelativeConvergenceThreshold * ||g||. Value smaller than 1e-8 gives a full
// (single precision) convergence and is probably overshooting because the
// purpose of the first step is mostly to identify potential nonzero
// coefficients. Something between 1e-5 and 1e-7 should be enough.
// Even larger value can be chosen for performance reasons. This accuracy will
// be used only in the final lasso subproblem.
// kRelativeInLassoPathConvergenceThreshold has the same interpretation but
// will be used inside the lasso path (before the last subproblem). We
// probably want to set this to be slightly (e.g. ten times) less restrictive
// (larger) than kRelativeConvergenceThreshold for efficiency because the
// solutions inside the lasso path serve as a "warm-up" before the last
// subproblem; this subproblem, on the other hand, can be more accurate and
// should benefit more from "momentum" computations in the algorithm.
// kSimpleConvergenceThreshold == d means that the algorithm will stop if the
// best relative improvement between two consecutive measures of the objective
// in the last kConvergenceMeasures measures, is below d. In other words,
// relative improvements less than d will cause the minimizer to return.
// See the description of kConvergenceMeasures and
// kLossEpochs above. A rule-of-thumb value for kSimpleConvergenceThreshold
// can be some small fraction of the inverse of kMaxEpochs below.
//
// Note: The actual absolute values of convergence
// thresholds used in the algorithm are capped by
// loss_minimizer.min_convergence_threshold_ in case ||g|| is almost zero in the
// first place (see below). Note: If you are bumping into "The lasso path did
// not reach the last subproblem." in Analyze, it might mean that the
// convergence thresholds are too strict for the specified kMaxEpochs limit,
// given the difficulty of the problem.
constexpr double kRelativeConvergenceThreshold = 1e-5;
constexpr double kRelativeInLassoPathConvergenceThreshold = 1e-4;
constexpr double kSimpleConvergenceThreshold = 1e-5;
// kMaxEpochs denotes the limit on the total number of epochs (iterations)
// run; the actual number can be up to two times larger
// because every individual lasso subproblem (run of
// minimizer) has the same bound and the total epochs count is updated after
// the subproblem is run.
// Note: If you are bumping into "The lasso path did not reach the
// last subproblem." error in Analyze, it is possible that this number is too
// strict (low) for the convergence thresholds above.
constexpr int kMaxEpochs = 20000;
// kNumLassoSteps is the number of subproblems solved in the lasso path;
// it is not true that more steps will take more time: there should be a "sweet
// spot", and definitely this number should not be too small (probably something
// between 50 and 500).
constexpr int kNumLassoSteps = 100;
// kL1maxToL1minRatio is the ratio between the first and the last l1 penalty
// value in the lasso path; should probably be something between 1e-6 and
// 1e-3 (the R library glmnet uses 1e-3 by default).
constexpr double kL1maxToL1minRatio = 1e-3;
// kUseLinearPath specifies whether the lasso path should be linear (true) or
// logarithmic. Linear path means that the l1 penalties form an arithmetic
// sequence from largest to smallest; logarithmic path means that the penalties
// form a geometric series. Note(bazyli): linear path is more conservative in
// decreasing the penalties in the initial part of the lasso path. Since we are
// interested in the heavy hitters, we may want to prefer to add them slowly in
// this phase. Logarithmic path, on the other hand, decreases the penalty fast
// in the initial phase but more slowly towards the end of the lasso path. This
// is numerically more stable and may be faster but may introduce a lot of
// nonzero coefficients in the initial phase.
constexpr bool kUseLinearPath = true;
//
// ***************************************************************************
// Constants related to GetExactValuesAndStdErrs
//
// The convergence thresholds and
// kMaxEpochsSingleRun have the same definitions as the corresponding
// constants in RunFirstRapporStep. However, we may want the convergence
// thresholds to be more strict, because here we are interested in more exact
// values, and the problem to solve should be easier.
//
// Initialize the minimizer constants.
// Relative convergence thresholds have the same interpretation as the ones
// used in RunFirstRapporStep. Consult their description if you plan to modify
// the values.
constexpr double kRelativeConvergenceThreshold2Step = 1e-6;
constexpr double kSimpleConvergenceThreshold2Step = 1e-6;
// kNumRuns is the number of runs to estimate standard deviations of
// coefficients. The problem will be solved kNumRuns times, each time
// with a slightly different right hand side.
// This number probably should not be smaller than 5 but not too large:
// large value should give a better approximation of standard errors but
// kNumRuns == n means that the whole problem will be solved n times so it
// affects the runtime.
constexpr int kNumRuns = 20;
// kMaxEpochsSingleRun is the maximum number of epochs in a single
// run of the algorithm. So total number of epochs run will be bounded by
// kNumRuns * kMaxEpochsSingleRun.
constexpr int kMaxEpochsSingleRun = 5000;
// Note(bazyli): If convergence thresholds are too small relative to
// kMaxEpochsSingleRun then some of the runs of the algorithm may not
// converge. If less than 5 converge then the standard errors are set to 0. If
// none converges, then the exact_est_candidate_weights is set to be the same
// as est_candidate_weights. This is a safeguard solution and if it happens,
// we may want to adjust the constants so that it doesn't happen again (e.g.
// increase kMaxEpochsSingleRun).
} // namespace
| |
| LassoRunner::LassoRunner(const InstanceSet* matrix) : matrix_(matrix) { |
| zero_threshold_ = kZeroThreshold; |
| l2_to_l1_ratio_ = kL2toL1Ratio; |
| alpha_ = kAlpha; |
| min_convergence_threshold_ = kMinConvergenceThreshold; |
| num_lasso_steps_ = kNumLassoSteps; |
| l1_max_to_l1_min_ratio_ = kL1maxToL1minRatio; |
| use_linear_path_ = kUseLinearPath; |
| } |
| |
// First step of RAPPOR: runs a lasso path on the candidate matrix to identify
// the candidates (columns) with nonzero coefficients.
//
// The l1 penalty starts at the largest absolute entry of the initial gradient
// and decreases over up to kNumLassoSteps subproblems, either linearly or
// geometrically (see kUseLinearPath). The path switches to a final, more
// accurate subproblem once |max_nonzero_coeffs| nonzero coefficients are
// found, the solution's 1-norm reaches |max_solution_1_norm|, or the last
// step is reached; the loop also stops when the total epoch budget
// (kMaxEpochs) is exhausted.
//
// |est_candidate_weights| is used as the initial guess and holds the final
// solution on return. |second_step_cols| is overwritten with the indices of
// coefficients above zero_threshold_ (the columns for the second RAPPOR
// step). Summary statistics of the run are recorded in minimizer_data_.
void LassoRunner::RunFirstRapporStep(const int max_nonzero_coeffs, const double max_solution_1_norm,
                                     const LabelSet& as_label_set, Weights* est_candidate_weights,
                                     std::vector<int>* second_step_cols) {
  // Construct the lossmin minimizer objects.
  // TODO(bazyli): remove this copy
  InstanceSet candidate_matrix = *matrix_;
  GradientEvaluator grad_eval(candidate_matrix, as_label_set);
  ParallelBoostingWithMomentum minimizer(0.0, 0.0, grad_eval);  // penalties will be set later.

  // Compute the initial gradient at the initial guess (this will be used to
  // get the initial l1 penalty and convergence thresholds). Note(review): the
  // solution vector is not zeroed here; callers are expected to pass
  // est_candidate_weights == 0 for a fresh lasso path.
  const int num_candidates = candidate_matrix.cols();
  Weights initial_gradient = Weights::Zero(num_candidates);
  grad_eval.SparseGradient(*est_candidate_weights, &initial_gradient);

  // Set the minimizer absolute convergence constants. Both thresholds are
  // relative to the mean initial gradient norm but are never allowed below
  // min_convergence_threshold_ (see kMinConvergenceThreshold above).
  const double initial_mean_gradient_norm = initial_gradient.norm() / num_candidates;
  const double kConvergenceThreshold = std::max(
      min_convergence_threshold_, kRelativeConvergenceThreshold * initial_mean_gradient_norm);
  const double kInLassoPathConvergenceThreshold =
      std::max(min_convergence_threshold_,
               kRelativeInLassoPathConvergenceThreshold * initial_mean_gradient_norm);

  // Set the constants for lasso path computations: l1 ranges from l1max down
  // to l1min, stepping by l1delta (linear path) or by factor l1mult
  // (logarithmic path).
  const double l1max = initial_gradient.array().abs().maxCoeff();
  const double l1min = l1_max_to_l1_min_ratio_ * l1max;
  const double l2 = l2_to_l1_ratio_ * l1min;
  const double l1delta = (l1max - l1min) / num_lasso_steps_;
  const double l1mult = std::exp(std::log(l1_max_to_l1_min_ratio_) / num_lasso_steps_);

  // Set up the minimizer.
  minimizer.set_zero_threshold(zero_threshold_);
  minimizer.set_convergence_threshold(kInLassoPathConvergenceThreshold);
  minimizer.set_simple_convergence_threshold(kSimpleConvergenceThreshold);
  minimizer.set_l2(l2);
  minimizer.compute_and_set_learning_rates();  // learning rates must be
                                               // re-computed when l2 penalty
                                               // changes.
  VLOG(4) << "Lasso in-path convergence threshold ==" << kInLassoPathConvergenceThreshold;
  VLOG(4) << "Lasso final convergence threshold ==" << kConvergenceThreshold;

  // Initialize variables to track the lasso path computations.
  std::vector<double> loss_history;
  double solution_1_norm = 0;
  int total_epochs_run = 0;
  int how_many_nonzero_coeffs = 0;

  // Perform lasso path computations.
  int i = 0;
  double l1_this_step = use_linear_path_ ? l1max - l1delta : l1max * l1mult;

  for (; i < num_lasso_steps_ && total_epochs_run < kMaxEpochs; i++) {
    VLOG(4) << "Minimizing " << i << "-th Lasso subproblem";

    if (how_many_nonzero_coeffs >= max_nonzero_coeffs || i == num_lasso_steps_ - 1 ||
        solution_1_norm >= max_solution_1_norm) {
      // Enter the final lasso subproblem (solved with the stricter
      // kConvergenceThreshold).
      minimizer.set_convergence_threshold(kConvergenceThreshold);
      if (i < num_lasso_steps_ - 1) {
        // If stopping criteria are met before reaching maximum number of steps,
        // use l1 from previous run (undo the last penalty decrease).
        l1_this_step = use_linear_path_ ? l1_this_step + l1delta : l1_this_step / l1mult;
        i = num_lasso_steps_ - 1;
      }
      VLOG(4) << "Entered last Run";
    }

    minimizer.set_l1(std::max(l1min, l1_this_step));
    minimizer.set_reached_solution(false);
    minimizer.set_converged(false);

    // Set minimizer input as in the parallel boosting with momentum paper.
    minimizer.set_phi_center(*est_candidate_weights);
    minimizer.set_alpha(alpha_);
    minimizer.set_beta(1 - alpha_);

    VLOG(4) << "The l1 penalty used == " << l1_this_step;

    minimizer.Run(kMaxEpochs, kLossEpochs, kConvergenceMeasures, est_candidate_weights,
                  &loss_history);

    // Compute the 1-norm of the current solution and the number of nonzero
    // coefficients (coefficients strictly above zero_threshold_).
    solution_1_norm = est_candidate_weights->lpNorm<1>();
    how_many_nonzero_coeffs = (est_candidate_weights->array() > zero_threshold_).count();

    VLOG(4) << "Ran " << minimizer.num_epochs_run() << " epochs in this step.";
    VLOG(4) << "Num of nonzero coefficients found: " << how_many_nonzero_coeffs;
    VLOG(4) << "Solution 1-norm == " << solution_1_norm;
    total_epochs_run += minimizer.num_epochs_run();

    l1_this_step = use_linear_path_ ? l1_this_step - l1delta : l1_this_step * l1mult;
  }

  VLOG(4) << "Ran " << total_epochs_run << " epochs in total.";

  // Record minimizer info. The last subproblem was reached iff the loop was
  // not cut short by the kMaxEpochs budget.
  minimizer_data_.reached_last_lasso_subproblem = (i == num_lasso_steps_);
  minimizer_data_.num_epochs_run = total_epochs_run;
  minimizer_data_.converged = minimizer.converged();
  minimizer_data_.reached_solution = minimizer.reached_solution();
  minimizer_data_.l1 = minimizer.l1();
  minimizer_data_.l2 = minimizer.l2();
  minimizer_data_.convergence_threshold = kConvergenceThreshold;
  minimizer_data_.zero_threshold = kZeroThreshold;

  // Prepare the vector of columns for the second step of RAPPOR.
  // Note: this loop index intentionally shadows the lasso-path counter above.
  second_step_cols->clear();
  second_step_cols->reserve(how_many_nonzero_coeffs);
  for (int i = 0; i < est_candidate_weights->size(); i++) {
    if (est_candidate_weights->coeff(i) > zero_threshold_) {
      second_step_cols->push_back(i);
    }
  }
}
| |
| void LassoRunner::GetExactValuesAndStdErrs(const double l1, const Weights& est_candidate_weights, |
| const std::vector<double>& est_standard_errs, |
| const InstanceSet& instances, |
| const LabelSet& as_label_set, |
| Weights* exact_est_candidate_weights, |
| Weights* est_candidate_errors) { |
| // Note(bazyli): If convergence thresholds are too small relative to |
| // kMaxEpochsSingleRun then some of the runs of the algorithm may not |
| // converge. If less than 5 converge then the standard errors are set to 0. If |
| // none converges, then the exact_est_candidate_weights is set to be the same |
| // as est_candidate_weights. This is a safeguard solution and if it happens, |
| // we may want to adjust the constants so that it doesn't happen again (e.g. |
| // increase kMaxEpochsSingleRun). |
| const double l2 = l2_to_l1_ratio_ * l1; |
| const int num_candidates = est_candidate_weights.size(); |
| const int num_labels = as_label_set.size(); |
| const int min_num_converged_for_standard_errors = 5; |
| int num_converged = 0; // We record the number of runs in which the minimizer |
| // actually converged (although if everything is fine, |
| // i.e. all constants have appropriate values for the given case, this |
| // should be equal to num_runs at completion). |
| |
| // We will need the solutions from all runs to compute the mean solution and |
| // standard errors. |
| std::vector<Weights> est_weights_runs; // vector for storing solutions from different runs |
| Weights mean_est_weights = Weights::Zero(num_candidates); |
| |
| // In each run create a new_label_set by adding Gaussian noise to the original |
| // as_label_set. |
| for (int i = 0; i < kNumRuns; i++) { |
| LabelSet new_label_set = as_label_set; |
| |
| for (int j = 0; j < num_labels; j++) { |
| std::normal_distribution<double> nrm_distr(0, static_cast<double>(est_standard_errs[j])); |
| double noise = nrm_distr(random_dev_); |
| new_label_set(j) += noise; |
| } |
| |
| // We will run the minimizer for the right hand side with noise |
| // (new_label_set). We always use the est_candidate_weights as the initial |
| // guess. |
| |
| // Construct the minimizer and compute initial gradient. |
| Weights new_candidate_weights = est_candidate_weights; |
| std::vector<double> loss_history_not_used; |
| GradientEvaluator grad_eval(instances, new_label_set); |
| ParallelBoostingWithMomentum minimizer(l1, l2, grad_eval); |
| Weights initial_gradient = Weights(num_candidates); |
| grad_eval.SparseGradient(new_candidate_weights, &initial_gradient); |
| double initial_mean_gradient_norm = initial_gradient.norm() / num_candidates; |
| double convergence_threshold = |
| std::max(min_convergence_threshold_, |
| kRelativeConvergenceThreshold2Step * initial_mean_gradient_norm); |
| |
| // Set up and run the minimizer. |
| minimizer.set_converged(false); |
| minimizer.set_reached_solution(false); |
| minimizer.set_phi_center(new_candidate_weights); |
| minimizer.set_convergence_threshold(convergence_threshold); |
| minimizer.set_zero_threshold(zero_threshold_); |
| minimizer.set_simple_convergence_threshold(kSimpleConvergenceThreshold2Step); |
| minimizer.set_alpha(alpha_); |
| minimizer.set_beta(1 - alpha_); |
| minimizer.Run(kMaxEpochsSingleRun, kLossEpochs, kConvergenceMeasures, &new_candidate_weights, |
| &loss_history_not_used); |
| |
| if (minimizer.converged()) { |
| // Update the mean and store the current solution. |
| mean_est_weights += new_candidate_weights; |
| est_weights_runs.push_back(new_candidate_weights); |
| num_converged++; |
| } |
| } |
| |
| if (num_converged > 0) { |
| mean_est_weights /= num_converged; |
| } else { |
| mean_est_weights = est_candidate_weights; |
| } |
| |
| // Compute the sample means and standard deviations (standard errors). |
| Weights sample_stds = Weights::Zero(num_candidates); |
| if (num_converged >= min_num_converged_for_standard_errors) { |
| for (auto& est_weight : est_weights_runs) { |
| sample_stds += (est_weight - mean_est_weights).array().pow(2).matrix(); |
| } |
| sample_stds = sample_stds / (num_converged - 1); |
| sample_stds = sample_stds.array().sqrt(); |
| } |
| *est_candidate_errors = sample_stds; |
| *exact_est_candidate_weights = mean_est_weights; |
| } |
| |
| } // namespace cobalt::rappor |