| // Copyright 2018 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef COBALT_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_TEST_H_ |
| #define COBALT_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_TEST_H_ |
| |
| #include "algorithms/rappor/rappor_analyzer.h" |
| |
| #include <gflags/gflags.h> |
| #include <glog/logging.h> |
| |
| #include <algorithm> |
| #include <chrono> |
| #include <memory> |
| #include <random> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "algorithms/rappor/rappor_encoder.h" |
| #include "algorithms/rappor/rappor_test_utils.h" |
| #include "encoder/client_secret.h" |
| #include "third_party/eigen/Eigen/SparseQR" |
| #include "third_party/googletest/googletest/include/gtest/gtest.h" |
| |
| namespace cobalt { |
| namespace rappor { |
| |
| class RapporAnalyzerTest : public ::testing::Test { |
| protected: |
| // Sets the member variable analyzer_ to a new RapporAnalyzer configured |
| // with the given arguments and the current values of prob_0_becomes_1_, |
| // prob_1_stays_1_. |
| void SetAnalyzer(uint32_t num_candidates, uint32_t num_bloom_bits, |
| uint32_t num_cohorts, uint32_t num_hashes); |
| |
| void BuildCandidateMap(); |
| |
| // This should be invoked after BuildCandidateMap. It returns the bit index |
| // within the CandidateMap for the given |candidate_index|, |cohort_index|, |
| // and |hash_index|. |
| uint16_t GetCandidateMapValue(uint16_t candidate_index, uint16_t cohort_index, |
| uint16_t hash_index); |
| |
| // Builds and returns a bit string (i.e. a string of ASCII '0's and '1's) |
| // representing the Bloom filter implicitly stored within the CandidateMap |
| // for the given |candidate_index| and |cohort_index|. |
| std::string BuildBitString(uint16_t candidate_index, uint16_t cohort_index); |
| |
| const Eigen::SparseMatrix<double, Eigen::RowMajor>& candidate_matrix() { |
| return analyzer_->candidate_matrix_; |
| } |
| |
| void AddObservation(uint32_t cohort, std::string binary_string); |
| |
| void ExtractEstimatedBitCountRatios(Eigen::VectorXd* est_bit_count_ratios); |
| |
| void ExtractEstimatedBitCountRatiosAndStdErrors( |
| Eigen::VectorXd* est_bit_count_ratios, |
| std::vector<double>* est_std_errors); |
| |
| void AddObservationsForCandidates(const std::vector<int>& candidate_indices); |
| |
| // Generate a random number from power law distribution on the interval |
| // [|left|,|right|] with given |exponent|. |
| int GenerateNumberFromPowerLaw(const double left, const double right, |
| const double exponent); |
| |
| // Generate a "map" of shuffled Ids, that is, a vector of size |
| // |num_candidates| containing exactly the numbers 0,1,...,num_canidates - 1, |
| // in a random order. |
| std::vector<int> GenerateRandomMapOfIds(const int num_candidates); |
| |
| std::vector<int> CountsEstimatesFromResults( |
| const std::vector<CandidateResult>& results); |
| |
| Eigen::VectorXd VectorFromCounts( |
| const std::vector<int>& exact_candidate_counts); |
| |
| // Checks how well the |exact_candidate_counts| reproduces the right hand side |
| // of the equation solved by Analyze(). The function prints the value of |
| // d == || A * x_s - b || / || b || where x_s == |exact_candidate_counts| / |
| // num_observations, b is the vector of the estimated bit count ratios from |
| // ExactEstimatedBitCountRatios(), and A == analyzer_->candidate_matrix_ (it |
| // must be valid, the function does not build A). |
| // The purpose of this function is to assess how much information is lost by |
| // both encoding and the assumption that cohorts have equal ratios. d == 0 |
| // means that exact_candidate_count is among the solutions to the problem |
| // being solved, so values much larger than 0 suggest that getting the right |
| // counts in the Analyzer is difficult (if exact_candidate_counts is our a |
| // priori knowledge of the solution). Note: d does not account for |
| // non-uniqeness of the solution of A * x_s = b, just checks how well the |
| // given vector solves it. |
| void CheckExactSolution(const std::vector<int>& exact_candidate_counts); |
| |
| void PrintTrueCountsAndEstimates( |
| const std::string& case_label, uint32_t num_candidates, |
| const std::vector<CandidateResult>& results, |
| const std::vector<int>& true_candidate_counts); |
| |
| // Assess utility of |results|. The informal measure suggested by mironov is: |
| // "Largest n such that at most 10% of the n highest hitters are identified as |
| // such incorrectly (are false positives)". Obviously, this makes sense only |
| // for n in some range (it may unjusty suggest that the results are bad for n |
| // too small, or for n too large, that they are good when in fact they are |
| // not). Also, 10% is arbitrary. So instead, we just compute the false |
| // positive rates for n in some grid set. We also print the total number of |
| // nonzero estimates identified. |
| void AssessUtility(const std::vector<CandidateResult>& results, |
| const std::vector<int>& true_candidate_counts); |
| |
| // Computes the least squares fit on the candidate matrix using QR, |
| // for the given rhs in |est_bit_count_ratios| and saves it to |results| |
| grpc::Status ComputeLeastSquaresFitQR( |
| const Eigen::VectorXd& est_bit_count_ratios, |
| std::vector<CandidateResult>* results); |
| |
| // Runs a simple least squares problem for Ax = b on the candidate matrix |
| // using QR algorithm from eigen library; this is to see the results |
| // without penalty terms (note: in an overdetermined system the solution |
| // is not unique so this is more a helper testing function to cross-check |
| // the behavior of regression without penalty) |
| void RunSimpleLinearRegressionReference( |
| const std::string& case_label, uint32_t num_candidates, |
| uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes, |
| std::vector<int> candidate_indices, |
| std::vector<int> true_candidate_counts); |
| |
| // Invokes the Analyze() method using the given parameters. Checks that |
| // the algorithms converges and that the result vector has the correct length. |
| void ShortExperimentWithAnalyze(const std::string& case_label, |
| uint32_t num_candidates, |
| uint32_t num_bloom_bits, uint32_t num_cohorts, |
| uint32_t num_hashes, |
| std::vector<int> candidate_indices, |
| std::vector<int> true_candidate_counts, |
| const bool print_estimates); |
| |
| // Same as ShortExperimentWithAnalyze() but also measures the time taken by |
| // Analyze(), calls CheckExactSolution() to assess the loss of information, |
| // and calls AssessUtility() to compare the results with true counts. |
| void LongExperimentWithAnalyze(const std::string& case_label, |
| uint32_t num_candidates, |
| uint32_t num_bloom_bits, uint32_t num_cohorts, |
| uint32_t num_hashes, |
| std::vector<int> candidate_indices, |
| std::vector<int> true_candidate_counts, |
| const bool print_estimates); |
| |
| // Similar to ShortExperimentWithAnalyze except it also |
| // computes the estimates for both Analyze and simple regression using QR, |
| // which is computed on exactly the same system. |
| void CompareAnalyzeToSimpleRegression(const std::string& case_label, |
| uint32_t num_candidates, |
| uint32_t num_bloom_bits, |
| uint32_t num_cohorts, |
| uint32_t num_hashes, |
| std::vector<int> candidate_indices, |
| std::vector<int> true_candidate_counts); |
| |
| RapporConfig config_; |
| std::unique_ptr<RapporAnalyzer> analyzer_; |
| |
| RapporCandidateList candidate_list_; |
| |
| // By default this test uses p=0, q=1. Individual tests may override this. |
| double prob_0_becomes_1_ = 0.0; |
| double prob_1_stays_1_ = 1.0; |
| |
| // Random device |
| std::random_device random_dev_; |
| }; |
| |
| } // namespace rappor |
| } // namespace cobalt |
| |
| #endif // COBALT_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_TEST_H_ |