| // Copyright 2018 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "algorithms/rappor/rappor_analyzer_test.h" |
| |
| namespace cobalt { |
| namespace rappor { |
| |
| // Comparison of Analyze and simple least squares. |
| // It invokes Analyze() in a few very simple cases, checks that the the |
| // algorithm converges and that the result vector has the correct size. For each |
| // case, it also computes the least squares solution using QR for exactly the |
| // same system and prints both solutions (note that the least squares solution |
| // is not always unique). |
| TEST_F(RapporAnalyzerTest, CompareAnalyzeToRegression) { |
| static const uint32_t kNumCandidates = 10; |
| static const uint32_t kNumCohorts = 2; |
| static const uint32_t kNumHashes = 2; |
| static const uint32_t kNumBloomBits = 8; |
| |
| std::vector<int> candidate_indices(100, 5); |
| std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0}; |
| CompareAnalyzeToSimpleRegression("p=0, q=1, only candidate 5", kNumCandidates, |
| kNumBloomBits, kNumCohorts, kNumHashes, |
| candidate_indices, true_candidate_counts); |
| |
| candidate_indices = std::vector<int>(20, 1); |
| candidate_indices.insert(candidate_indices.end(), 20, 4); |
| candidate_indices.insert(candidate_indices.end(), 60, 9); |
| true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60}; |
| CompareAnalyzeToSimpleRegression( |
| "p=0, q=1, several candidates", kNumCandidates, kNumBloomBits, |
| kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts); |
| |
| prob_0_becomes_1_ = 0.1; |
| prob_1_stays_1_ = 0.9; |
| |
| candidate_indices = std::vector<int>(100, 5); |
| true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0}; |
| CompareAnalyzeToSimpleRegression( |
| "p=0.1, q=0.9, only candidate 5", kNumCandidates, kNumBloomBits, |
| kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts); |
| |
| candidate_indices = std::vector<int>(20, 1); |
| candidate_indices.insert(candidate_indices.end(), 20, 4); |
| candidate_indices.insert(candidate_indices.end(), 60, 9); |
| true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60}; |
| CompareAnalyzeToSimpleRegression( |
| "p=0.1, q=0.9, several candidates", kNumCandidates, kNumBloomBits, |
| kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts); |
| } |
| |
| // Runs LongExperimentWithAnalyze; the true candidate counts are |
| // distributed according to the power law; we specify the number of |
| // observations and the exponent parameter of the power law. The ids are then |
| // shuffled so that it is not true that large ids are more frequent. |
| // Note: encoding observations is time consuming so large tests may take long. |
| TEST_F(RapporAnalyzerTest, PowerLawExperiment) { |
| static const uint32_t kNumCandidates = 20000; |
| static const uint32_t kNumCohorts = 128; |
| static const uint32_t kNumHashes = 2; |
| static const uint32_t kNumBloomBits = 128; |
| static const uint32_t kNumObservations = 1e+6; |
| static const bool print_estimates = false; |
| const double exponent = 30; |
| const int max_id = static_cast<const int>(kNumCandidates - 1); |
| |
| std::vector<int> candidate_indices(kNumObservations); |
| std::vector<int> true_candidate_counts(kNumCandidates, 0); |
| |
| // Create a "map" of shuffled ids to randomize the observed id values |
| std::vector<int> candidate_ids_list_shuffled(kNumCandidates); |
| std::iota(candidate_ids_list_shuffled.begin(), |
| candidate_ids_list_shuffled.end(), 0); |
| std::mt19937 g(random_dev_()); |
| std::shuffle(candidate_ids_list_shuffled.begin(), |
| candidate_ids_list_shuffled.end(), g); |
| |
| // Generate observations from the power law distribution on |
| // [0,kNumCandidates-1] |
| const double left = 0.0; |
| const double right = static_cast<const double>(max_id); |
| for (uint32_t i = 0; i < kNumObservations; i++) { |
| double random_power_law_number = |
| GenerateNumberFromPowerLaw(left, right, exponent); |
| int observed_candidate_id = static_cast<int>(random_power_law_number); |
| observed_candidate_id = std::min(observed_candidate_id, max_id); |
| observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id]; |
| candidate_indices[i] = observed_candidate_id; |
| true_candidate_counts[observed_candidate_id]++; |
| } |
| |
| prob_0_becomes_1_ = 0.05; |
| prob_1_stays_1_ = 0.95; |
| |
| LongExperimentWithAnalyze("p=0.05, q=0.95, power-law distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| |
| prob_0_becomes_1_ = 0.25; |
| prob_1_stays_1_ = 0.75; |
| |
| LongExperimentWithAnalyze("p=0.25, q=0.75, power-law distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| } |
| |
| // This is the same as PowerLawExperiment but the distribution of observations |
| // is exponential. |
| TEST_F(RapporAnalyzerTest, ExponentialExperiment) { |
| static const uint32_t kNumCandidates = 300; |
| static const uint32_t kNumCohorts = 2; |
| static const uint32_t kNumHashes = 2; |
| static const uint32_t kNumBloomBits = 128; |
| static const uint32_t kNumObservations = 1e+6; |
| static const bool print_estimates = true; |
| static const double lambda = 1.0; // the support of pdf for lambda == 1.0 ... |
| static const double approximate_max_generated_num = |
| 6.0; // ... is roughly [0, 6.0] |
| const int max_id = static_cast<const int>(kNumCandidates - 1); |
| |
| std::vector<int> candidate_indices(kNumObservations); |
| std::vector<int> true_candidate_counts(kNumCandidates, 0); |
| |
| // Create a "map" of shuffled ids to randomize the observed id values |
| std::vector<int> candidate_ids_list_shuffled = |
| GenerateRandomMapOfIds(kNumCandidates); |
| |
| // Generate observations from the exponential distribution on |
| // [0,kNumCandidates-1] |
| std::exponential_distribution<double> exp_distribution(lambda); |
| for (uint32_t i = 0; i < kNumObservations; i++) { |
| double random_exponential_number = exp_distribution(random_dev_); |
| random_exponential_number /= approximate_max_generated_num; |
| random_exponential_number *= static_cast<double>(kNumCandidates); |
| int observed_candidate_id = static_cast<int>(random_exponential_number); |
| observed_candidate_id = std::min(observed_candidate_id, max_id); |
| observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id]; |
| candidate_indices[i] = observed_candidate_id; |
| true_candidate_counts[observed_candidate_id]++; |
| } |
| |
| prob_0_becomes_1_ = 0.05; |
| prob_1_stays_1_ = 0.95; |
| |
| LongExperimentWithAnalyze("p=0.05, q=0.95, exponential distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| |
| prob_0_becomes_1_ = 0.25; |
| prob_1_stays_1_ = 0.75; |
| |
| LongExperimentWithAnalyze("p=0.25, q=0.75, exponential distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| } |
| |
| // This is the same as PowerLawExperiment but the distribution of observations |
| // comes from normal distribution. |
| TEST_F(RapporAnalyzerTest, NormalDistExperiment) { |
| static const uint32_t kNumCandidates = 100; |
| static const uint32_t kNumCohorts = 2; |
| static const uint32_t kNumHashes = 2; |
| static const uint32_t kNumBloomBits = 32; |
| static const uint32_t kNumObservations = 1e+5; |
| static const bool print_estimates = true; |
| const double mean = static_cast<const double>(kNumCandidates / 2); |
| // Most probablility weight is within +/- 3 standard deviations. |
| const double sd = static_cast<const double>(mean / 10); |
| const int max_id = static_cast<const int>(kNumCandidates - 1); |
| |
| std::vector<int> candidate_indices(kNumObservations); |
| std::vector<int> true_candidate_counts(kNumCandidates, 0); |
| |
| // Create a "map" of shuffled ids to randomize the observed id values. |
| std::vector<int> candidate_ids_list_shuffled = |
| GenerateRandomMapOfIds(kNumCandidates); |
| |
| // Generate observations from the normal distribution. |
| std::normal_distribution<double> nrm_distribution(mean, sd); |
| for (uint32_t i = 0; i < kNumObservations; i++) { |
| double random_normal_number = nrm_distribution(random_dev_); |
| int observed_candidate_id = static_cast<int>(random_normal_number); |
| observed_candidate_id = |
| std::max(0, std::min(observed_candidate_id, max_id)); |
| observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id]; |
| candidate_indices[i] = observed_candidate_id; |
| true_candidate_counts[observed_candidate_id]++; |
| } |
| |
| prob_0_becomes_1_ = 0.05; |
| prob_1_stays_1_ = 0.95; |
| |
| LongExperimentWithAnalyze("p=0.05, q=0.95, normal distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| |
| prob_0_becomes_1_ = 0.25; |
| prob_1_stays_1_ = 0.75; |
| |
| LongExperimentWithAnalyze("p=0.25, q=0.75, normal distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| } |
| |
| // This is the same as PowerLawExperiment but the observations come from a |
| // uniform distribution supported on some small set of candidates. |
| TEST_F(RapporAnalyzerTest, KOutOfNExperiment) { |
| // For this test to be meaningful we should have kNumObservedCandidates << |
| // kNumCandidates |
| static const uint32_t kNumCandidates = 2000; |
| static const uint32_t kNumObservedCandidates = 10; |
| static const uint32_t kNumCohorts = 50; |
| static const uint32_t kNumHashes = 2; |
| static const uint32_t kNumBloomBits = 32; |
| static const uint32_t kNumObservations = 1e+5; |
| static const bool print_estimates = true; |
| const int max_id = static_cast<const int>(kNumCandidates - 1); |
| |
| std::vector<int> candidate_indices(kNumObservations); |
| std::vector<int> true_candidate_counts(kNumCandidates, 0); |
| |
| // Create a "map" of shuffled ids to randomize the observed id values |
| std::vector<int> candidate_ids_list_shuffled = |
| GenerateRandomMapOfIds(kNumCandidates); |
| |
| // Generate observations |
| for (uint32_t i = 0; i < kNumObservations; i++) { |
| int observed_candidate_id = i % kNumObservedCandidates; |
| observed_candidate_id = std::min(observed_candidate_id, max_id); |
| observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id]; |
| candidate_indices[i] = observed_candidate_id; |
| true_candidate_counts[observed_candidate_id]++; |
| } |
| |
| prob_0_becomes_1_ = 0.05; |
| prob_1_stays_1_ = 0.95; |
| |
| LongExperimentWithAnalyze("p=0.05, q=0.95, k out of N distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| |
| prob_0_becomes_1_ = 0.25; |
| prob_1_stays_1_ = 0.75; |
| |
| LongExperimentWithAnalyze("p=0.25, q=0.75, k out of N distribution", |
| kNumCandidates, kNumBloomBits, kNumCohorts, |
| kNumHashes, candidate_indices, |
| true_candidate_counts, print_estimates); |
| } |
| |
| } // namespace rappor |
| } // namespace cobalt |