// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "algorithms/rappor/rappor_analyzer_test.h"
namespace cobalt {
namespace rappor {
// Compares Analyze() to simple least squares. It invokes Analyze() in a few
// very simple cases, checks that the algorithm converges, and checks that
// the result vector has the correct size. For each case, it also computes
// the least squares solution using QR for exactly the same system and prints
// both solutions (note that the least squares solution is not always unique;
// an illustrative QR solve is sketched after this test).
TEST_F(RapporAnalyzerTest, CompareAnalyzeToRegression) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 2;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
std::vector<int> candidate_indices(100, 5);
std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
CompareAnalyzeToSimpleRegression("p=0, q=1, only candidate 5", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
CompareAnalyzeToSimpleRegression(
"p=0, q=1, several candidates", kNumCandidates, kNumBloomBits,
kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts);
prob_0_becomes_1_ = 0.1;
prob_1_stays_1_ = 0.9;
candidate_indices = std::vector<int>(100, 5);
true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
CompareAnalyzeToSimpleRegression(
"p=0.1, q=0.9, only candidate 5", kNumCandidates, kNumBloomBits,
kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
CompareAnalyzeToSimpleRegression(
"p=0.1, q=0.9, several candidates", kNumCandidates, kNumBloomBits,
kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts);
}
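// The "least squares using QR" baseline mentioned above can be written in a
// few lines with Eigen (which the analyzer itself builds on). This is a
// minimal illustrative sketch, not the helper actually used by
// CompareAnalyzeToSimpleRegression(); the name SolveLeastSquaresQr is
// hypothetical. Column-pivoted Householder QR yields a minimizer of
// ||a*x - b||_2 even when a is rank-deficient, in which case the minimizer
// is not unique (hence the caveat above about uniqueness).
inline Eigen::VectorXd SolveLeastSquaresQr(const Eigen::MatrixXd& a,
                                           const Eigen::VectorXd& b) {
  return a.colPivHouseholderQr().solve(b);
}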
// Runs LongExperimentWithAnalyze; the true candidate counts are
// distributed according to a power law; we specify the number of
// observations and the exponent parameter of the power law. The ids are then
// shuffled so that larger ids are not systematically more frequent. (An
// illustrative power-law sampler is sketched after this test.)
// Note: encoding observations is time-consuming, so large tests may take a
// long time.
TEST_F(RapporAnalyzerTest, PowerLawExperiment) {
static const uint32_t kNumCandidates = 20000;
static const uint32_t kNumCohorts = 128;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 128;
static const uint32_t kNumObservations = 1e+6;
static const bool print_estimates = false;
const double exponent = 30;
const int max_id = static_cast<int>(kNumCandidates - 1);
std::vector<int> candidate_indices(kNumObservations);
std::vector<int> true_candidate_counts(kNumCandidates, 0);
// Create a "map" of shuffled ids to randomize the observed id values
std::vector<int> candidate_ids_list_shuffled(kNumCandidates);
std::iota(candidate_ids_list_shuffled.begin(),
candidate_ids_list_shuffled.end(), 0);
std::mt19937 g(random_dev_());
std::shuffle(candidate_ids_list_shuffled.begin(),
candidate_ids_list_shuffled.end(), g);
// Generate observations from the power law distribution on
// [0, kNumCandidates-1].
const double left = 0.0;
const double right = static_cast<double>(max_id);
for (uint32_t i = 0; i < kNumObservations; i++) {
double random_power_law_number =
GenerateNumberFromPowerLaw(left, right, exponent);
int observed_candidate_id = static_cast<int>(random_power_law_number);
observed_candidate_id = std::min(observed_candidate_id, max_id);
observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
candidate_indices[i] = observed_candidate_id;
true_candidate_counts[observed_candidate_id]++;
}
prob_0_becomes_1_ = 0.05;
prob_1_stays_1_ = 0.95;
LongExperimentWithAnalyze("p=0.05, q=0.95, power-law distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
prob_0_becomes_1_ = 0.25;
prob_1_stays_1_ = 0.75;
LongExperimentWithAnalyze("p=0.25, q=0.75, power-law distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
}
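// GenerateNumberFromPowerLaw() is defined by the test fixture. One plausible
// implementation, sketched purely for illustration under the assumption that
// the density is proportional to x^exponent on [left, right] (requires
// exponent > -1), is inverse-CDF sampling: draw u ~ Uniform(0, 1) and invert
// F(x) = (x^(e+1) - left^(e+1)) / (right^(e+1) - left^(e+1)), e = exponent.
// With exponent = 30 almost all mass lands near `right`, which is why the
// ids are shuffled above. SamplePowerLaw is a hypothetical name.
inline double SamplePowerLaw(double left, double right, double exponent,
                             std::mt19937* gen) {
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  const double e1 = exponent + 1.0;
  const double lo = std::pow(left, e1);  // CDF normalization endpoints.
  const double hi = std::pow(right, e1);
  return std::pow(lo + uniform(*gen) * (hi - lo), 1.0 / e1);
}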
// This is the same as PowerLawExperiment, but the distribution of
// observations is exponential.
TEST_F(RapporAnalyzerTest, ExponentialExperiment) {
static const uint32_t kNumCandidates = 300;
static const uint32_t kNumCohorts = 2;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 128;
static const uint32_t kNumObservations = 1e+6;
static const bool print_estimates = true;
// For lambda == 1.0 nearly all of the probability mass of the exponential
// pdf lies in [0, 6.0]: P(X > 6) = e^-6, or about 0.25%.
static const double lambda = 1.0;
static const double approximate_max_generated_num = 6.0;
const int max_id = static_cast<int>(kNumCandidates - 1);
std::vector<int> candidate_indices(kNumObservations);
std::vector<int> true_candidate_counts(kNumCandidates, 0);
// Create a "map" of shuffled ids to randomize the observed id values
std::vector<int> candidate_ids_list_shuffled =
GenerateRandomMapOfIds(kNumCandidates);
// Generate observations from the exponential distribution on
// [0, kNumCandidates-1].
std::exponential_distribution<double> exp_distribution(lambda);
for (uint32_t i = 0; i < kNumObservations; i++) {
double random_exponential_number = exp_distribution(random_dev_);
random_exponential_number /= approximate_max_generated_num;
random_exponential_number *= static_cast<double>(kNumCandidates);
int observed_candidate_id = static_cast<int>(random_exponential_number);
observed_candidate_id = std::min(observed_candidate_id, max_id);
observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
candidate_indices[i] = observed_candidate_id;
true_candidate_counts[observed_candidate_id]++;
}
prob_0_becomes_1_ = 0.05;
prob_1_stays_1_ = 0.95;
LongExperimentWithAnalyze("p=0.05, q=0.95, exponential distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
prob_0_becomes_1_ = 0.25;
prob_1_stays_1_ = 0.75;
LongExperimentWithAnalyze("p=0.25, q=0.75, exponential distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
}
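// GenerateRandomMapOfIds() also comes from the test fixture; judging from
// the inline shuffling code in PowerLawExperiment above, it presumably
// amounts to iota-then-shuffle. A sketch (the seeding is an assumption):
inline std::vector<int> RandomMapOfIdsSketch(uint32_t num_candidates) {
  std::vector<int> ids(num_candidates);
  std::iota(ids.begin(), ids.end(), 0);  // Identity map 0..num_candidates-1.
  std::random_device rd;
  std::mt19937 gen(rd());
  std::shuffle(ids.begin(), ids.end(), gen);  // Random permutation of ids.
  return ids;
}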
// This is the same as PowerLawExperiment, but the observations come from a
// normal distribution.
TEST_F(RapporAnalyzerTest, NormalDistExperiment) {
static const uint32_t kNumCandidates = 100;
static const uint32_t kNumCohorts = 2;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 32;
static const uint32_t kNumObservations = 1e+5;
static const bool print_estimates = true;
const double mean = static_cast<double>(kNumCandidates / 2);
// Most probability weight lies within +/- 3 standard deviations, so with
// sd = mean / 10 almost no samples are clamped to the ends of [0, max_id].
const double sd = mean / 10;
const int max_id = static_cast<int>(kNumCandidates - 1);
std::vector<int> candidate_indices(kNumObservations);
std::vector<int> true_candidate_counts(kNumCandidates, 0);
// Create a "map" of shuffled ids to randomize the observed id values.
std::vector<int> candidate_ids_list_shuffled =
GenerateRandomMapOfIds(kNumCandidates);
// Generate observations from the normal distribution.
std::normal_distribution<double> nrm_distribution(mean, sd);
for (uint32_t i = 0; i < kNumObservations; i++) {
double random_normal_number = nrm_distribution(random_dev_);
int observed_candidate_id = static_cast<int>(random_normal_number);
observed_candidate_id =
std::max(0, std::min(observed_candidate_id, max_id));
observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
candidate_indices[i] = observed_candidate_id;
true_candidate_counts[observed_candidate_id]++;
}
prob_0_becomes_1_ = 0.05;
prob_1_stays_1_ = 0.95;
LongExperimentWithAnalyze("p=0.05, q=0.95, normal distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
prob_0_becomes_1_ = 0.25;
prob_1_stays_1_ = 0.75;
LongExperimentWithAnalyze("p=0.25, q=0.75, normal distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
}
// This is the same as PowerLawExperiment, but the observations come from a
// uniform distribution supported on a small set of candidates.
TEST_F(RapporAnalyzerTest, KOutOfNExperiment) {
// For this test to be meaningful, we should have
// kNumObservedCandidates << kNumCandidates.
static const uint32_t kNumCandidates = 2000;
static const uint32_t kNumObservedCandidates = 10;
static const uint32_t kNumCohorts = 50;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 32;
static const uint32_t kNumObservations = 1e+5;
static const bool print_estimates = true;
const int max_id = static_cast<int>(kNumCandidates - 1);
std::vector<int> candidate_indices(kNumObservations);
std::vector<int> true_candidate_counts(kNumCandidates, 0);
// Create a "map" of shuffled ids to randomize the observed id values
std::vector<int> candidate_ids_list_shuffled =
GenerateRandomMapOfIds(kNumCandidates);
// Generate observations uniformly over the first kNumObservedCandidates ids.
for (uint32_t i = 0; i < kNumObservations; i++) {
int observed_candidate_id = i % kNumObservedCandidates;
observed_candidate_id = std::min(observed_candidate_id, max_id);
observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
candidate_indices[i] = observed_candidate_id;
true_candidate_counts[observed_candidate_id]++;
}
prob_0_becomes_1_ = 0.05;
prob_1_stays_1_ = 0.95;
LongExperimentWithAnalyze("p=0.05, q=0.95, k out of N distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
prob_0_becomes_1_ = 0.25;
prob_1_stays_1_ = 0.75;
LongExperimentWithAnalyze("p=0.25, q=0.75, k out of N distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices,
true_candidate_counts, print_estimates);
}
} // namespace rappor
} // namespace cobalt