blob: add20d45fb87a20956b867ccb562c3b290efe261 [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "algorithms/rappor/rappor_analyzer_test.h"
namespace cobalt {
namespace rappor {
using encoder::ClientSecret;
void RapporAnalyzerTest::SetAnalyzer(uint32_t num_candidates,
uint32_t num_bloom_bits,
uint32_t num_cohorts,
uint32_t num_hashes) {
PopulateRapporCandidateList(num_candidates, &candidate_list_);
config_ = Config(num_bloom_bits, num_cohorts, num_hashes, prob_0_becomes_1_,
prob_1_stays_1_);
analyzer_.reset(new RapporAnalyzer(config_, &candidate_list_));
}
void RapporAnalyzerTest::BuildCandidateMap() {
EXPECT_EQ(grpc::OK, analyzer_->BuildCandidateMap().error_code());
const uint32_t num_candidates =
analyzer_->candidate_map_.candidate_list->candidates_size();
const uint32_t num_cohorts = analyzer_->config_->num_cohorts();
const uint32_t num_hashes = analyzer_->config_->num_hashes();
const uint32_t num_bits = analyzer_->config_->num_bits();
// Expect the number of candidates to be correct,
EXPECT_EQ(num_candidates,
analyzer_->candidate_map_.candidate_cohort_maps.size());
// and for each candidate...
for (size_t candidate = 0; candidate < num_candidates; candidate++) {
// expect the number of cohorts to be correct,
EXPECT_EQ(num_cohorts,
analyzer_->candidate_map_.candidate_cohort_maps[candidate]
.cohort_hashes.size());
// and for each cohort...
for (size_t cohort = 0; cohort < num_cohorts; cohort++) {
// expect the number of hashes to be correct,
EXPECT_EQ(num_hashes,
analyzer_->candidate_map_.candidate_cohort_maps[candidate]
.cohort_hashes[cohort]
.bit_indices.size());
// and for each hash...
for (size_t hash = 0; hash < num_hashes; hash++) {
// Expect the bit index to be in the range [0, num_bits).
auto bit_index = GetCandidateMapValue(candidate, cohort, hash);
EXPECT_GE(bit_index, 0u);
EXPECT_LT(bit_index, num_bits);
}
}
}
// Validate the associated sparse matrix.
EXPECT_EQ(num_candidates, candidate_matrix().cols());
EXPECT_EQ(num_cohorts * num_bits, candidate_matrix().rows());
EXPECT_LE(num_candidates * num_cohorts, candidate_matrix().nonZeros());
EXPECT_GE(num_candidates * num_cohorts * num_hashes,
candidate_matrix().nonZeros());
}
uint16_t RapporAnalyzerTest::GetCandidateMapValue(uint16_t candidate_index,
uint16_t cohort_index,
uint16_t hash_index) {
EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps.size(),
candidate_index);
EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes.size(),
cohort_index);
EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes[cohort_index]
.bit_indices.size(),
hash_index);
return analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes[cohort_index]
.bit_indices[hash_index];
}
std::string RapporAnalyzerTest::BuildBitString(uint16_t candidate_index,
uint16_t cohort_index) {
return BuildBinaryString(
analyzer_->config_->num_bits(),
analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes[cohort_index]
.bit_indices);
}
void RapporAnalyzerTest::AddObservation(uint32_t cohort,
std::string binary_string) {
EXPECT_TRUE(analyzer_->AddObservation(
RapporObservationFromString(cohort, binary_string)));
}
void RapporAnalyzerTest::ExtractEstimatedBitCountRatios(
Eigen::VectorXd* est_bit_count_ratios) {
std::vector<double> est_std_errors_not_used;
EXPECT_TRUE(analyzer_
->ExtractEstimatedBitCountRatiosAndStdErrors(
est_bit_count_ratios, &est_std_errors_not_used)
.ok());
}
void RapporAnalyzerTest::ExtractEstimatedBitCountRatiosAndStdErrors(
Eigen::VectorXd* est_bit_count_ratios,
std::vector<double>* est_std_errors) {
EXPECT_TRUE(analyzer_
->ExtractEstimatedBitCountRatiosAndStdErrors(
est_bit_count_ratios, est_std_errors)
.ok());
}
void RapporAnalyzerTest::AddObservationsForCandidates(
const std::vector<int>& candidate_indices) {
for (auto index : candidate_indices) {
// Construct a new encoder with a new ClientSecret so that a random
// cohort is selected.
RapporEncoder encoder(config_, ClientSecret::GenerateNewSecret());
// Encode the current candidate string using |encoder|.
ValuePart value_part;
value_part.set_string_value(CandidateString(index));
RapporObservation observation;
encoder.Encode(value_part, &observation);
EXPECT_TRUE(analyzer_->AddObservation(observation));
}
}
int RapporAnalyzerTest::GenerateNumberFromPowerLaw(const double left,
const double right,
const double exponent) {
// double precision must be used because of potentially large powers taken
std::uniform_real_distribution<double> uniform_0_1_distribution(0.0, 1.0);
double random_between_0_1 = uniform_0_1_distribution(random_dev_);
double left_to_exponent_plus_1 = std::pow(left, exponent + 1);
double random_power_law_number =
(std::pow(right, exponent + 1) - left_to_exponent_plus_1) *
random_between_0_1 +
left_to_exponent_plus_1;
random_power_law_number =
std::pow(random_power_law_number, 1.0f / (exponent + 1));
return random_power_law_number;
}
std::vector<int> RapporAnalyzerTest::GenerateRandomMapOfIds(
const int num_candidates) {
// Create a "map" of shuffled ids to randomize the observed id values
std::vector<int> candidate_ids_list_shuffled(num_candidates);
std::iota(candidate_ids_list_shuffled.begin(),
candidate_ids_list_shuffled.end(), 0);
std::mt19937 g(random_dev_());
std::shuffle(candidate_ids_list_shuffled.begin(),
candidate_ids_list_shuffled.end(), g);
return candidate_ids_list_shuffled;
}
std::vector<int> RapporAnalyzerTest::CountsEstimatesFromResults(
const std::vector<CandidateResult>& results) {
uint32_t num_candidates = results.size();
std::vector<int> count_estimates(num_candidates);
for (size_t i = 0; i < num_candidates; i++) {
count_estimates[i] = static_cast<int>(round(results[i].count_estimate));
}
return count_estimates;
}
Eigen::VectorXd RapporAnalyzerTest::VectorFromCounts(
const std::vector<int>& counts) {
uint32_t num_candidates = counts.size();
Eigen::VectorXd counts_vector(num_candidates);
for (size_t i = 0; i < num_candidates; i++) {
counts_vector(i) = static_cast<double>(counts[i]);
}
return counts_vector;
}
void RapporAnalyzerTest::CheckExactSolution(
const std::vector<int>& exact_candidate_counts) {
Eigen::VectorXd est_bit_count_ratios;
ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
Eigen::VectorXd exact_count_vector = VectorFromCounts(exact_candidate_counts);
EXPECT_EQ(analyzer_->candidate_matrix_.cols(),
static_cast<const int64_t>(exact_candidate_counts.size()));
Eigen::VectorXd rhs = analyzer_->candidate_matrix_ * exact_count_vector;
rhs /= exact_count_vector.sum();
Eigen::VectorXd difference = rhs - est_bit_count_ratios;
LOG(ERROR)
<< "How well does the exact solution reproduce the right hand side?"
<< difference.norm() / rhs.norm();
}
void RapporAnalyzerTest::PrintTrueCountsAndEstimates(
const std::string& case_label, uint32_t num_candidates,
const std::vector<CandidateResult>& results,
const std::vector<int>& true_candidate_counts) {
std::vector<int> count_estimates = CountsEstimatesFromResults(results);
std::ostringstream true_stream;
for (size_t i = 0; i < num_candidates; i++) {
if (true_candidate_counts[i] > 0) {
true_stream << "beta(" << i << ") == " << true_candidate_counts[i]
<< std::endl;
}
}
LOG(ERROR) << "-------------------------------------";
LOG(ERROR) << case_label;
LOG(ERROR) << "True counts: " << true_stream.str();
std::ostringstream estimate_stream;
for (size_t i = 0; i < num_candidates; i++) {
if (count_estimates[i] > 0) {
estimate_stream << "beta(" << i << ") == " << count_estimates[i]
<< std::endl;
}
}
LOG(ERROR) << " Estimates: " << estimate_stream.str();
}
void RapporAnalyzerTest::AssessUtility(
const std::vector<CandidateResult>& results,
const std::vector<int>& true_candidate_counts) {
// Get the estimates vector as well as the number of nonzero estimates
int num_candidates = results.size();
std::vector<int> estimates_vector = CountsEstimatesFromResults(results);
int how_many_nonzeros =
std::count_if(estimates_vector.begin(), estimates_vector.end(),
[](const int a) { return a > 0; });
// Sort candidate ids in an ascending order according to their real and
// estimated values
std::vector<int> estimated_id_order(num_candidates);
std::iota(estimated_id_order.begin(), estimated_id_order.end(), 0);
std::vector<int> true_id_order = estimated_id_order;
std::sort(estimated_id_order.begin(), estimated_id_order.end(),
[&estimates_vector](const int a, const int b) {
return estimates_vector[a] > estimates_vector[b];
});
std::sort(true_id_order.begin(), true_id_order.end(),
[&true_candidate_counts](const int a, const int b) {
return true_candidate_counts[a] > true_candidate_counts[b];
});
// Compute the false positive rates for a grid of values
LOG(ERROR) << "Identified " << how_many_nonzeros << " nonzero estimates.";
LOG(ERROR) << "The measure of false positives for identified top n hitters:";
std::vector<int> top_hitters_analyzed = {10, 20, 50, 100, 200,
300, 500, 1000, 2000, 5000};
int num_hitters = 0;
for (auto it = top_hitters_analyzed.begin(); it != top_hitters_analyzed.end();
++it) {
num_hitters = *it;
if (num_hitters > num_candidates) {
break;
}
float false_positives = 0;
auto after_nth_element_it = true_id_order.begin() + num_hitters;
for (int i = 0; i < num_hitters; i++) {
auto where_ith_estimated_element = std::find(
true_id_order.begin(), after_nth_element_it, estimated_id_order[i]);
if (where_ith_estimated_element == after_nth_element_it) {
false_positives += 1.0;
}
}
LOG(ERROR) << "The false positive rate at n = " << num_hitters << " is "
<< false_positives / num_hitters;
}
}
grpc::Status RapporAnalyzerTest::ComputeLeastSquaresFitQR(
const Eigen::VectorXd& est_bit_count_ratios,
std::vector<CandidateResult>* results) {
// cast from smaller to larger type for comparisons
const size_t num_candidates =
static_cast<const size_t>(analyzer_->candidate_matrix_.cols());
EXPECT_EQ(results->size(), num_candidates);
// define the QR solver and perform the QR decomposition followed by
// least squares solve
Eigen::SparseQR<Eigen::SparseMatrix<double, Eigen::ColMajor>,
Eigen::COLAMDOrdering<int>>
qrsolver;
EXPECT_EQ(analyzer_->candidate_matrix_.rows(), est_bit_count_ratios.size());
EXPECT_GT(analyzer_->candidate_matrix_.rows(), 0);
// explicitly construct Eigen::ColMajor matrix from candidate_matrix_
// (the documentation for Eigen::SparseQR requires it)
// compute() as well as Eigen::COLAMDOrdering require compressed
// matrix
Eigen::SparseMatrix<double, Eigen::ColMajor> candidate_matrix_col_major =
analyzer_->candidate_matrix_;
candidate_matrix_col_major.makeCompressed();
qrsolver.compute(candidate_matrix_col_major);
if (qrsolver.info() != Eigen::Success) {
std::string message = "Eigen::SparseQR decomposition was unsuccessfull";
return grpc::Status(grpc::INTERNAL, message);
}
Eigen::VectorXd result_vals = qrsolver.solve(est_bit_count_ratios);
if (qrsolver.info() != Eigen::Success) {
std::string message = "Eigen::SparseQR solve was unsuccessfull";
return grpc::Status(grpc::INTERNAL, message);
}
// write to the results vector
EXPECT_EQ(num_candidates, results->size());
for (size_t i = 0; i < num_candidates; i++) {
results->at(i).count_estimate =
result_vals[i] * analyzer_->bit_counter_.num_observations();
results->at(i).std_error = 0;
}
return grpc::Status::OK;
}
void RapporAnalyzerTest::RunSimpleLinearRegressionReference(
const std::string& case_label, uint32_t num_candidates,
uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices,
std::vector<int> true_candidate_counts) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
// set up the matrix
auto status = analyzer_->BuildCandidateMap();
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
// set up the right hand side of the equation
Eigen::VectorXd est_bit_count_ratios;
ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
std::vector<CandidateResult> results(num_candidates);
status = ComputeLeastSquaresFitQR(est_bit_count_ratios, &results);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
PrintTrueCountsAndEstimates(case_label, num_candidates, results,
true_candidate_counts);
}
void RapporAnalyzerTest::ShortExperimentWithAnalyze(
const std::string& case_label, uint32_t num_candidates,
uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices, std::vector<int> true_candidate_counts,
const bool print_estimates) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto status = analyzer_->Analyze(&results);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
if (results.size() != num_candidates) {
EXPECT_EQ(num_candidates, results.size());
return;
}
if (print_estimates) {
PrintTrueCountsAndEstimates(case_label, num_candidates, results,
true_candidate_counts);
}
}
void RapporAnalyzerTest::LongExperimentWithAnalyze(
const std::string& case_label, uint32_t num_candidates,
uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices, std::vector<int> true_candidate_counts,
const bool print_estimates) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto start_analyze_time = std::chrono::steady_clock::now();
auto status = analyzer_->Analyze(&results);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
auto end_analyze_time = std::chrono::steady_clock::now();
LOG(ERROR) << "Analyze() took "
<< std::chrono::duration_cast<std::chrono::seconds>(
end_analyze_time - start_analyze_time)
.count()
<< " seconds.";
if (results.size() != num_candidates) {
EXPECT_EQ(num_candidates, results.size());
return;
}
CheckExactSolution(true_candidate_counts);
if (print_estimates) {
PrintTrueCountsAndEstimates(case_label, num_candidates, results,
true_candidate_counts);
}
AssessUtility(results, true_candidate_counts);
}
void RapporAnalyzerTest::CompareAnalyzeToSimpleRegression(
const std::string& case_label, uint32_t num_candidates,
uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices,
std::vector<int> true_candidate_counts) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
// compute and print the results of Analyze()
std::vector<CandidateResult> results_analyze;
auto status = analyzer_->Analyze(&results_analyze);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
if (results_analyze.size() != num_candidates) {
EXPECT_EQ(num_candidates, results_analyze.size());
return;
}
std::string print_label = case_label + " analyze ";
PrintTrueCountsAndEstimates(print_label, num_candidates, results_analyze,
true_candidate_counts);
// compute and print the results for simple linear regression
Eigen::VectorXd est_bit_count_ratios;
ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
print_label = case_label + " least squares ";
std::vector<CandidateResult> results_ls(num_candidates);
status = ComputeLeastSquaresFitQR(est_bit_count_ratios, &results_ls);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
PrintTrueCountsAndEstimates(print_label, num_candidates, results_ls,
true_candidate_counts);
}
RapporConfig config_;
std::unique_ptr<RapporAnalyzer> analyzer_;
RapporCandidateList candidate_list_;
// By default this test uses p=0, q=1. Individual tests may override this.
double prob_0_becomes_1_ = 0.0;
double prob_1_stays_1_ = 1.0;
// Random device
std::random_device random_dev_;
} // namespace rappor
} // namespace cobalt