blob: 1b8532cd0fde931d61005e2dcc58bfc84c5757e9 [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "algorithms/rappor/rappor_analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <algorithm>
#include <chrono>
#include <random>
#include <string>
#include <utility>
#include <vector>
#include "algorithms/rappor/rappor_encoder.h"
#include "algorithms/rappor/rappor_test_utils.h"
#include "encoder/client_secret.h"
#include "third_party/eigen/Eigen/SparseQR"
#include "third_party/googletest/googletest/include/gtest/gtest.h"
namespace cobalt {
namespace rappor {
using encoder::ClientSecret;
namespace {
std::string CandidateString(int i) {
return std::string("candidate string") + std::to_string(i);
}
// Populates |candidate_list| with |num_candidates| candidates;
void PopulateRapporCandidateList(uint32_t num_candidates,
RapporCandidateList* candidate_list) {
candidate_list->Clear();
for (size_t i = 0; i < num_candidates; i++) {
candidate_list->add_candidates(CandidateString(i));
}
}
// Makes a RapporConfig with the given data.
RapporConfig Config(uint32_t num_bloom_bits, uint32_t num_cohorts,
uint32_t num_hashes, double p, double q) {
RapporConfig config;
config.set_num_bloom_bits(num_bloom_bits);
config.set_num_hashes(num_hashes);
config.set_num_cohorts(num_cohorts);
config.set_prob_0_becomes_1(p);
config.set_prob_1_stays_1(q);
return config;
}
// Given a string of "0"s and "1"s of length a multiple of 8, and a cohort,
// returns a RapporObservation for the given cohort whose data is equal to the
// bytes whose binary representation is given by the string.
RapporObservation RapporObservationFromString(
uint32_t cohort, const std::string& binary_string) {
RapporObservation obs;
obs.set_cohort(cohort);
obs.set_data(BinaryStringToData(binary_string));
return obs;
}
} // namespace
class RapporAnalyzerTest : public ::testing::Test {
protected:
// Sets the member variable analyzer_ to a new RapporAnalyzer configured
// with the given arguments and the current values of prob_0_becomes_1_,
// prob_1_stays_1_.
void SetAnalyzer(uint32_t num_candidates, uint32_t num_bloom_bits,
uint32_t num_cohorts, uint32_t num_hashes) {
PopulateRapporCandidateList(num_candidates, &candidate_list_);
config_ = Config(num_bloom_bits, num_cohorts, num_hashes, prob_0_becomes_1_,
prob_1_stays_1_);
analyzer_.reset(new RapporAnalyzer(config_, &candidate_list_));
}
void BuildCandidateMap() {
EXPECT_EQ(grpc::OK, analyzer_->BuildCandidateMap().error_code());
const uint32_t num_candidates =
analyzer_->candidate_map_.candidate_list->candidates_size();
const uint32_t num_cohorts = analyzer_->config_->num_cohorts();
const uint32_t num_hashes = analyzer_->config_->num_hashes();
const uint32_t num_bits = analyzer_->config_->num_bits();
// Expect the number of candidates to be correct,
EXPECT_EQ(num_candidates,
analyzer_->candidate_map_.candidate_cohort_maps.size());
// and for each candidate...
for (size_t candidate = 0; candidate < num_candidates; candidate++) {
// expect the number of cohorts to be correct,
EXPECT_EQ(num_cohorts,
analyzer_->candidate_map_.candidate_cohort_maps[candidate]
.cohort_hashes.size());
// and for each cohort...
for (size_t cohort = 0; cohort < num_cohorts; cohort++) {
// expect the number of hashes to be correct,
EXPECT_EQ(num_hashes,
analyzer_->candidate_map_.candidate_cohort_maps[candidate]
.cohort_hashes[cohort]
.bit_indices.size());
// and for each hash...
for (size_t hash = 0; hash < num_hashes; hash++) {
// Expect the bit index to be in the range [0, num_bits).
auto bit_index = GetCandidateMapValue(candidate, cohort, hash);
EXPECT_GE(bit_index, 0u);
EXPECT_LT(bit_index, num_bits);
}
}
}
// Validate the associated sparse matrix.
EXPECT_EQ(num_candidates, candidate_matrix().cols());
EXPECT_EQ(num_cohorts * num_bits, candidate_matrix().rows());
EXPECT_LE(num_candidates * num_cohorts, candidate_matrix().nonZeros());
EXPECT_GE(num_candidates * num_cohorts * num_hashes,
candidate_matrix().nonZeros());
}
// This should be invoked after BuildCandidateMap. It returns the bit index
// within the CandidateMap for the given |candidate_index|, |cohort_index|,
// and |hash_index|.
uint16_t GetCandidateMapValue(uint16_t candidate_index, uint16_t cohort_index,
uint16_t hash_index) {
EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps.size(),
candidate_index);
EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes.size(),
cohort_index);
EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes[cohort_index]
.bit_indices.size(),
hash_index);
return analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes[cohort_index]
.bit_indices[hash_index];
}
// Builds and returns a bit string (i.e. a string of ASCII '0's and '1's)
// representing the Bloom filter implicitly stored within the CandidateMap
// for the given |candidate_index| and |cohort_index|.
std::string BuildBitString(uint16_t candidate_index, uint16_t cohort_index) {
return BuildBinaryString(
analyzer_->config_->num_bits(),
analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
.cohort_hashes[cohort_index]
.bit_indices);
}
const Eigen::SparseMatrix<float, Eigen::RowMajor>& candidate_matrix() {
return analyzer_->candidate_matrix_;
}
void AddObservation(uint32_t cohort, std::string binary_string) {
EXPECT_TRUE(analyzer_->AddObservation(
RapporObservationFromString(cohort, binary_string)));
}
void ExtractEstimatedBitCountRatios(Eigen::VectorXf* est_bit_count_ratios) {
EXPECT_TRUE(
analyzer_->ExtractEstimatedBitCountRatios(est_bit_count_ratios).ok());
}
void AddObservationsForCandidates(const std::vector<int>& candidate_indices) {
for (auto index : candidate_indices) {
// Construct a new encoder with a new ClientSecret so that a random
// cohort is selected.
RapporEncoder encoder(config_, ClientSecret::GenerateNewSecret());
// Encode the current candidate string using |encoder|.
ValuePart value_part;
value_part.set_string_value(CandidateString(index));
RapporObservation observation;
encoder.Encode(value_part, &observation);
EXPECT_TRUE(analyzer_->AddObservation(observation));
}
}
// Generate a random number from power law distribution on the interval
// [|left|,|right|] with given |exponent|
int GenerateNumberFromPowerLaw(const double left, const double right,
const double exponent) {
// double precision must be used because of potentially large powers taken
std::uniform_real_distribution<double> uniform_0_1_distribution(0.0, 1.0);
double random_between_0_1 = uniform_0_1_distribution(random_dev_);
double left_to_exponent_plus_1 = std::pow(left, exponent + 1);
double random_power_law_number =
(std::pow(right, exponent + 1) - left_to_exponent_plus_1) *
random_between_0_1 +
left_to_exponent_plus_1;
random_power_law_number =
std::pow(random_power_law_number, 1.0f / (exponent + 1));
return random_power_law_number;
}
std::vector<int> CountsEstimatesFromResults(
const std::vector<CandidateResult>& results) {
uint32_t num_candidates = results.size();
std::vector<int> count_estimates(num_candidates);
for (size_t i = 0; i < num_candidates; i++) {
count_estimates[i] = static_cast<int>(round(results[i].count_estimate));
}
return count_estimates;
}
void PrintTrueCountsAndEstimates(
const std::string& case_label, uint32_t num_candidates,
const std::vector<CandidateResult>& results,
const std::vector<int>& true_candidate_counts) {
std::vector<int> count_estimates = CountsEstimatesFromResults(results);
std::ostringstream true_stream;
for (size_t i = 0; i < num_candidates; i++) {
if (true_candidate_counts[i] > 0) {
true_stream << "beta(" << i << ") == " << true_candidate_counts[i]
<< std::endl;
}
}
LOG(ERROR) << "-------------------------------------";
LOG(ERROR) << case_label;
LOG(ERROR) << "True counts: " << true_stream.str();
std::ostringstream estimate_stream;
for (size_t i = 0; i < num_candidates; i++) {
if (count_estimates[i] > 0) {
estimate_stream << "beta(" << i << ") == " << count_estimates[i]
<< std::endl;
}
}
LOG(ERROR) << " Estimates: " << estimate_stream.str();
}
// Assess utility of |results|. The informal measure suggested by mironov is:
// "Largest n such that at most 10% of the n highest hitters are identified as
// such incorrectly (are false positives)". Obviously, this makes sense only
// for n in some range (it may unjusty suggest that the results are bad for n
// too small, or for n too large, that they are good when in fact they are
// not). Also, 10% is arbitrary. So instead, we just compute the false
// positive rates for n in some grid set. We also print the total number of
// nonzero estimates identified.
void AssessUtility(const std::vector<CandidateResult>& results,
const std::vector<int>& true_candidate_counts) {
// Get the estimates vector as well as the number of nonzero estimates
int num_candidates = results.size();
std::vector<int> estimates_vector = CountsEstimatesFromResults(results);
int how_many_nonzeros =
std::count_if(estimates_vector.begin(), estimates_vector.end(),
[](const int a) { return a > 0; });
// Sort candidate ids in an ascending order according to their real and
// estimated values
std::vector<int> estimated_id_order(num_candidates);
std::iota(estimated_id_order.begin(), estimated_id_order.end(), 0);
std::vector<int> true_id_order = estimated_id_order;
std::sort(estimated_id_order.begin(), estimated_id_order.end(),
[&estimates_vector](const int a, const int b) {
return estimates_vector[a] > estimates_vector[b];
});
std::sort(true_id_order.begin(), true_id_order.end(),
[&true_candidate_counts](const int a, const int b) {
return true_candidate_counts[a] > true_candidate_counts[b];
});
// Compute the false positive rates for a grid of values
LOG(ERROR) << "Identified " << how_many_nonzeros << " nonzero estimates.";
LOG(ERROR)
<< "The measure of false positives for identified top n hitters:";
std::vector<int> top_hitters_analyzed = {10, 20, 50, 100, 200,
300, 500, 1000, 2000, 5000};
int num_hitters = 0;
for (auto it = top_hitters_analyzed.begin();
it != top_hitters_analyzed.end(); ++it) {
num_hitters = *it;
if (num_hitters > num_candidates) {
break;
}
float false_positives = 0;
auto after_nth_element_it = true_id_order.begin() + num_hitters;
for (int i = 0; i < num_hitters; i++) {
auto where_ith_estimated_element = std::find(
true_id_order.begin(), after_nth_element_it, estimated_id_order[i]);
if (where_ith_estimated_element == after_nth_element_it) {
false_positives += 1.0;
}
}
LOG(ERROR) << "The false positive rate at n = " << num_hitters << " is "
<< false_positives / num_hitters;
}
}
// Checks correctness of the solution stored in |results| in an explicit way.
// This is not an automated test but rather a tool to manually assess the
// minimizer quality.
//
// Assumes that *analyzer_ contains minimizer data from previous run.
// The problem is (as formulated in lossmin library):
// min L(beta) ==
// 1/(2*N) * ||X * beta - y||_2^2 + 1/2 * l2 *||beta||_2^2 + l1 *||beta||_1,
// with variable beta. We assume l1,l2 >= 0.
// In our case, X == candidate_matrix(),
// beta == |results| / analyzer_->bit_counter().num_observations(),
// y == est_bit_count_ratios (observed ratios computed by calling
// analyzer_->ExtractEstimatedBitCountRatios(&est_bit_count_ratios)),
// l1 == analyzer_->minimizer_data.l1,
// l2 == analyzer_->minimizer_data.l2,
// N == canididate_matrix.rows().
//
// Let grad denote the gradient of
// F(beta) = 1/(2*N) * ||X * beta - y||_2^2 + 1/2 * l2 ||beta||_2^2.
// Note that grad == 1/N * X^T(X * beta - y) + beta.
//
// The KKT condition (in exact arithmetic) can be
// written explicitly in the following way:
// If beta_i > 0, then grad_i == -l1
// If beta_i < 0, then grad_i == l1
// If beta_i == 0, then -l1 <= grad_i <= l1.
//
// A point beta is a minimizer iff the KKT condition holds for beta
// (this minimizer need not be unique though).
//
// We check the KKT condition up to a given accuracy:
// |tol_cand| is the absolute tolerance at which we measure values of beta
// |tol_grad| is the absolute tolerance at which we measure values of grad
//
// Thus, beta_i > 0 is replaced by beta_i > tol_cand, beta_i < 0 is replaced
// by beta_i < -tol_cand, grad_i == +/- l1 is replaced by
// grad_i <=/>= +/- l1 +/- tol_grad
// and similarly for the inequality check.
// tol_cand and tol_grad should be consistent with implementation of
// lossmin::LossMinimizer::ConvergenceCheck but other values can be useful
// for testing.
// TODO(bazyli) make sure these checks remain consistent with lossmin and
// floatig point arithmetics.
//
// The test also prints quantitative violation of KKT condition as a mean
// violation per coordinate.
void CheckSolutionCorrectness(const float tol_cand, const float tol_grad,
const std::vector<CandidateResult>& results) {
// Populate the values from result to an Eigen::VectorXf object;
// (It looks like Eigen doesn't have its own iterators); this should be
// clear:
const size_t num_candidates = results.size();
Eigen::VectorXf candidate_estimates(num_candidates);
for (size_t i = 0; i < num_candidates; i++) {
candidate_estimates(i) = results[i].count_estimate /
analyzer_->bit_counter().num_observations();
}
// Get the penalty paramaters
const float l1 = analyzer_->minimizer_data_.l1;
const float l2 = analyzer_->minimizer_data_.l2;
// Extract y and compute the gradient = X^T * (X * beta - y) + l2 beta
Eigen::VectorXf est_bit_count_ratios;
auto status =
analyzer_->ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
if (!status.ok()) {
return;
}
EXPECT_EQ(est_bit_count_ratios.size(), candidate_matrix().rows());
EXPECT_EQ(candidate_estimates.size(), candidate_matrix().cols());
Eigen::VectorXf gradient =
candidate_matrix().transpose() *
(candidate_matrix() * candidate_estimates - est_bit_count_ratios);
// Scale regression part of the gradient for consistency with lossmin
// library
gradient /= candidate_matrix().rows();
gradient += l2 * candidate_estimates;
std::ostringstream kkt_stream;
LOG(ERROR) << "Analyzing the minimizer data";
LOG(ERROR) << "Converged? " << analyzer_->minimizer_data_.converged;
LOG(ERROR) << "How many epochs? "
<< analyzer_->minimizer_data_.num_epochs_run;
LOG(ERROR) << "Final l1 penalty == " << analyzer_->minimizer_data_.l1;
LOG(ERROR) << "Checking solution correctness at each coordinate ...";
// Check the KKT condition for each coordinate
int num_errs = 0;
for (size_t i = 0; i < num_candidates; i++) {
float beta_i = candidate_estimates(i);
float grad_i = gradient(i);
if ((std::abs(beta_i) < tol_cand && std::abs(grad_i) > l1 + tol_grad) ||
(beta_i > tol_cand && std::abs(grad_i + l1) > tol_grad) ||
(beta_i < -tol_cand && std::abs(grad_i - l1) > tol_grad)) {
kkt_stream << "Solution is not a minimizer at tolerance == " << tol_grad
<< " because beta_k == " << beta_i
<< " and grad_k == " << grad_i << " at k == " << i
<< " while l1 == " << l1 << std::endl;
num_errs++;
}
}
LOG(ERROR) << kkt_stream.str();
LOG(ERROR) << "All coordinates examined. Found " << num_errs
<< " coordinates violating optimality conditions.";
EXPECT_EQ(num_errs, 0);
// Report also the measure of total violation of KKT condition
Eigen::VectorXf kkt_violation = (candidate_estimates.array() >= tol_cand)
.select(gradient.array() + l1, 0)
.matrix();
kkt_violation += (candidate_estimates.array() <= -tol_cand)
.select(gradient.array() - l1, 0)
.matrix();
kkt_violation += ((abs(candidate_estimates.array()) < tol_cand)
.select(abs(gradient.array()) - l1, 0)
.max(0))
.matrix();
LOG(ERROR) << "The total measure of KKT condition violation == "
<< kkt_violation.norm() / num_candidates;
}
// Computes the least squares fit on the candidate matrix using QR,
// for the given rhs in |est_bit_count_ratios| and saves it to |results|
grpc::Status ComputeLeastSquaresFitQR(
const Eigen::VectorXf& est_bit_count_ratios,
std::vector<CandidateResult>* results) {
// cast from smaller to larger type for comparisons
const size_t num_candidates =
static_cast<const size_t>(analyzer_->candidate_matrix_.cols());
EXPECT_EQ(results->size(), num_candidates);
// define the QR solver and perform the QR decomposition followed by
// least squares solve
Eigen::SparseQR<Eigen::SparseMatrix<float, Eigen::ColMajor>,
Eigen::COLAMDOrdering<int>>
qrsolver;
EXPECT_EQ(analyzer_->candidate_matrix_.rows(), est_bit_count_ratios.size());
EXPECT_GT(analyzer_->candidate_matrix_.rows(), 0);
// explicitly construct Eigen::ColMajor matrix from candidate_matrix_
// (the documentation for Eigen::SparseQR requires it)
// compute() as well as Eigen::COLAMDOrdering require compressed
// matrix
Eigen::SparseMatrix<float, Eigen::ColMajor> candidate_matrix_col_major =
analyzer_->candidate_matrix_;
candidate_matrix_col_major.makeCompressed();
qrsolver.compute(candidate_matrix_col_major);
if (qrsolver.info() != Eigen::Success) {
std::string message = "Eigen::SparseQR decomposition was unsuccessfull";
return grpc::Status(grpc::INTERNAL, message);
}
Eigen::VectorXf result_vals = qrsolver.solve(est_bit_count_ratios);
if (qrsolver.info() != Eigen::Success) {
std::string message = "Eigen::SparseQR solve was unsuccessfull";
return grpc::Status(grpc::INTERNAL, message);
}
// write to the results vector
EXPECT_EQ(num_candidates, results->size());
for (size_t i = 0; i < num_candidates; i++) {
results->at(i).count_estimate =
result_vals[i] * analyzer_->bit_counter_.num_observations();
results->at(i).std_error = 0;
}
return grpc::Status::OK;
}
// Runs a simple least squares problem for Ax = b on the candidate matrix
// using QR algorithm from eigen library; this is to see the results
// without penalty terms (note: in an overdetermined system the solution
// is not unique so this is more a helper testing function to cross-check
// the behavior of regression without penalty)
void RunSimpleLinearRegressionReference(
const std::string& case_label, uint32_t num_candidates,
uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices,
std::vector<int> true_candidate_counts) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
// set up the matrix
auto status = analyzer_->BuildCandidateMap();
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
// set up the right hand side of the equation
Eigen::VectorXf est_bit_count_ratios;
status = analyzer_->ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
std::vector<CandidateResult> results(num_candidates);
status = ComputeLeastSquaresFitQR(est_bit_count_ratios, &results);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
PrintTrueCountsAndEstimates(case_label, num_candidates, results,
true_candidate_counts);
}
// Invokes the Analyze() method using the given parameters. Checks that
// the algorithms converges and that the result vector has the correct length.
// Doesn't check the result vector at all but uses LOG(ERROR) statments
// to print the true candidate counts and the computed estimates to the
// console for the sake of experimentation.
void DoExperimentWithAnalyze(const std::string& case_label,
uint32_t num_candidates, uint32_t num_bloom_bits,
uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices,
std::vector<int> true_candidate_counts,
const bool print_estimates) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto start_analyze_time = std::chrono::steady_clock::now();
auto status = analyzer_->Analyze(&results);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
auto end_analyze_time = std::chrono::steady_clock::now();
LOG(ERROR) << "Analyze() took "
<< std::chrono::duration_cast<std::chrono::seconds>(
end_analyze_time - start_analyze_time)
.count()
<< " seconds.";
if (results.size() != num_candidates) {
EXPECT_EQ(num_candidates, results.size());
return;
}
if (print_estimates) {
PrintTrueCountsAndEstimates(case_label, num_candidates, results,
true_candidate_counts);
}
CheckSolutionCorrectness(1e-4, 1e-4, results);
AssessUtility(results, true_candidate_counts);
}
// Does the same as DoExperimentWithAnalyze except it also
// computes the estimates for both Analyze and simple regression using QR,
// which is computed on exactly the same system.
void CompareAnalyzeToSimpleRegression(
const std::string& case_label, uint32_t num_candidates,
uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
std::vector<int> candidate_indices,
std::vector<int> true_candidate_counts) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
AddObservationsForCandidates(candidate_indices);
// compute and print the results of Analyze()
std::vector<CandidateResult> results_analyze;
auto status = analyzer_->Analyze(&results_analyze);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
if (results_analyze.size() != num_candidates) {
EXPECT_EQ(num_candidates, results_analyze.size());
return;
}
std::string print_label = case_label + " analyze ";
PrintTrueCountsAndEstimates(print_label, num_candidates, results_analyze,
true_candidate_counts);
// compute and print the results for simple linear regression
Eigen::VectorXf est_bit_count_ratios;
status = analyzer_->ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
print_label = case_label + " least squares ";
std::vector<CandidateResult> results_ls(num_candidates);
status = ComputeLeastSquaresFitQR(est_bit_count_ratios, &results_ls);
if (!status.ok()) {
EXPECT_EQ(grpc::OK, status.error_code());
return;
}
PrintTrueCountsAndEstimates(print_label, num_candidates, results_ls,
true_candidate_counts);
}
RapporConfig config_;
std::unique_ptr<RapporAnalyzer> analyzer_;
RapporCandidateList candidate_list_;
// By default this test uses p=0, q=1. Individual tests may override this.
double prob_0_becomes_1_ = 0.0;
double prob_1_stays_1_ = 1.0;
// Random device
std::random_device random_dev_;
};
// Tests the function BuildCandidateMap. We build one small CandidateMap and
// then we explicitly check every value against a known value. We have not
// independently verified the SHA-256 hash values and so rather than a test
// of correctness this is firstly a sanity test: we can eyeball the values
// and confirm they look sane, and secondly a regression test.
TEST_F(RapporAnalyzerTest, BuildCandidateMapSmallTest) {
static const uint32_t kNumCandidates = 5;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
BuildCandidateMap();
// clang-format off
int expected_bit_indices[kNumCandidates][kNumCohorts*kNumHashes] = {
// cihj means cohort = i and hash-index = j.
// c0h0 c0h1 c1h0 c1h1 c2h0 c2h2
{3, 5, 2, 6, 3, 6}, // candidate 0
{1, 5, 4, 7, 2, 0}, // candidate 1
{3, 0, 2, 0, 1, 4}, // candidate 2
{5, 1, 2, 4, 2, 4}, // candidate 3
{1, 4, 3, 1, 2, 6}, // candidate 4
};
// clang-format on
for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
for (size_t cohort = 0; cohort < kNumCohorts; cohort++) {
for (size_t hash = 0; hash < kNumHashes; hash++) {
EXPECT_EQ(expected_bit_indices[candidate][cohort * kNumHashes + hash],
GetCandidateMapValue(candidate, cohort, hash))
<< "(" << candidate << "," << cohort * kNumHashes + hash << ")";
}
}
}
// Check the associated sparse matrix.
std::ostringstream stream;
stream << candidate_matrix().block(0, 0, kNumCohorts * kNumBloomBits,
kNumCandidates);
const char* kExpectedMatrixString =
"0 0 0 0 0 \n"
"0 0 0 0 0 \n"
"1 1 0 1 0 \n"
"0 0 0 0 1 \n"
"1 0 1 0 0 \n"
"0 0 0 0 0 \n"
"0 1 0 1 1 \n"
"0 0 1 0 0 \n"
"0 1 0 0 0 \n"
"1 0 0 0 0 \n"
"0 0 0 0 0 \n"
"0 1 0 1 0 \n"
"0 0 0 0 1 \n"
"1 0 1 1 0 \n"
"0 0 0 0 1 \n"
"0 0 1 0 0 \n"
"0 0 0 0 0 \n"
"1 0 0 0 1 \n"
"0 0 0 0 0 \n"
"0 0 1 1 0 \n"
"1 0 0 0 0 \n"
"0 1 0 1 1 \n"
"0 0 1 0 0 \n"
"0 1 0 0 0 \n";
EXPECT_EQ(kExpectedMatrixString, stream.str());
}
// This test is identical to the previous test except that kNumBloomBits = 4
// instead of 8. The purpose of this test is to force the situation in which
// the two hash functions for a given cohort and a given candidate give the
// same value. For example below we see that for candidate 0, cohort 1, both
// hash functions yielded a 2. We want to test that the associated sparse
// matrix has a "1" in the corresponding position (in this case that is
// row 5, column 0) and does not have a "2" in that position. In other words
// we want to test that we correctly added only one entry to the list of
// triples that defined the sparse matrix and not two entries.
TEST_F(RapporAnalyzerTest, BuildCandidateMapSmallTestWithDuplicates) {
static const uint32_t kNumCandidates = 5;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 4;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
BuildCandidateMap();
// clang-format off
int expected_bit_indices[kNumCandidates][kNumCohorts*kNumHashes] = {
// cihj means cohort = i and hash-index = j.
// c0h0 c0h1 c1h0 c1h1 c2h0 c2h2
{3, 1, 2, 2, 3, 2}, // candidate 0
{1, 1, 0, 3, 2, 0}, // candidate 1
{3, 0, 2, 0, 1, 0}, // candidate 2
{1, 1, 2, 0, 2, 0}, // candidate 3
{1, 0, 3, 1, 2, 2}, // candidate 4
};
// clang-format on
for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
for (size_t cohort = 0; cohort < kNumCohorts; cohort++) {
for (size_t hash = 0; hash < kNumHashes; hash++) {
EXPECT_EQ(expected_bit_indices[candidate][cohort * kNumHashes + hash],
GetCandidateMapValue(candidate, cohort, hash))
<< "(" << candidate << "," << cohort * kNumHashes + hash << ")";
}
}
}
// Check the associated sparse matrix.
std::ostringstream stream;
stream << candidate_matrix().block(0, 0, kNumCohorts * kNumBloomBits,
kNumCandidates);
const char* kExpectedMatrixString =
"1 0 1 0 0 \n"
"0 0 0 0 0 \n"
"1 1 0 1 1 \n"
"0 0 1 0 1 \n"
"0 1 0 0 1 \n"
"1 0 1 1 0 \n"
"0 0 0 0 1 \n"
"0 1 1 1 0 \n"
"1 0 0 0 0 \n"
"1 1 0 1 1 \n"
"0 0 1 0 0 \n"
"0 1 1 1 0 \n";
EXPECT_EQ(kExpectedMatrixString, stream.str());
}
// Tests the function BuildCandidateMap. We build many different CandidateMaps
// with many different parameters. We are testing firstly that the procedure
// completes without error, secondly that the shape of the produced
// data structure is correct and thirdly that the bit indexes are in the range
// [0, num_bloom_bits). The latter two checks occur inside of
// BuildCandidateMap.
TEST_F(RapporAnalyzerTest, BuildCandidateMapSmokeTest) {
for (auto num_candidates : {11, 51, 99}) {
for (auto num_cohorts : {23, 45}) {
for (auto num_hashes : {2, 6, 7}) {
for (auto num_bloom_bits : {16, 128}) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
BuildCandidateMap();
}
}
}
}
}
// Tests the function BuildCandidateMap. We test that the map that is built
// is consistent with the Bloom filters that are built by an encoder.
TEST_F(RapporAnalyzerTest, BuildCandidateMapCompareWithEncoder) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 20;
static const uint32_t kNumHashes = 5;
static const uint32_t kNumBloomBits = 64;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
BuildCandidateMap();
for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
// Construct a new encoder with a new ClientSecret so that a random
// cohort is selected.
RapporEncoder encoder(config_, ClientSecret::GenerateNewSecret());
// Encode the current candidate string using |encoder|.
ValuePart value_part;
value_part.set_string_value(CandidateString(candidate));
RapporObservation observation;
encoder.Encode(value_part, &observation);
// Since p=0 and q=1 the RapporObservation contains the raw Bloom filter
// with no noise added. Confirm that the BloomFilter is the same as
// the one implied by the CandidateMap at the appropriate candidate
// and cohort.
EXPECT_EQ(BuildBitString(candidate, encoder.cohort()),
DataToBinaryString(observation.data()));
}
}
// Tests the function ExtractEstimatedBitCountRatios(). We build one small
// estimated bit count ratio vector and explicitly check its values. We
// use no-randomness: p = 0, q = 1 so that the estimated bit counts are
// identical to the true bit counts.
TEST_F(RapporAnalyzerTest, ExtractEstimatedBitCountRatiosSmallNonRandomTest) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
AddObservation(0, "00001010");
AddObservation(0, "00010010");
AddObservation(1, "00001010");
AddObservation(1, "00010010");
AddObservation(1, "00100010");
AddObservation(2, "00001010");
AddObservation(2, "00010010");
AddObservation(2, "00010010");
AddObservation(2, "00100010");
Eigen::VectorXf est_bit_count_ratios;
ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
std::ostringstream stream;
stream << est_bit_count_ratios.block(0, 0, kNumCohorts * kNumBloomBits, 1);
const char* kExpectedVectorString =
" 0\n"
" 0\n"
" 0\n"
" 0.5\n"
" 0.5\n"
" 0\n"
" 1\n"
" 0\n"
" 0\n"
" 0\n"
"0.333333\n"
"0.333333\n"
"0.333333\n"
" 0\n"
" 1\n"
" 0\n"
" 0\n"
" 0\n"
" 0.25\n"
" 0.5\n"
" 0.25\n"
" 0\n"
" 1\n"
" 0";
EXPECT_EQ(kExpectedVectorString, stream.str());
}
// This is not really a test so much as an experiment with the Analyze() method.
// It invokes Analyze() in a few very simple cases, checks that the the
// algorithm converges and that the result vector has the correct size. Then
// it prints out the true candidate counts and the computed estimates.
TEST_F(RapporAnalyzerTest, DISABLED_ExperimentWithAnalyze) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
static const bool print_estimates = true;
std::vector<int> candidate_indices(100, 5);
std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
DoExperimentWithAnalyze(
"p=0, q=1, only candidate 5", kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices, true_candidate_counts, print_estimates);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
DoExperimentWithAnalyze("p=0, q=1, several candidates", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
prob_0_becomes_1_ = 0.1;
prob_1_stays_1_ = 0.9;
candidate_indices = std::vector<int>(100, 5);
true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
DoExperimentWithAnalyze("p=0.1, q=0.9, only candidate 5", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
DoExperimentWithAnalyze("p=0.1, q=0.9, several candidates", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
}
// Comparison of Analyze and simple least squares
// It invokes Analyze() in a few very simple cases, checks that the the
// algorithm converges and that the result vector has the correct size. For each
// case, it also computes the least squares solution using QR for exactly the
// same system and prints both solutions (note that the least squares solution
// is not always unique)
TEST_F(RapporAnalyzerTest, DISABLED_CompareAnalyzeToRegression) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
std::vector<int> candidate_indices(100, 5);
std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
CompareAnalyzeToSimpleRegression("p=0, q=1, only candidate 5", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
CompareAnalyzeToSimpleRegression(
"p=0, q=1, several candidates", kNumCandidates, kNumBloomBits,
kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts);
prob_0_becomes_1_ = 0.1;
prob_1_stays_1_ = 0.9;
candidate_indices = std::vector<int>(100, 5);
true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
CompareAnalyzeToSimpleRegression(
"p=0.1, q=0.9, only candidate 5", kNumCandidates, kNumBloomBits,
kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
CompareAnalyzeToSimpleRegression(
"p=0.1, q=0.9, several candidates", kNumCandidates, kNumBloomBits,
kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts);
}
// This is similar to ExperimentWithAnalyze but the true candidate counts are
// distributed according to the power law; we specify the number of
// observations and the exponent parameter of the power law. The ids are then
// shuffled so that it is not true that large ids are more frequent.
// Additionally, we test accuracy of RAPPOR as a privacy-preserving algorithm
// for the specified values of p (prob_0_becomes_1_) and q (prob_1_stays_1_), by
// calling AccessUtility.
// Note: encoding observations is time consuming so large tests may take long.
TEST_F(RapporAnalyzerTest, DISABLED_PowerLawExperiment) {
static const uint32_t kNumCandidates = 20000;
static const uint32_t kNumCohorts = 128;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 128;
static const uint32_t kNumObservations = 1e+6;
static const bool print_estimates = false;
const double exponent = 30;
const int max_id = static_cast<const int>(kNumCandidates - 1);
std::vector<int> candidate_indices(kNumObservations);
std::vector<int> true_candidate_counts(kNumCandidates, 0);
// Create a "map" of shuffled ids to randomize the observed id values
std::vector<int> candidate_ids_list_shuffled(kNumCandidates);
std::iota(candidate_ids_list_shuffled.begin(),
candidate_ids_list_shuffled.end(), 0);
std::mt19937 g(random_dev_());
std::shuffle(candidate_ids_list_shuffled.begin(),
candidate_ids_list_shuffled.end(), g);
// Generate observations from the power law distribution on
// [0,kNumCandidates-1]
const double left = 0.0;
const double right = static_cast<const double>(max_id);
for (uint32_t i = 0; i < kNumObservations; i++) {
double random_power_law_number =
GenerateNumberFromPowerLaw(left, right, exponent);
int observed_candidate_id = static_cast<int>(random_power_law_number);
observed_candidate_id = std::min(observed_candidate_id, max_id);
observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
candidate_indices[i] = observed_candidate_id;
true_candidate_counts[observed_candidate_id]++;
}
DoExperimentWithAnalyze("p=0, q=1, exponential distribution", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
prob_0_becomes_1_ = 0.25;
prob_1_stays_1_ = 0.75;
DoExperimentWithAnalyze("p=0.5, q=0.75, exponential distribution",
kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices, true_candidate_counts,
print_estimates);
}
} // namespace rappor
} // namespace cobalt