blob: fca101e67a7c18d0a59708f2297dcc6f6053d40a [file] [log] [blame]
// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "algorithms/rappor/rappor_analyzer_test.h"
namespace cobalt {
namespace rappor {
using encoder::ClientSecret;
// Tests the function BuildCandidateMap. We build one small CandidateMap and
// then we explicitly check every value against a known value. We have not
// independently verified the SHA-256 hash values and so rather than a test
// of correctness this is firstly a sanity test: we can eyeball the values
// and confirm they look sane, and secondly a regression test.
TEST_F(RapporAnalyzerTest, BuildCandidateMapSmallTest) {
static const uint32_t kNumCandidates = 5;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
BuildCandidateMap();
// clang-format off
int expected_bit_indices[kNumCandidates][kNumCohorts*kNumHashes] = {
// cihj means cohort = i and hash-index = j.
// c0h0 c0h1 c1h0 c1h1 c2h0 c2h2
{3, 5, 2, 6, 3, 6}, // candidate 0
{1, 5, 4, 7, 2, 0}, // candidate 1
{3, 0, 2, 0, 1, 4}, // candidate 2
{5, 1, 2, 4, 2, 4}, // candidate 3
{1, 4, 3, 1, 2, 6}, // candidate 4
};
// clang-format on
for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
for (size_t cohort = 0; cohort < kNumCohorts; cohort++) {
for (size_t hash = 0; hash < kNumHashes; hash++) {
EXPECT_EQ(expected_bit_indices[candidate][cohort * kNumHashes + hash],
GetCandidateMapValue(candidate, cohort, hash))
<< "(" << candidate << "," << cohort * kNumHashes + hash << ")";
}
}
}
// Check the associated sparse matrix.
std::ostringstream stream;
stream << candidate_matrix().block(0, 0, kNumCohorts * kNumBloomBits,
kNumCandidates);
const char* kExpectedMatrixString =
"0 0 0 0 0 \n"
"0 0 0 0 0 \n"
"1 1 0 1 0 \n"
"0 0 0 0 1 \n"
"1 0 1 0 0 \n"
"0 0 0 0 0 \n"
"0 1 0 1 1 \n"
"0 0 1 0 0 \n"
"0 1 0 0 0 \n"
"1 0 0 0 0 \n"
"0 0 0 0 0 \n"
"0 1 0 1 0 \n"
"0 0 0 0 1 \n"
"1 0 1 1 0 \n"
"0 0 0 0 1 \n"
"0 0 1 0 0 \n"
"0 0 0 0 0 \n"
"1 0 0 0 1 \n"
"0 0 0 0 0 \n"
"0 0 1 1 0 \n"
"1 0 0 0 0 \n"
"0 1 0 1 1 \n"
"0 0 1 0 0 \n"
"0 1 0 0 0 \n";
EXPECT_EQ(kExpectedMatrixString, stream.str());
}
// This test is identical to the previous test except that kNumBloomBits = 4
// instead of 8. The purpose of this test is to force the situation in which
// the two hash functions for a given cohort and a given candidate give the
// same value. For example below we see that for candidate 0, cohort 1, both
// hash functions yielded a 2. We want to test that the associated sparse
// matrix has a "1" in the corresponding position (in this case that is
// row 5, column 0) and does not have a "2" in that position. In other words
// we want to test that we correctly added only one entry to the list of
// triples that defined the sparse matrix and not two entries.
TEST_F(RapporAnalyzerTest, BuildCandidateMapSmallTestWithDuplicates) {
static const uint32_t kNumCandidates = 5;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 4;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
BuildCandidateMap();
// clang-format off
int expected_bit_indices[kNumCandidates][kNumCohorts*kNumHashes] = {
// cihj means cohort = i and hash-index = j.
// c0h0 c0h1 c1h0 c1h1 c2h0 c2h2
{3, 1, 2, 2, 3, 2}, // candidate 0
{1, 1, 0, 3, 2, 0}, // candidate 1
{3, 0, 2, 0, 1, 0}, // candidate 2
{1, 1, 2, 0, 2, 0}, // candidate 3
{1, 0, 3, 1, 2, 2}, // candidate 4
};
// clang-format on
for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
for (size_t cohort = 0; cohort < kNumCohorts; cohort++) {
for (size_t hash = 0; hash < kNumHashes; hash++) {
EXPECT_EQ(expected_bit_indices[candidate][cohort * kNumHashes + hash],
GetCandidateMapValue(candidate, cohort, hash))
<< "(" << candidate << "," << cohort * kNumHashes + hash << ")";
}
}
}
// Check the associated sparse matrix.
std::ostringstream stream;
stream << candidate_matrix().block(0, 0, kNumCohorts * kNumBloomBits,
kNumCandidates);
const char* kExpectedMatrixString =
"1 0 1 0 0 \n"
"0 0 0 0 0 \n"
"1 1 0 1 1 \n"
"0 0 1 0 1 \n"
"0 1 0 0 1 \n"
"1 0 1 1 0 \n"
"0 0 0 0 1 \n"
"0 1 1 1 0 \n"
"1 0 0 0 0 \n"
"1 1 0 1 1 \n"
"0 0 1 0 0 \n"
"0 1 1 1 0 \n";
EXPECT_EQ(kExpectedMatrixString, stream.str());
}
// Tests the function BuildCandidateMap. We build many different CandidateMaps
// with many different parameters. We are testing firstly that the procedure
// completes without error, secondly that the shape of the produced
// data structure is correct and thirdly that the bit indexes are in the range
// [0, num_bloom_bits). The latter two checks occur inside of
// BuildCandidateMap.
TEST_F(RapporAnalyzerTest, BuildCandidateMapSmokeTest) {
for (auto num_candidates : {11, 51, 99}) {
for (auto num_cohorts : {23, 45}) {
for (auto num_hashes : {2, 6, 7}) {
for (auto num_bloom_bits : {16, 128}) {
SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
BuildCandidateMap();
}
}
}
}
}
// Tests the function BuildCandidateMap. We test that the map that is built
// is consistent with the Bloom filters that are built by an encoder.
TEST_F(RapporAnalyzerTest, BuildCandidateMapCompareWithEncoder) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 20;
static const uint32_t kNumHashes = 5;
static const uint32_t kNumBloomBits = 64;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
BuildCandidateMap();
for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
// Construct a new encoder with a new ClientSecret so that a random
// cohort is selected.
RapporEncoder encoder(config_, ClientSecret::GenerateNewSecret());
// Encode the current candidate string using |encoder|.
ValuePart value_part;
value_part.set_string_value(CandidateString(candidate));
RapporObservation observation;
encoder.Encode(value_part, &observation);
// Since p=0 and q=1 the RapporObservation contains the raw Bloom filter
// with no noise added. Confirm that the BloomFilter is the same as
// the one implied by the CandidateMap at the appropriate candidate
// and cohort.
EXPECT_EQ(BuildBitString(candidate, encoder.cohort()),
DataToBinaryString(observation.data()));
}
}
// Tests the function ExtractEstimatedBitCountRatios(). We build one small
// estimated bit count ratio vector and explicitly check its values. We
// use no-randomness: p = 0, q = 1 so that the estimated bit counts are
// identical to the true bit counts.
TEST_F(RapporAnalyzerTest, ExtractEstimatedBitCountRatiosSmallNonRandomTest) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
AddObservation(0, "00001010");
AddObservation(0, "00010010");
AddObservation(1, "00001010");
AddObservation(1, "00010010");
AddObservation(1, "00100010");
AddObservation(2, "00001010");
AddObservation(2, "00010010");
AddObservation(2, "00010010");
AddObservation(2, "00100010");
Eigen::VectorXd est_bit_count_ratios;
ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
std::ostringstream stream;
stream << est_bit_count_ratios.block(0, 0, kNumCohorts * kNumBloomBits, 1);
const char* kExpectedVectorString =
" 0\n"
" 0\n"
" 0\n"
" 0.5\n"
" 0.5\n"
" 0\n"
" 1\n"
" 0\n"
" 0\n"
" 0\n"
"0.333333\n"
"0.333333\n"
"0.333333\n"
" 0\n"
" 1\n"
" 0\n"
" 0\n"
" 0\n"
" 0.25\n"
" 0.5\n"
" 0.25\n"
" 0\n"
" 1\n"
" 0";
EXPECT_EQ(kExpectedVectorString, stream.str());
}
// This test invokes Analyze() in a few very simple cases, checks that the the
// algorithm converges and that the result vector has the correct size.
TEST_F(RapporAnalyzerTest, SimpleAnalyzeTest) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 3;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
static const bool print_estimates = false;
std::vector<int> candidate_indices(100, 5);
std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
ShortExperimentWithAnalyze(
"p=0, q=1, only candidate 5", kNumCandidates, kNumBloomBits, kNumCohorts,
kNumHashes, candidate_indices, true_candidate_counts, print_estimates);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
ShortExperimentWithAnalyze("p=0, q=1, several candidates", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
prob_0_becomes_1_ = 0.1;
prob_1_stays_1_ = 0.9;
candidate_indices = std::vector<int>(100, 5);
true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0};
ShortExperimentWithAnalyze("p=0.1, q=0.9, only candidate 5", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
candidate_indices = std::vector<int>(20, 1);
candidate_indices.insert(candidate_indices.end(), 20, 4);
candidate_indices.insert(candidate_indices.end(), 60, 9);
true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60};
ShortExperimentWithAnalyze("p=0.1, q=0.9, several candidates", kNumCandidates,
kNumBloomBits, kNumCohorts, kNumHashes,
candidate_indices, true_candidate_counts,
print_estimates);
}
// Runs Analyze() on the corner-case of one candidate and one cohort.
// The result should be exact.
TEST_F(RapporAnalyzerTest, SingleCandidateTest) {
static const uint32_t kNumCandidates = 1;
static const uint32_t kNumCohorts = 1;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 8;
// No noise introduced
prob_0_becomes_1_ = 0.0;
prob_1_stays_1_ = 1.0;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
std::vector<int> candidate_indices(100, 0);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto status = analyzer_->Analyze(&results);
EXPECT_EQ(results.size(), kNumCandidates);
EXPECT_GE(results[0].count_estimate, 99);
EXPECT_LE(results[0].count_estimate, 101);
}
// Runs Analyze with one cohort and the number of bits much larger than the
// number of candidates. No noise is introduced. The heavy hitter should be
// reproduced nearly exactly (assuming second step of RAPPOR is "on" and
// working). Note(bazyli): this test is based on the assumption that the number
// of bits in the cohort is so large that no or almost no of the candidates will
// share a non-zero bit with the heavy hitter.
TEST_F(RapporAnalyzerTest, ExactSolutionTest) {
// The probability that the heaviest hitter will not share a nonzero bit with
// any other candidate when kNumCandidates = 10, kNumCohorts = 1, kNumHashes =
// 2, kNumBloomBits = 128, is (125 * 126 / (127 * 128))^ 9 = 0.75. In that
// case, the heaviest hitter must be solved exactly. Even if it does share a
// bit or two with some other candidate(s), this is unlikely to alter the
// result significantly.
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 1;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 128;
// No noise introduced.
prob_0_becomes_1_ = 0.0;
prob_1_stays_1_ = 1.0;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
std::vector<int> candidate_indices(10, 0);
candidate_indices.insert(candidate_indices.end(), 10, 1);
candidate_indices.insert(candidate_indices.end(), 10, 2);
candidate_indices.insert(candidate_indices.end(), 10, 3);
candidate_indices.insert(candidate_indices.end(), 10, 4);
candidate_indices.insert(candidate_indices.end(), 100, 9);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto status = analyzer_->Analyze(&results);
// Check that the heavy hitter is reproduced almost exactly
EXPECT_GE(results[9].count_estimate, 95);
EXPECT_LE(results[9].count_estimate, 105);
// Candidates that are not observed should have zero counts
EXPECT_EQ(results[5].count_estimate, 0);
EXPECT_EQ(results[6].count_estimate, 0);
EXPECT_EQ(results[7].count_estimate, 0);
EXPECT_EQ(results[8].count_estimate, 0);
// All standard errors should be numerically 0
for (uint32_t i = 0; i < kNumCandidates; i++) {
EXPECT_LE(results[i].std_error, 1e-12);
}
}
// Runs Analyzer with two cohorts and large number of bits. Very small noise is
// introduced. The heaviest hitter should be reproduced very well (assuming
// second step of RAPPOR is "on" and working). This test is based on similar
// assumptions that ExactSolutionTest but the check is relaxed because of noise
// and multiple cohorts.
TEST_F(RapporAnalyzerTest, HighAccuracySolutionTest) {
static const uint32_t kNumCandidates = 10;
static const uint32_t kNumCohorts = 2;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 64;
// Small noise introduced.
prob_0_becomes_1_ = 0.1;
prob_1_stays_1_ = 0.9;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
std::vector<int> candidate_indices(10, 0);
candidate_indices.insert(candidate_indices.end(), 10, 1);
candidate_indices.insert(candidate_indices.end(), 10, 2);
candidate_indices.insert(candidate_indices.end(), 10, 3);
candidate_indices.insert(candidate_indices.end(), 10, 4);
candidate_indices.insert(candidate_indices.end(), 100, 9);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto status = analyzer_->Analyze(&results);
EXPECT_GE(results[9].count_estimate, 70);
EXPECT_LE(results[9].count_estimate, 120);
}
// Runs Analyzer on a moderately under-determined system (number of rows <
// number of columns) with more than one cohort. Small noise is introduced.
// There are a number of (equal) heavy hitters, which should be identified.
// TODO(bazyli): this test is heuristic.
TEST_F(RapporAnalyzerTest, KHeavyHittersTest) {
static const uint32_t kNumCandidates = 100;
static const uint32_t kNumCohorts = 4;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 16;
// Small noise introduced
prob_0_becomes_1_ = 0.1;
prob_1_stays_1_ = 0.9;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
std::vector<int> candidate_indices;
candidate_indices.insert(candidate_indices.end(), 100, 1);
candidate_indices.insert(candidate_indices.end(), 100, 2);
candidate_indices.insert(candidate_indices.end(), 100, 4);
candidate_indices.insert(candidate_indices.end(), 100, 8);
candidate_indices.insert(candidate_indices.end(), 100, 16);
candidate_indices.insert(candidate_indices.end(), 100, 32);
candidate_indices.insert(candidate_indices.end(), 100, 64);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto status = analyzer_->Analyze(&results);
// Check if the heavy hitters are identified
EXPECT_GE(results[1].count_estimate, 50);
EXPECT_LE(results[1].count_estimate, 150);
EXPECT_GE(results[2].count_estimate, 50);
EXPECT_LE(results[2].count_estimate, 150);
EXPECT_GE(results[4].count_estimate, 50);
EXPECT_LE(results[4].count_estimate, 150);
EXPECT_GE(results[8].count_estimate, 50);
EXPECT_LE(results[8].count_estimate, 150);
EXPECT_GE(results[16].count_estimate, 50);
EXPECT_LE(results[16].count_estimate, 150);
EXPECT_GE(results[32].count_estimate, 50);
EXPECT_LE(results[32].count_estimate, 150);
}
// Runs a small instance of RAPPOR with large noise.
// The standard errors of identified heavy hitters should be nonzero.
// Note(bazyli): this test is heuristic test.
TEST_F(RapporAnalyzerTest, StandardErrorTest) {
static const uint32_t kNumCandidates = 50;
static const uint32_t kNumCohorts = 2;
static const uint32_t kNumHashes = 2;
static const uint32_t kNumBloomBits = 32;
// Large noise introduced.
prob_0_becomes_1_ = 0.4;
prob_1_stays_1_ = 0.6;
SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
std::vector<int> candidate_indices(1000, 0);
candidate_indices.insert(candidate_indices.end(), 1000, 10);
candidate_indices.insert(candidate_indices.end(), 1000, 20);
candidate_indices.insert(candidate_indices.end(), 1000, 30);
candidate_indices.insert(candidate_indices.end(), 1000, 40);
AddObservationsForCandidates(candidate_indices);
std::vector<CandidateResult> results;
auto status = analyzer_->Analyze(&results);
EXPECT_GT(results[0].std_error, 0);
EXPECT_GT(results[10].std_error, 0);
EXPECT_GT(results[20].std_error, 0);
EXPECT_GT(results[30].std_error, 0);
EXPECT_GT(results[40].std_error, 0);
}
} // namespace rappor
} // namespace cobalt