Delete unused String RAPPOR code
Bug: 42098
Change-Id: I925bc1b49d183c70cc62b9e9c6c1598f594a32ee
diff --git a/src/algorithms/rappor/BUILD.gn b/src/algorithms/rappor/BUILD.gn
index 857795f..70cc4da 100644
--- a/src/algorithms/rappor/BUILD.gn
+++ b/src/algorithms/rappor/BUILD.gn
@@ -17,7 +17,6 @@
deps = [
"$cobalt_root/src:logging",
- "$cobalt_root/src/lib/crypto_util",
]
public_deps = [
@@ -48,20 +47,6 @@
configs += [ ":rappor_config" ]
}
-if (!is_fuchsia_tree) {
- source_set("lasso_runner") {
- sources = [
- "lasso_runner.cc",
- ]
- deps = [
- "$cobalt_root/src:logging",
- "$cobalt_root/src/lib/lossmin/minimizers",
- "//third_party/eigen",
- ]
- configs += [ ":rappor_config" ]
- }
-}
-
source_set("rappor_config_helper") {
sources = [
"rappor_config_helper.cc",
@@ -75,31 +60,6 @@
configs += [ ":rappor_config" ]
}
-if (!is_fuchsia_tree) {
- source_set("rappor_analyzer") {
- sources = [
- "basic_rappor_analyzer.cc",
- "bloom_bit_counter.cc",
- "rappor_analyzer.cc",
- "rappor_analyzer_utils.cc",
- ]
-
- deps = [
- ":lasso_runner",
- ":rappor_config_validator",
- ":rappor_encoder",
- "$cobalt_root/src:logging",
- "$cobalt_root/src/lib/lossmin/minimizers",
- "//third_party/eigen",
- "//third_party/grpc:grpc++",
- ]
- public_deps = [
- "$cobalt_root/src/lib/crypto_util",
- ]
- configs += [ ":rappor_config" ]
- }
-}
-
test("rappor_tests") {
sources = [
"rappor_config_helper_test.cc",
@@ -107,47 +67,6 @@
"rappor_test_utils.cc",
"rappor_test_utils_test.cc",
]
-
- if (!is_fuchsia_tree) {
- sources += [
- "basic_rappor_analyzer_test.cc",
- "bloom_bit_counter_test.cc",
- "rappor_analyzer_test.cc",
- "rappor_analyzer_unit_tests.cc",
- ]
- }
-
- deps = [
- ":rappor_config_helper",
- ":rappor_encoder",
- "$cobalt_root/src:logging",
- "//third_party/gflags",
- "//third_party/googletest:gtest",
- "//third_party/grpc:grpc++",
- ]
- if (!is_fuchsia_tree) {
- deps += [ ":rappor_analyzer" ]
- } else {
- deps += [ "//third_party/googletest:gtest_main" ]
- }
- configs += [ ":rappor_config" ]
-}
-
-test("rappor_manual_tests") {
- if (!is_fuchsia_tree) {
- output_dir = "$root_out_dir/tests/manual"
- }
-
- sources = [
- "rappor_test_utils.cc",
- "rappor_test_utils_test.cc",
- ]
- if (!is_fuchsia_tree) {
- sources += [
- "rappor_analyzer_manual_tests.cc",
- "rappor_analyzer_test.cc",
- ]
- }
deps = [
":rappor_config_helper",
":rappor_encoder",
@@ -157,36 +76,12 @@
"//third_party/googletest:gtest_main",
"//third_party/grpc:grpc++",
]
- if (!is_fuchsia_tree) {
- deps += [ ":rappor_analyzer" ]
- }
- configs += [ "$cobalt_root:cobalt_config" ]
-}
-
-if (!is_fuchsia_tree) {
- source_set("lasso_runner_tests") {
- testonly = true
- sources = [
- "lasso_runner_test.cc",
- "lasso_runner_unit_tests.cc",
- ]
- deps = [
- ":lasso_runner",
- "$cobalt_root/src:logging",
- "//third_party/googletest:gtest",
- ]
- configs += [ "$cobalt_root:cobalt_config" ]
- }
+ configs += [ ":rappor_config" ]
}
group("tests") {
testonly = true
deps = [
- ":rappor_manual_tests",
":rappor_tests",
]
-
- if (!is_fuchsia_tree) {
- deps += [ ":lasso_runner_tests" ]
- }
}
diff --git a/src/algorithms/rappor/bloom_bit_counter.cc b/src/algorithms/rappor/bloom_bit_counter.cc
deleted file mode 100644
index c1e8ff4..0000000
--- a/src/algorithms/rappor/bloom_bit_counter.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright 2017 The Fuchsia Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/algorithms/rappor/bloom_bit_counter.h"
-
-#include <cmath>
-#include <utility>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include "src/lib/util/log_based_metrics.h"
-
-namespace cobalt::rappor {
-
-// Stackdriver metric constants
-namespace {
-constexpr char kBloomBitCounterConstructorFailure[] = "bloom-bit-counter-constructor-failure";
-constexpr char kAddObservationFailure[] = "bloom-bin-counter-add-observation-failure";
-constexpr int kBitsPerByte = 8;
-} // namespace
-
-BloomBitCounter::BloomBitCounter(const RapporConfig& config)
- : config_(new RapporConfigValidator(config)), num_bloom_bytes_(0) {
- if (!config_->valid()) {
- LOG_STACKDRIVER_COUNT_METRIC(ERROR, kBloomBitCounterConstructorFailure)
- << "RapporConfig is invalid";
- return;
- }
- estimated_bloom_counts_.reserve(config_->num_cohorts());
- const auto num_bits = config_->num_bits();
- for (size_t cohort = 0; cohort < config_->num_cohorts(); cohort++) {
- estimated_bloom_counts_.emplace_back(cohort, num_bits);
- }
- num_bloom_bytes_ = (num_bits + kBitsPerByte - 1) / kBitsPerByte;
-}
-
-bool BloomBitCounter::AddObservation(const RapporObservation& obs) {
- if (!config_->valid()) {
- LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAddObservationFailure) << "RapporConfig is invalid";
- observation_errors_++;
- return false;
- }
- if (obs.data().size() != num_bloom_bytes_) {
- LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAddObservationFailure)
- << "RapporObservation has the wrong number of bytes: " << obs.data().size()
- << ". Expecting " << num_bloom_bytes_;
- observation_errors_++;
- return false;
- }
- auto cohort = obs.cohort();
- if (cohort >= config_->num_cohorts()) {
- LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAddObservationFailure)
- << "RapporObservation has an invalid cohort index: " << cohort
- << ". num_cohorts= " << config_->num_cohorts();
- observation_errors_++;
- return false;
- }
- // We have a good observation.
- num_observations_++;
- estimated_bloom_counts_[cohort].num_observations++;
-
- // We iterate through the bits of the observation "from right to left",
- // i.e. from the least-significant bit of the last byte to the
- // most-significant bit of the first byte. If the ith bit is set we increment
- // bit_sums[i] for the appropriate cohort.
- //
- // NOTE(rudominer) Possible performance optimizations: Consider using x86
- // vector operations or the find-first-bit-set instruction or simply checking
- // for zero bytes.
- std::vector<size_t>& bit_sums = estimated_bloom_counts_[cohort].bit_sums;
- size_t bit_index = 0;
- for (size_t byte_index = num_bloom_bytes_ - 1; byte_index >= 0; byte_index--) {
- uint8_t bit_mask = 1;
- for (int bit_in_byte_index = 0; bit_in_byte_index < kBitsPerByte; bit_in_byte_index++) {
- if (bit_index >= bit_sums.size()) {
- return true;
- }
- if (bit_mask & obs.data()[byte_index]) {
- bit_sums[bit_index]++;
- }
- bit_index++;
- bit_mask <<= 1;
- }
- }
- return true;
-}
-
-const std::vector<CohortCounts>& BloomBitCounter::EstimateCounts() {
- double q = config_->prob_1_stays_1();
- double p = config_->prob_0_becomes_1();
- double one_minus_q_plus_p = 1.0 - (q + p);
- double divisor = q - p; // divisor != 0 because we don't allow q == p.
- double abs_divisor = std::abs(divisor);
- // Compute the estimated counts and std_error for each cohort.
- for (auto& cohort_counts : estimated_bloom_counts_) {
- double N = cohort_counts.num_observations;
- double Npq = N * p * q;
- double correction = p * N;
- // Note(rudominer) When we support PRR then we need to modify the above
- // formulas as follows. Let f = prob_rr. Then let
- // p11 = q * (1 - f/2) + p * f / 2;
- // p01 = p * (1 - f/2) + q * f / 2;
- // correction = p01 * N;
- // divisor = p11 - p01; // (1 - f) * (q - p)
-
- std::vector<size_t>& bit_sums = cohort_counts.bit_sums;
- std::vector<double>& count_estimates = cohort_counts.count_estimates;
- std::vector<double>& std_errors = cohort_counts.std_errors;
- count_estimates.resize(bit_sums.size());
- std_errors.resize(bit_sums.size());
-
- // Compute the estimate count and std_error for each bloom bit for the
- // current cohort.
- for (size_t bit_index = 0; bit_index < bit_sums.size(); bit_index++) {
- double Y = bit_sums[bit_index];
- // See go/cobalt-basic-rappor-analysis for an explanation of the
- // formulas we use for count_estimate and std_error.
- count_estimates[bit_index] = (Y - correction) / divisor;
- std_errors[bit_index] = sqrt(Y * one_minus_q_plus_p + Npq) / abs_divisor;
- }
- }
-
- return estimated_bloom_counts_;
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/bloom_bit_counter.h b/src/algorithms/rappor/bloom_bit_counter.h
deleted file mode 100644
index f008390..0000000
--- a/src/algorithms/rappor/bloom_bit_counter.h
+++ /dev/null
@@ -1,125 +0,0 @@
-// Copyright 2017 The Fuchsia Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef COBALT_SRC_ALGORITHMS_RAPPOR_BLOOM_BIT_COUNTER_H_
-#define COBALT_SRC_ALGORITHMS_RAPPOR_BLOOM_BIT_COUNTER_H_
-
-#include <memory>
-#include <vector>
-
-#include "src/algorithms/rappor/rappor_config_validator.h"
-
-namespace cobalt {
-namespace rappor {
-
-// Forward declaration. See definition below.
-struct CohortCounts;
-
-// A BloomBitCounter is used for performing the first steps of a string RAPPOR
-// analysis: adding the raw counts for each bit of each cohort and computing
-// the estimated true counts and std errors for each bit.
-//
-// Usage:
-// - Construct a BloomBitCounter
-// - Invoke AddObservation() many times to add all of the observations.
-// - Invoke EstimateCounts() to retrieve the raw bit sums, estimated counts
-// and std errors for each bit position of each cohort.
-// - The accesors num_observations() and observation_errors() may be used to
-// discover the number of times AddObservation() was invoked successfully
-// and unsuccessfully.
-class BloomBitCounter {
- public:
- // Constructs a BloomBitCounter for the given config. All of the observations
- // added via AddObservation() must have been encoded using this config. If
- // the config is not valid then all calls to AddObservation() will return
- // false.
- explicit BloomBitCounter(const RapporConfig& config);
-
- // Adds an additional observation to be counted. The observation must have
- // been encoded using the RapporConfig passed to the constructor.
- //
- // Returns true to indicate the observation was added without error and
- // so num_observations() was incremented or false to indicate there was
- // an error and so observation_errors() was incremented.
- bool AddObservation(const RapporObservation& obs);
-
- // The number of times that AddObservation() was invoked minus the value
- // of observation_errors().
- [[nodiscard]] size_t num_observations() const { return num_observations_; }
-
- // The number of times that AddObservation() was invoked and the observation
- // was discarded due to an error. If this number is not zero it indicates
- // that the Analyzer received data that was not created by a legitimate
- // Cobalt client. See the error logs for details of the errors.
- [[nodiscard]] size_t observation_errors() const { return observation_errors_; }
-
- // Computes estimates for the number of times each bloom bit in each cohort
- // was set. The returned vector of CohortCounts will be in order of
- // cohort number from 0 to num_cohorts - 1.
- const std::vector<CohortCounts>& EstimateCounts();
-
- std::shared_ptr<RapporConfigValidator> config() { return config_; }
-
- private:
- friend class BloomBitCounterTest;
-
- std::shared_ptr<RapporConfigValidator> config_;
-
- size_t num_observations_ = 0;
- size_t observation_errors_ = 0;
-
- std::vector<CohortCounts> estimated_bloom_counts_;
-
- // The number of bytes needed to store the bloom bits in each observation.
- size_t num_bloom_bytes_;
-};
-
-// Stores the accumulated bit sums and the adjusted count estimates
-// for the bloom bits of a single cohort. A vector of CohortCounts is
-// returned from BloomBitCounter::EstimateCounts().
-struct CohortCounts {
- CohortCounts(uint32_t cohort_num, size_t num_bits)
- : cohort_num(cohort_num), bit_sums(num_bits, 0) {}
-
- // Which cohort is this?
- uint32_t cohort_num;
-
- // The number of valid observations seen for this cohort. These observations
- // are reflected in the counts in |bit_sums| and |count_estimates|.
- size_t num_observations = 0;
-
- // The raw sums for each bit position for this cohort. The sums are listed
- // in bit order "from right to left". That is, bit_sums[0] contains the sum
- // for the right-most bit, i.e. the least significant bit.
- std::vector<size_t> bit_sums;
-
- // The following two vectors are either empty to indicate that they
- // have not yet been computed, or else they have size equal to the size
- // of |bit_sums|. In the latter case the values are listed
- // in bit order "from right to left". That is, count_estimates[0] and
- // std_error[0] contain values for the right-most bit, i.e. the least
- // significant bit of the last byte of the Bloom filter.
-
- // The adjusted counts giving our estimate of the true pre-encoded count
- // for each bit.
- std::vector<double> count_estimates;
-
- // The standard errors corresponding to |count_estimates|.
- std::vector<double> std_errors;
-};
-
-} // namespace rappor
-} // namespace cobalt
-
-#endif // COBALT_SRC_ALGORITHMS_RAPPOR_BLOOM_BIT_COUNTER_H_
diff --git a/src/algorithms/rappor/bloom_bit_counter_test.cc b/src/algorithms/rappor/bloom_bit_counter_test.cc
deleted file mode 100644
index 07097f0..0000000
--- a/src/algorithms/rappor/bloom_bit_counter_test.cc
+++ /dev/null
@@ -1,394 +0,0 @@
-// Copyright 2016 The Fuchsia Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "src/algorithms/rappor/bloom_bit_counter.h"
-
-#include <algorithm>
-#include <cmath>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include <gflags/gflags.h>
-
-#include "src/algorithms/rappor/rappor_encoder.h"
-#include "src/algorithms/rappor/rappor_test_utils.h"
-#include "src/lib/crypto_util/random_test_utils.h"
-#include "src/logging.h"
-#include "third_party/googletest/googletest/include/gtest/gtest.h"
-
-namespace cobalt::rappor {
-
-namespace {
-// Makes a RapporConfig with the given data (and num_hashes=2).
-RapporConfig Config(uint32_t num_bloom_bits, uint32_t num_cohorts, float p, float q) {
- RapporConfig config;
- config.num_bloom_bits = num_bloom_bits;
- config.num_hashes = 2;
- config.num_cohorts = num_cohorts;
- config.prob_0_becomes_1 = p;
- config.prob_1_stays_1 = q;
- return config;
-}
-
-} // namespace
-
-class BloomBitCounterTest : public ::testing::Test {
- protected:
- // Sets the member variable bit_counter_ to a new BloomBitCounter configured
- // with the given arguments and the current values of prob_0_becomes_1_,
- // prob_1_stays_1_.
- void SetBitCounter(uint32_t num_bloom_bits, uint32_t num_cohorts) {
- bit_counter_ = std::make_unique<BloomBitCounter>(
- Config(num_bloom_bits, num_cohorts, prob_0_becomes_1_, prob_1_stays_1_));
- add_good_observation_call_count_ = 0;
- add_bad_observation_call_count_ = 0;
- }
-
- // Adds an observation to bit_counter_ described by |binary_string| for the
- // given cohort. Expects the operation to result in an error.
- void AddObservationExpectFalse(uint32_t cohort, const std::string &binary_string) {
- EXPECT_FALSE(bit_counter_->AddObservation(RapporObservationFromString(cohort, binary_string)));
- CheckState(add_good_observation_call_count_, ++add_bad_observation_call_count_);
- }
-
- // Adds an observation to bit_counter_ described by |binary_string| for the
- // given cohort. Expects the operation to succeed.
- void AddObservation(uint32_t cohort, const std::string &binary_string) {
- EXPECT_TRUE(bit_counter_->AddObservation(RapporObservationFromString(cohort, binary_string)));
- CheckState(++add_good_observation_call_count_, add_bad_observation_call_count_);
- }
-
- // Invokes AddObservation() many times.
- void AddObservations(uint32_t cohort, const std::string &binary_string, int num_times) {
- for (int count = 0; count < num_times; count++) {
- SCOPED_TRACE(std::string("count=") + std::to_string(count));
- AddObservation(cohort, binary_string);
- }
- }
-
- // Checks that bit_counter_ has the expected number observations and errors.
- void CheckState(size_t expected_num_observations, size_t expected_observation_errors) {
- EXPECT_EQ(expected_num_observations, bit_counter_->num_observations());
- EXPECT_EQ(expected_observation_errors, bit_counter_->observation_errors());
- }
-
- // Checks that bit_counter_ has the expected raw count for the given cohort
- // and bit index.
- void ExpectRawCount(uint32_t cohort, size_t index, size_t expected_count) {
- EXPECT_EQ(expected_count, bit_counter_->estimated_bloom_counts_[cohort].bit_sums[index]);
- }
-
- // Checks that bit_counter_ has the expected raw counts for the given cohort.
- void ExpectRawCounts(uint32_t cohort, const std::vector<size_t> &expected_counts) {
- EXPECT_EQ(expected_counts, bit_counter_->estimated_bloom_counts_[cohort].bit_sums);
- }
-
- // Invokes bit_countr_->EstimateCounts() and checks the count and std_error in
- // the given bit index for the given cohort.
- void EstimateCountsAndCheck(size_t cohort, size_t index, double expected_estimate,
- double expected_std_err) {
- auto estimated_counts = bit_counter_->EstimateCounts();
- ASSERT_GT(estimated_counts.size(), cohort)
- << "size=" << estimated_counts.size() << ", cohort=" << cohort;
- ASSERT_GT(estimated_counts[cohort].count_estimates.size(), index)
- << "size=" << estimated_counts[cohort].count_estimates.size() << ", index=" << index;
- ASSERT_GT(estimated_counts[cohort].std_errors.size(), index)
- << "size=" << estimated_counts[cohort].std_errors.size() << ", index=" << index;
- EXPECT_FLOAT_EQ(expected_estimate, estimated_counts[cohort].count_estimates[index]);
- EXPECT_FLOAT_EQ(expected_std_err, estimated_counts[cohort].std_errors[index]);
- }
-
- // Tests BloomBitCounter focusing on only a single bit at a time.
- //
- // We use 32 bits and 5 cohorts. Multiple times we single out one bit and one
- // cohort. Only that bit receives any non-zero observation data and only that
- // bit is validated.
- //
- // Uses the currently set values for prob_0_becomes_1_ and prob_1_stays_1_.
- // There will be |n| total observations with |y| 1's and |n-y| 0's.
- void OneBitTest(int n, int y, double expected_estimate, double expected_std_err) {
- // We pick five different bits out of the 32 bits to test.
- for (int bit_index : {0, 1, 8, 19, 31}) {
- // We pick five different cohorts of the 100 cohorts to test.
- for (int cohort : {0, 1, 47, 61, 99}) {
- SCOPED_TRACE(std::to_string(bit_index) + ", " + std::to_string(cohort) + ", " +
- std::to_string(n) + ", " + std::to_string(y));
- // Construct a BloomBitCounter with 32 bits and 100 cohorts.
- SetBitCounter(32, 100); // NOLINT
- // Add y observations with a 1 in position |bit_index|.
- // NOLINTNEXTLINE
- AddObservations(cohort, BuildBitPatternString(32, bit_index, '1', '0'), y);
- // Add n-y observations with a 0 in position |bit_index|.
- // NOLINTNEXTLINE
- AddObservations(cohort, BuildBitPatternString(32, bit_index, '0', '0'), n - y);
-
- // Analyze and check position |bit_index|
- EstimateCountsAndCheck(cohort, bit_index, expected_estimate, expected_std_err);
- }
- }
- }
-
- public:
- // By default this test uses p=0, q=1. Individual tests may override this.
- float prob_0_becomes_1_ = 0.0;
- float prob_1_stays_1_ = 1.0;
- std::unique_ptr<BloomBitCounter> bit_counter_;
- int add_bad_observation_call_count_ = 0;
- int add_good_observation_call_count_ = 0;
-};
-
-// Tests the raw counts when there are four bits and two cohorts
-TEST_F(BloomBitCounterTest, RawCounts4x2) {
- // Construct a bloom bit counter with four bits and 2 cohorts.
- SetBitCounter(4, 2);
-
- AddObservation(0, "00000000");
- ExpectRawCounts(0, {0, 0, 0, 0});
- ExpectRawCounts(1, {0, 0, 0, 0});
-
- AddObservation(1, "00000000");
- ExpectRawCounts(0, {0, 0, 0, 0});
- ExpectRawCounts(1, {0, 0, 0, 0});
-
- AddObservation(0, "00000001");
- ExpectRawCounts(0, {1, 0, 0, 0});
- ExpectRawCounts(1, {0, 0, 0, 0});
-
- AddObservation(0, "00000001");
- ExpectRawCounts(0, {2, 0, 0, 0});
- ExpectRawCounts(1, {0, 0, 0, 0});
-
- AddObservation(0, "00000010");
- ExpectRawCounts(0, {2, 1, 0, 0});
- ExpectRawCounts(1, {0, 0, 0, 0});
-
- AddObservation(0, "00000010");
- ExpectRawCounts(0, {2, 2, 0, 0});
- ExpectRawCounts(1, {0, 0, 0, 0});
-
- AddObservation(1, "00000010");
- ExpectRawCounts(0, {2, 2, 0, 0});
- ExpectRawCounts(1, {0, 1, 0, 0});
-
- AddObservation(1, "00000010");
- ExpectRawCounts(0, {2, 2, 0, 0});
- ExpectRawCounts(1, {0, 2, 0, 0});
-
- AddObservation(0, "00000011");
- ExpectRawCounts(0, {3, 3, 0, 0});
- ExpectRawCounts(1, {0, 2, 0, 0});
-
- AddObservation(0, "00000100");
- ExpectRawCounts(0, {3, 3, 1, 0});
- ExpectRawCounts(1, {0, 2, 0, 0});
-
- AddObservation(0, "00000101");
- ExpectRawCounts(0, {4, 3, 2, 0});
- ExpectRawCounts(1, {0, 2, 0, 0});
-
- AddObservation(0, "00000011");
- ExpectRawCounts(0, {5, 4, 2, 0}); // NOLINT
- ExpectRawCounts(1, {0, 2, 0, 0});
-
- AddObservation(1, "00001010");
- ExpectRawCounts(0, {5, 4, 2, 0}); // NOLINT
- ExpectRawCounts(1, {0, 3, 0, 1});
-
- AddObservation(1, "00001010");
- ExpectRawCounts(0, {5, 4, 2, 0}); // NOLINT
- ExpectRawCounts(1, {0, 4, 0, 2});
-
- for (int i = 0; i < 1000; i++) { // NOLINT
- AddObservation(0, "00001100");
- AddObservation(1, "00000110");
- }
- ExpectRawCounts(0, {5, 4, 1002, 1000}); // NOLINT
- ExpectRawCounts(1, {0, 1004, 1000, 2}); // NOLINT
-
- // The extra high-order-bits should be ignored
- AddObservation(0, "11110000");
- AddObservation(1, "11110000");
- ExpectRawCounts(0, {5, 4, 1002, 1000}); // NOLINT
- ExpectRawCounts(1, {0, 1004, 1000, 2}); // NOLINT
-}
-
-// Tests the raw counts when there are 1024 bits and 100 cohorts
-TEST_F(BloomBitCounterTest, RawCounts1024x100) {
- // Construct a bloom bit counter with 1024 bits and 100 cohorts.
- SetBitCounter(1024, 100); // NOLINT
- // Iterate 100 times
- for (int iteration = 0; iteration < 100; iteration++) { // NOLINT
- // For i = 0, 10, 20, 30, .....
- for (int bit_index = 0; bit_index < 1024; bit_index += 10) { // NOLINT
- // Add observations with bit i alone set for cohorts 0, 51, 97.
- // NOLINTNEXTLINE
- AddObservation(0, BuildBitPatternString(1024, bit_index, '1', '0'));
- // NOLINTNEXTLINE
- AddObservation(51, BuildBitPatternString(1024, bit_index, '1', '0'));
- // NOLINTNEXTLINE
- AddObservation(97, BuildBitPatternString(1024, bit_index, '1', '0'));
- }
- }
-
- // Check the counts.
- for (int bit_index = 0; bit_index < 1024; bit_index++) { // NOLINT
- size_t expected_count = (bit_index % 10 == 0 ? 100 : 0); // NOLINT
- ExpectRawCount(0, bit_index, expected_count);
- ExpectRawCount(1, bit_index, 0);
- ExpectRawCount(51, bit_index, expected_count); // NOLINT
- ExpectRawCount(52, bit_index, 0); // NOLINT
- ExpectRawCount(97, bit_index, expected_count); // NOLINT
- ExpectRawCount(98, bit_index, 0); // NOLINT
- }
-}
-
-// Tests that AddObservation() returns false when an invalid config is
-// provided to the constructor.
-TEST_F(BloomBitCounterTest, InvalidConfig) {
- // Set prob_0_becomes_1 to an invalid value.
- prob_0_becomes_1_ = 1.1; // NOLINT
-
- // Construct a bloom bit counter with 8 bits and 2 cohorts.
- SetBitCounter(8, 2); // NOLINT
-
- AddObservationExpectFalse(0, "00000000");
- AddObservationExpectFalse(1, "00000000");
-}
-
-// Tests that AddObservation() returns false when an invalid observation
-// is added.
-TEST_F(BloomBitCounterTest, InvalidObservations) {
- // Construct a bloom bit counter with 8 bits and 2 cohorts.
- SetBitCounter(8, 2); // NOLINT
-
- // Attempt to add observations with 2 bytes instead of one.
- AddObservationExpectFalse(0, "0000000000000000");
- AddObservationExpectFalse(1, "0000000000000000");
- AddObservationExpectFalse(0, "0000000100000000");
- AddObservationExpectFalse(1, "0000000100000000");
-
- // Successfully add observations with one bytes.
- AddObservation(0, "00000001");
- AddObservation(1, "00000001");
-
- // Attempt to add an observation for cohort 3.
- AddObservationExpectFalse(3, "00000001");
-}
-
-// Invokes OneBitTest on various y using n=100, p=0, q=1
-TEST_F(BloomBitCounterTest, OneBitTestN100P0Q1) {
- const int n = 100;
- const double expected_std_err = 0;
-
- // Test with various values of y. expected_estimate = y.
- for (int y : {0, 1, 34, 49, 50, 51, 71, 99, 100}) {
- SCOPED_TRACE(std::to_string(y));
- OneBitTest(n, y, y, expected_std_err);
- }
-}
-
-// Invokes OneBitTest on various y using n=100, p=0.2, q=0.8
-TEST_F(BloomBitCounterTest, OneBitTestN100P02Q08) {
- prob_0_becomes_1_ = 0.2; // NOLINT
- prob_1_stays_1_ = 0.8; // NOLINT
- int n = 100; // NOLINT
-
- // This is the formula for computing expected_estimate when n=100, p=0.2,
- // q=0.8.
- // NOLINTNEXTLINE
- auto estimator = [](double y) { return (y - 20.0) * 5.0 / 3.0; };
- // This is the expected standard error for n=100, p=0.2, q=0.8, independent
- // of y.
- // NOLINTNEXTLINE
- double expected_std_err = 20.0 / 3.0;
-
- // Test with various values of y.
- for (int y : {0, 1, 34, 49, 50, 51, 71, 99, 100}) {
- SCOPED_TRACE(std::to_string(y));
- OneBitTest(n, y, estimator(y), expected_std_err);
- }
-}
-
-// Invokes OneBitTest on various y using n=1000, p=0.15, q=0.85
-TEST_F(BloomBitCounterTest, OneBitTestN1000P015Q085) {
- prob_0_becomes_1_ = 0.15; // NOLINT
- prob_1_stays_1_ = 0.85; // NOLINT
- int n = 1000; // NOLINT
-
- // This is the formula for computing expected_estimate when n=1000, p=0.15,
- // q=0.85.
- // NOLINTNEXTLINE
- auto estimator = [](double y) { return (y - 150.0) * 10.0 / 7.0; };
- // This is the expected standard error for n=1000, p=0.15, q=0.85,
- // independent
- // of y.
- // NOLINTNEXTLINE
- double expected_std_err = sqrt(127.5) * 10.0 / 7.0;
-
- // Test with various values of y.
- for (int y : {0, 1, 71, 333, 444, 555, 666, 777, 888, 999, 1000}) {
- SCOPED_TRACE(std::to_string(y));
- OneBitTest(n, y, estimator(y), expected_std_err);
- }
-}
-
-// Invokes OneBitTest on various y using n=5000, p=0.5, q=0.9. Notice that
-// p + q > 1
-TEST_F(BloomBitCounterTest, OneBitTestN5000P05Q09) {
- prob_0_becomes_1_ = 0.5; // NOLINT
- prob_1_stays_1_ = 0.9; // NOLINT
- const int n = 5000;
-
- // This is the formula for computing expected_estimate when n=5000, p=0.5,
- // q=0.9.
- // NOLINTNEXTLINE
- auto estimator = [](double y) { return (y - 2500.0) * 5.0 / 2.0; };
-
- // This is the formula for computing expected_std_err when n=5000, p=0.5,
- // q=0.9.
- // NOLINTNEXTLINE
- auto std_err = [](double y) { return sqrt(y * -0.4 + 2250.0) * 5.0 / 2.0; };
-
- // Test with various values of y.
- for (int y : {0, 1, 49, 222, 1333, 2444, 3555, 4999, 5000}) {
- SCOPED_TRACE(std::to_string(y));
- OneBitTest(n, y, estimator(y), std_err(y));
- }
-}
-
-// Invokes OneBitTest on various y using n=5000, p=0.05, q=0.5. Notice that
-// p + q < 1
-TEST_F(BloomBitCounterTest, OneBitTestN5000P005Q05) {
- prob_0_becomes_1_ = 0.05; // NOLINT
- prob_1_stays_1_ = 0.5; // NOLINT
- const int n = 5000;
-
- // This is the formula for computing expected_estimate when n=5000, p=0.05,
- // q=0.5.
- // NOLINTNEXTLINE
- auto estimator = [](double y) { return (y - 250.0) / 0.45; };
-
- // This is the formula for computing expected_std_err when n=5000, p=0.05,
- // q=0.5.
- // NOLINTNEXTLINE
- auto std_err = [](double y) { return sqrt(y * 0.45 + 125.0) / 0.45; };
-
- // Test with various values of y.
- for (int y : {0, 1, 49, 222, 1333, 2444, 3555, 4999, 5000}) {
- SCOPED_TRACE(std::to_string(y));
- OneBitTest(n, y, estimator(y), std_err(y));
- }
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/lasso_runner.cc b/src/algorithms/rappor/lasso_runner.cc
deleted file mode 100644
index 4a68fb7..0000000
--- a/src/algorithms/rappor/lasso_runner.cc
+++ /dev/null
@@ -1,410 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/rappor/lasso_runner.h"
-
-#include <algorithm>
-
-#include <glog/logging.h>
-
-using cobalt_lossmin::GradientEvaluator;
-using cobalt_lossmin::InstanceSet;
-using cobalt_lossmin::LabelSet;
-using cobalt_lossmin::ParallelBoostingWithMomentum;
-using cobalt_lossmin::Weights;
-
-namespace cobalt::rappor {
-
-namespace {
-// ***************************************************************************
-// Constants used by the cobalt_lossmin minimizers in both steps of RAPPOR
-// (lasso_runner.RunFirstRapporStep and lasso_runner.GetExactValuesAndStdErrs).
-// They are meant to be generic so modify them with caution.
-//
-// kZeroThreshold is the (relative) proportion below which we assume a
-// candidate count to be effectively zero; that is, we assume a candidate to
-// be zero if its estimated count is below kZeroThreshold of the
-// bit_counter_.num_observations(). It cannot be exactly zero for
-// performance reasons (also, exact zero makes little if any sense
-// numerically). Also, smaller values might be more difficult for individual
-// runs of the minimizer.
-// You can change kZeroThreshold to the value you think is best (smaller
-// than what you would deem negligible); however, do not get close to double
-// precision accuracy (1e-16). A reasonable value could be between 1e-4 and
-// 1e-8.
-constexpr double kZeroThreshold = 1e-6;
-// kL2toL1Ratio is the ratio between l1 and l2 penalty.
-// Although pure lasso (or linear regression) does not include any l2 penalty,
-// a tiny bit can improve stability. A value of kL2toL1Ratio less or equal to
-// 1e-2 should not affect the interpretation of the solution.
-constexpr double kL2toL1Ratio = 1e-3;
-// kLossEpochs denotes how often the current objective value is recorded in
-// the solver (it will be recorded every kLossEpochs epochs).
-// kConvergenceMeasures denotes how often the minimizer checks convergence:
-// the algorithm will check convergence
-// every kConvergenceEpochs epochs and terminate if at least
-// one of the following is true.
-// 1. The minimizer reached the solution up to the accuracy defined by
-// minimizer.covergence_threshold(). In this case,
-// minimizer.reached_solution() == true and minimizer.converged() == true.
-// 2. The algorithm stopped improving as measured by the
-// simple convergence check of the minimizer.
-// In this case, minimizer.converged() == true.
-// kLossEpochs and kConvergenceEpochs must be positive (and
-// probably both not larger than 10). Also, kLossEpochs <=
-// kConvergenceEpochs makes more sense.
-constexpr int kLossEpochs = 5;
-constexpr int kConvergenceMeasures = 5;
-// kAlpha is a constant from parallel boosting with momentum paper,
-// must be 0 < kAlpha < 1; cobalt_lossmin library default initial choice is 0.5,
-// but we need to be able to reset this value in minimizer if needed.
-constexpr double kAlpha = 0.5;
-// kMinConvergenceThreshold is an absolute lower bound for the convergence
-// thresholds in the algorithm; It should be something within a couple of
-// orders of magnitude of the double precision (reasonable values may be
-// between 1e-12 and 1e-14). This is introduced because convergence thresholds
-// are computed relatively to the initial gradient norm and so they might be
-// too low if the initial guess is very good.
-constexpr double kMinConvergenceThreshold = 1e-12;
-//
-// ***************************************************************************
-// Constants of the lasso_runner that will be used in RunFirstLassoStep.
-//
-// Note(bazyli) about modifying constants:
-// 1. To improve the accuracy of the solution you may want to
-// set the convergence thresholds: kRelativeConvergenceThreshold,
-// kRelativeInLassoPathConvergenceThreshold, and kSimpleConvergenceThreshold
-// to smaller values, at the expense of more computations (epochs) performed.
-// See their descriptions below.
-// 2. You may want to modify kMaxEpochs if it is too strict (which may be the
-// case if you set very small convergence thresholds), but kNumLassoSteps and
-// kL1maxToL1minRatio should be quite generic so modify them with caution.
-// 3. You may test whether linear path (kUseLinearPath == true) or
-// (kUseLinearPath == false) is better for your case. The run times and result
-// quality may slightly differ (see below).
-//
-// Define minimizer and lasso-path-specific parameters.
-// kRelativeConvergenceThreshold is the improvement that we expect to achieve
-// at convergence, relative to initial gradient. That is, if ||g|| is the
-// initial 2-norm norm of the gradient, we expect the final (i.e. after the
-// last lasso problem) total measure of KKT violation to be equal to
-// kRelativeConvergenceThreshold * ||g||. Value smaller than 1e-8 gives a full
-// (single precision) convergence and is probably overshooting because the
-// purpose of the first step is mostly to identify potential nonzero
-// coefficients. Something between 1e-5 and 1e-7 should be enough.
-// Even larger value can be chosen for performance reasons. This accuracy will
-// be used only in the final lasso subproblem.
-// kRelativeInLassoPathConvergenceThreshold has the same interpretation but
-// will be used inside the lasso path (before the last subproblem). We
-// probably want to set this to be slightly (e.g. ten times) less restrictive
-// (larger) than kRelativeConvergenceThreshold for efficiency because the
-// solutions inside the lasso path serve as a "warm-up" before the last
-// subproblem; this subproblem, on the other hand, can be more accurate and
-// should benefit more from "momentum" computations in the algorithm.
-// kSimpleConvergenceThreshold == d means that the algorithm will stop if the
-// best relative improvement between two consecutive measures of the objective
-// in the last kConvergenceMeasures measures, is below d. In other words,
-// relative improvements less than d will cause the minimizer to return.
-// See the description of kConvergenceEpochs and
-// kLossEpochs above. A rule-of-thumb value for kSimpleConvergenceThreshold
-// can be some small fraction of the inverse of kMaxEpochs below.
-//
-// Note: The actual absolute values of convergence
-// thresholds used in the algorithm are capped by
-// loss_minimizer.min_convergence_threshold_ in case ||g|| is almost zero in the
-// first place (see below). Note: If you are bumping into "The lasso path did
-// not reach the last subproblem" or "The lasso path did not reach the last
-// subproblem." in Analyze, it might mean that the convergence thresholds are
-// too strict for the specified kMaxEpochs limit, given the difficulty of the
-// problem.
-constexpr double kRelativeConvergenceThreshold = 1e-5;
-constexpr double kRelativeInLassoPathConvergenceThreshold = 1e-4;
-constexpr double kSimpleConvergenceThreshold = 1e-5;
-// kMaxEpochs denotes the limit on the total number of epochs (iterations)
-// run; the actual number can be up to two times larger
-// because every individual lasso subproblem (run of
-// minimizer) has the same bound and the total epochs count is updated after
-// the subproblem is run.
-// Note: If you are bumping into "The lasso path did not reach the
-// last subproblem." error in Analyze, it is possible that this number is too
-// strict (low) for the convergence thresholds above.
-constexpr int kMaxEpochs = 20000;
-// kNumLassoSteps is the number of subproblems solved in the lasso path;
-// it is not true that more steps will take more time: there should be a "sweet
-// spot", and definitely this number should not be too small (probably something
-// between 50 and 500).
-constexpr int kNumLassoSteps = 100;
-// kL1maxToL1minRatio is the ratio between the first and the last l1 penalty
-// value in the lasso path; should probably be something between 1e-6 and
-// 1e-3 (the R library glmnet uses 1e-3 by default).
-constexpr double kL1maxToL1minRatio = 1e-3;
-// kUseLinearPath specifies whether the lasso path should be linear (true) or
-// logarithmic. Linear path means that the l1 penalties form an arithmetic
-// sequence from largest to smallest; logarithmic path means that the penalties
-// form a geometric series. Note(bazyli): linear path is more conservative in
-// decreasing the penalties in the initial part of the lasso path. Since we are
-// interested in the heavy hitters, we may want to prefer to add them slowly in
-// this phase. Logarithmic path, on the other hand, decreases the penalty fast
-// in the initial phase but more slowly towards the end of the lasso path. This
-// is numerically more stable and may be faster but may introduce a lot of
-// nonzero coefficients in the initial phase.
-constexpr bool kUseLinearPath = true;
-//
-// ***************************************************************************
-// Constants related to GetExactValuesAndStdErrors
-//
-// The convergence thresholds and
-// kMaxEpochsSingleRun have the same definitions as the corresponding
-// constants in RunFirstRapporStep. However, we may want the convergence
-// thresholds to be more strict, because here we are interested in more exact
-// values, and the problem to solve should be easier.
-//
-// Initialize the minimizer constants.
-// Relative convergence thresholds have the same interpretation as the ones
-// used in RunFirstRapporStep. Consult their description if you plan to modify
-// the values.
-constexpr double kRelativeConvergenceThreshold2Step = 1e-6;
-constexpr double kSimpleConvergenceThreshold2Step = 1e-6;
-// kNumRuns is the number of runs to estimate standard deviations of
-// coefficients. The problem will be solved kNumRuns times, each time
-// with a slightly different right hand side.
-// This number probably should not be smaller than 5 but not too large:
-// large value should give a better approximation of standard errors but
-// kNumRuns == n means that the whole problem will be solved n times so it
-// affects the runtime.
-constexpr int kNumRuns = 20;
-// kMaxEpochsSingleRun is the maximum number of epochs in a single
-// run of the algorithm. So total number of epochs run will be bounded by
-// kNumRuns * kMaxEpochsSingleRun.
-constexpr int kMaxEpochsSingleRun = 5000;
-// Note(bazyli): If convergence thresholds are too small relative to
-// kMaxEpochsSingleRun then some of the runs of the algorithm may not
-// converge. If less than 5 converge then the standard errors are set to 0. If
-// none converges, then the exact_est_candidate_weights is set to be the same
-// as est_candidate_weights. This is a safeguard solution and if it happens,
-// we may want to adjust the constants so that it doesn't happen again (e.g.
-// increase kMaxEpochsSingleRun).
-} // namespace
-
-LassoRunner::LassoRunner(const InstanceSet* matrix) : matrix_(matrix) {
- zero_threshold_ = kZeroThreshold;
- l2_to_l1_ratio_ = kL2toL1Ratio;
- alpha_ = kAlpha;
- min_convergence_threshold_ = kMinConvergenceThreshold;
- num_lasso_steps_ = kNumLassoSteps;
- l1_max_to_l1_min_ratio_ = kL1maxToL1minRatio;
- use_linear_path_ = kUseLinearPath;
-}
-
-void LassoRunner::RunFirstRapporStep(const int max_nonzero_coeffs, const double max_solution_1_norm,
- const LabelSet& as_label_set, Weights* est_candidate_weights,
- std::vector<int>* second_step_cols) {
- // Construct the lossmin minimizer objects.
- // TODO(bazyli): remove this copy
- InstanceSet candidate_matrix = *matrix_;
- GradientEvaluator grad_eval(candidate_matrix, as_label_set);
- ParallelBoostingWithMomentum minimizer(0.0, 0.0, grad_eval); // penalties will be set later.
-
- // Initialize the solution vector to zero vector for the lasso path and
- // compute the initial gradient (this will be used to get initial l1 penalty
- // and convergence thresholds).
- const int num_candidates = candidate_matrix.cols();
- Weights initial_gradient = Weights::Zero(num_candidates);
- grad_eval.SparseGradient(*est_candidate_weights, &initial_gradient);
-
- // Set the minimizer absolute convergence constants.
- const double initial_mean_gradient_norm = initial_gradient.norm() / num_candidates;
- const double kConvergenceThreshold = std::max(
- min_convergence_threshold_, kRelativeConvergenceThreshold * initial_mean_gradient_norm);
- const double kInLassoPathConvergenceThreshold =
- std::max(min_convergence_threshold_,
- kRelativeInLassoPathConvergenceThreshold * initial_mean_gradient_norm);
-
- // Set the constants for lasso path computations.
- const double l1max = initial_gradient.array().abs().maxCoeff();
- const double l1min = l1_max_to_l1_min_ratio_ * l1max;
- const double l2 = l2_to_l1_ratio_ * l1min;
- const double l1delta = (l1max - l1min) / num_lasso_steps_;
- const double l1mult = std::exp(std::log(l1_max_to_l1_min_ratio_) / num_lasso_steps_);
-
- // Set up the minimizer.
- minimizer.set_zero_threshold(zero_threshold_);
- minimizer.set_convergence_threshold(kInLassoPathConvergenceThreshold);
- minimizer.set_simple_convergence_threshold(kSimpleConvergenceThreshold);
- minimizer.set_l2(l2);
- minimizer.compute_and_set_learning_rates(); // learning rates must be
- // re-computed when l2 penalty
- // changes.
- VLOG(4) << "Lasso in-path convergence threshold ==" << kInLassoPathConvergenceThreshold;
- VLOG(4) << "Lasso final convergence threshold ==" << kConvergenceThreshold;
-
- // Initialize variables to track the lasso path computations.
- std::vector<double> loss_history;
- double solution_1_norm = 0;
- int total_epochs_run = 0;
- int how_many_nonzero_coeffs = 0;
-
- // Perform lasso path computations.
- int i = 0;
- double l1_this_step = use_linear_path_ ? l1max - l1delta : l1max * l1mult;
-
- for (; i < num_lasso_steps_ && total_epochs_run < kMaxEpochs; i++) {
- VLOG(4) << "Minimizing " << i << "-th Lasso subproblem";
-
- if (how_many_nonzero_coeffs >= max_nonzero_coeffs || i == num_lasso_steps_ - 1 ||
- solution_1_norm >= max_solution_1_norm) {
- // Enter the final lasso subproblem.
- minimizer.set_convergence_threshold(kConvergenceThreshold);
- if (i < num_lasso_steps_ - 1) {
- // If stopping criteria are met before reaching maximum number of steps,
- // use l1 from previous run.
- l1_this_step = use_linear_path_ ? l1_this_step + l1delta : l1_this_step / l1mult;
- i = num_lasso_steps_ - 1;
- }
- VLOG(4) << "Entered last Run";
- }
-
- minimizer.set_l1(std::max(l1min, l1_this_step));
- minimizer.set_reached_solution(false);
- minimizer.set_converged(false);
-
- // Set minimizer input as in the parallel boosting with momentum paper.
- minimizer.set_phi_center(*est_candidate_weights);
- minimizer.set_alpha(alpha_);
- minimizer.set_beta(1 - alpha_);
-
- VLOG(4) << "The l1 penalty used == " << l1_this_step;
-
- minimizer.Run(kMaxEpochs, kLossEpochs, kConvergenceMeasures, est_candidate_weights,
- &loss_history);
-
- // Compute the 1-norm of the current solution and the number of nonzero
- // coefficients.
- solution_1_norm = est_candidate_weights->lpNorm<1>();
- how_many_nonzero_coeffs = (est_candidate_weights->array() > zero_threshold_).count();
-
- VLOG(4) << "Ran " << minimizer.num_epochs_run() << " epochs in this step.";
- VLOG(4) << "Num of nonzero coefficients found: " << how_many_nonzero_coeffs;
- VLOG(4) << "Solution 1-norm == " << solution_1_norm;
- total_epochs_run += minimizer.num_epochs_run();
-
- l1_this_step = use_linear_path_ ? l1_this_step - l1delta : l1_this_step * l1mult;
- }
-
- VLOG(4) << "Ran " << total_epochs_run << " epochs in total.";
-
- // Record minimizer info.
- minimizer_data_.reached_last_lasso_subproblem = (i == num_lasso_steps_);
- minimizer_data_.num_epochs_run = total_epochs_run;
- minimizer_data_.converged = minimizer.converged();
- minimizer_data_.reached_solution = minimizer.reached_solution();
- minimizer_data_.l1 = minimizer.l1();
- minimizer_data_.l2 = minimizer.l2();
- minimizer_data_.convergence_threshold = kConvergenceThreshold;
- minimizer_data_.zero_threshold = kZeroThreshold;
-
- // Prepare the vector of columns for the second step of RAPPOR.
- second_step_cols->clear();
- second_step_cols->reserve(how_many_nonzero_coeffs);
- for (int i = 0; i < est_candidate_weights->size(); i++) {
- if (est_candidate_weights->coeff(i) > zero_threshold_) {
- second_step_cols->push_back(i);
- }
- }
-}
-
-void LassoRunner::GetExactValuesAndStdErrs(const double l1, const Weights& est_candidate_weights,
- const std::vector<double>& est_standard_errs,
- const InstanceSet& instances,
- const LabelSet& as_label_set,
- Weights* exact_est_candidate_weights,
- Weights* est_candidate_errors) {
- // Note(bazyli): If convergence thresholds are too small relative to
- // kMaxEpochsSingleRun then some of the runs of the algorithm may not
- // converge. If less than 5 converge then the standard errors are set to 0. If
- // none converges, then the exact_est_candidate_weights is set to be the same
- // as est_candidate_weights. This is a safeguard solution and if it happens,
- // we may want to adjust the constants so that it doesn't happen again (e.g.
- // increase kMaxEpochsSingleRun).
- const double l2 = l2_to_l1_ratio_ * l1;
- const int num_candidates = est_candidate_weights.size();
- const int num_labels = as_label_set.size();
- const int min_num_converged_for_standard_errors = 5;
- int num_converged = 0; // We record the number of runs in which the minimizer
- // actually converged (although if everything is fine,
- // i.e. all constants have appropriate values for the given case, this
- // should be equal to num_runs at completion).
-
- // We will need the solutions from all runs to compute the mean solution and
- // standard errors.
- std::vector<Weights> est_weights_runs; // vector for storing solutions from different runs
- Weights mean_est_weights = Weights::Zero(num_candidates);
-
- // In each run create a new_label_set by adding Gaussian noise to the original
- // as_label_set.
- for (int i = 0; i < kNumRuns; i++) {
- LabelSet new_label_set = as_label_set;
-
- for (int j = 0; j < num_labels; j++) {
- std::normal_distribution<double> nrm_distr(0, static_cast<double>(est_standard_errs[j]));
- double noise = nrm_distr(random_dev_);
- new_label_set(j) += noise;
- }
-
- // We will run the minimizer for the right hand side with noise
- // (new_label_set). We always use the est_candidate_weights as the initial
- // guess.
-
- // Construct the minimizer and compute initial gradient.
- Weights new_candidate_weights = est_candidate_weights;
- std::vector<double> loss_history_not_used;
- GradientEvaluator grad_eval(instances, new_label_set);
- ParallelBoostingWithMomentum minimizer(l1, l2, grad_eval);
- Weights initial_gradient = Weights(num_candidates);
- grad_eval.SparseGradient(new_candidate_weights, &initial_gradient);
- double initial_mean_gradient_norm = initial_gradient.norm() / num_candidates;
- double convergence_threshold =
- std::max(min_convergence_threshold_,
- kRelativeConvergenceThreshold2Step * initial_mean_gradient_norm);
-
- // Set up and run the minimizer.
- minimizer.set_converged(false);
- minimizer.set_reached_solution(false);
- minimizer.set_phi_center(new_candidate_weights);
- minimizer.set_convergence_threshold(convergence_threshold);
- minimizer.set_zero_threshold(zero_threshold_);
- minimizer.set_simple_convergence_threshold(kSimpleConvergenceThreshold2Step);
- minimizer.set_alpha(alpha_);
- minimizer.set_beta(1 - alpha_);
- minimizer.Run(kMaxEpochsSingleRun, kLossEpochs, kConvergenceMeasures, &new_candidate_weights,
- &loss_history_not_used);
-
- if (minimizer.converged()) {
- // Update the mean and store the current solution.
- mean_est_weights += new_candidate_weights;
- est_weights_runs.push_back(new_candidate_weights);
- num_converged++;
- }
- }
-
- if (num_converged > 0) {
- mean_est_weights /= num_converged;
- } else {
- mean_est_weights = est_candidate_weights;
- }
-
- // Compute the sample means and standard deviations (standard errors).
- Weights sample_stds = Weights::Zero(num_candidates);
- if (num_converged >= min_num_converged_for_standard_errors) {
- for (auto& est_weight : est_weights_runs) {
- sample_stds += (est_weight - mean_est_weights).array().pow(2).matrix();
- }
- sample_stds = sample_stds / (num_converged - 1);
- sample_stds = sample_stds.array().sqrt();
- }
- *est_candidate_errors = sample_stds;
- *exact_est_candidate_weights = mean_est_weights;
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/lasso_runner.h b/src/algorithms/rappor/lasso_runner.h
deleted file mode 100644
index 7324116..0000000
--- a/src/algorithms/rappor/lasso_runner.h
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef COBALT_SRC_ALGORITHMS_RAPPOR_LASSO_RUNNER_H_
-#define COBALT_SRC_ALGORITHMS_RAPPOR_LASSO_RUNNER_H_
-
-#include <math.h>
-
-#include <vector>
-
-#include "src/lib/lossmin/eigen-types.h"
-#include "src/lib/lossmin/minimizers/gradient-evaluator.h"
-#include "src/lib/lossmin/minimizers/parallel-boosting-with-momentum.h"
-
-#include "third_party/eigen/Eigen/SparseCore"
-
-namespace cobalt {
-namespace rappor {
-
-// MinimizerData is a struct for storing the info about cobalt_lossmin
-// minimizer.
-struct MinimizerData {
- bool converged;
-
- bool reached_solution;
-
- bool reached_last_lasso_subproblem;
-
- int num_epochs_run;
-
- double l1;
-
- double l2;
-
- double zero_threshold;
-
- double convergence_threshold;
-};
-
-// LassoRunner is a class for running the optimizations in both steps of RAPPOR
-// algorithm. These optimizations involve repeatedly solving problems of the
-// general form:
-//
-// min_x || A * x - b ||_2^2 + l1 || x ||_1 + l2 * || x ||_2^2,
-//
-// with variable x, where A is an m x n matrix, and l1, l2 >= 0.
-//
-// (1) RunFirstRapporStep computes the lasso path to identify nonzero
-// coefficients of x in the equation A * x = b where b is given approximately
-// and A can be such that m < n (see exact description below).
-//
-// (2) GetExactValuesAndStdErrs solves a single lasso problem a number of times,
-// each time introducing noise to the right hand side vector b in order to
-// estimate standard errors of the coefficients (see below).
-//
-// Both functions use the cobalt_lossmin::ParallelBoostingWithMomentum solver to
-// perform the optimizations.
-class LassoRunner {
- public:
- explicit LassoRunner(const cobalt_lossmin::InstanceSet* matrix);
-
- // Runs the first step of RAPPOR. That is, runs the lasso path, i.e.
- // computes the solutions to a sequence of lasso subproblems:
- // min 1/(2N) || A * x - y ||_2^2 + l1_i * || x ||_1 + 1/2 * l2 * || x ||_2^2,
- // with variable x, and decreasing values of l1 penalty:
- // l1_1 > l1_2 > l2_3 > ... > l1_n. The l2 penalty is meant to be
- // insignificant and is introduced for stability reasons (so l2 << l1_n).
- // A == candidate_matrix_ and y == |as_label_set|, N is
- // equal to the number of rows of A.
- //
- // The solution to each problem gives a "warm start" for the next one.
- // Initially, x == 0, and l1_1 is the smallest value such that x == 0 is the
- // solution to the first subproblem. The value of n and l1_n / l1_1 ratio are
- // defined inside the function,
- //
- // The lasso path can either be linear or logarithmic. This is specified
- // in the implementation. Linear path means that the difference between the
- // lasso penalties of each two consecutive subproblems is constant (so l1_i -
- // l1_(i+1) is the same for all i). The logarithmic path is a possible
- // alternative to the linear path (it is used by the R glmnet library); in it,
- // the l1_i form a geometric rather than arithmetic sequence, with
- // l1_(i+1)/l1_1 being constant and smaller than 1.0.
- //
- // If i is encountered such that the solution x of the i-th problem satisfies:
- // || x ||_1 >= |max_solution_1_norm| or || x ||_0 >= |max_nonzero_coeffs|, or
- // if i == n, then the lasso problem with l1_i is solved with a different
- // (possibly better) accuracy (set inside the function), and
- // the solution itself is written at |est_candidate_weights|, which should
- // be initialized to zero. The indices corresponding to positive entries of
- // the solution are stored in |second_step_cols|.
- //
- // Computing a lasso path (though not necessarily
- // linear) is the standard way to perform lasso computations and has a number
- // of advantages cited in literature. It is more numerically stable and more
- // efficient than computing just the last problem; it gives the entire path of
- // solutions and therefore can be used to choose the most meaningful value of
- // penalty (in our case we do not really know it a priori). The number of
- // nonzero coefficients of x (|| x ||_0) will be increasing as i increases.
- //
- // Note(bazyli): the logarithmic path is a possible alternative to the linear
- // path (it is used by the R glmnet library); in it, the l1_i form a geometric
- // rather than arithmetic sequence, with l1_(i+1)/l1_1 being constant and
- // smaller than 1.0. I found, however, that even though it typically results
- // in less problems solved, it often introduces many nonzero
- // coefficients in the initial subproblems of the lasso path. Since we are
- // interested in heavy hitters, we may not need to run the entire path and
- // want to be more conservative as to how quickly the new nonzero coefficients
- // are identified.
- void RunFirstRapporStep(int max_nonzero_coeffs, double max_solution_1_norm,
- const cobalt_lossmin::LabelSet& as_label_set,
- cobalt_lossmin::Weights* est_candidate_weights,
- std::vector<int>* second_step_cols);
- // Runs the second step of RAPPOR. Solves the problems
- // min 1/(2N) || A * x - y_i ||_2^2 + |l1| * || x ||_1 + 1/2 * |l2| * || x
- // ||_2^2 with variable x, where A == |instances|, y_i == |as_label_set| +
- // err_i, for i = 1,2,..., |num_runs|. Here, (err_i)_j is a Gaussian error
- // with standard deviation equal to |est_stand_errs|[j]. All (err_i)_j, are
- // independent. N is equal to the number of rows of A.
- //
- // It then computes the standard errors ex_i from all runs for each of the
- // solutions x_i. The mean x from all num_runs is written at
- // |exact_est_candidate_weights|. (If none of the problems converged, then
- // unchanged |est_candidate_weights| will be written). The standard errors are
- // written at |est_candidate_errors|. (However, if less than 5 problems
- // converged, then standard errors are set to zero.)
- //
- // The problem is repeatedly solved using the parallel boosting with momentum
- // algorithm with |est_candidate_weights| as the initial guess.
- // The variables exact_est_candidate_weights and est_candidate_errors
- void GetExactValuesAndStdErrs(double l1, const cobalt_lossmin::Weights& est_candidate_weights,
- const std::vector<double>& est_standard_errs,
- const cobalt_lossmin::InstanceSet& instances,
- const cobalt_lossmin::LabelSet& as_label_set,
- cobalt_lossmin::Weights* exact_est_candidate_weights,
- cobalt_lossmin::Weights* est_candidate_errors);
-
- // Returns reference to minimizer_data_.
- [[nodiscard]] const MinimizerData& minimizer_data() const { return minimizer_data_; }
-
- private:
- friend class LassoRunnerTest;
-
- // Pointer to the matrix A.
- const Eigen::SparseMatrix<double, Eigen::RowMajor>* matrix_;
-
- // Random device (used for generating Gaussian noise in
- // GetSignificantNonZeros).
- std::random_device random_dev_;
-
- // Stores info about lossmin minimizer after RunFirstRapporStep.
- MinimizerData minimizer_data_;
-
- // Constants used inside RunFirstRapporStep and GetExactValuesAndStdErrs.
- // We also use them in automated testing. They are reset in the constructor.
- // See the implementation in .cc file.
- //
- // zero_threshold_ is the number below which we assume a
- // coefficient of the solution to be effectively zero;
- // it is used in minimizers. It should be probably be not smaller than 1e-14.
- double zero_threshold_;
- // l2_to_l1_ratio_ is the ratio between l1 and l2 penalties used in all
- // minimizers.
- double l2_to_l1_ratio_;
- // alpha_ is a constant from parallel boosting with momentum algorithm;
- // it must be a number satisfying 0 < alpha_ < 1.
- double alpha_;
- // min_convergence_threshold_ is the minimum convergence threshold used in all
- // minimizers. Its purpose is to prevent the convergence thresholds in the
- // from becoming numerically too small. It should probably be not smaller than
- // 1e-14.
- double min_convergence_threshold_;
- // num_lasso_steps_ is the number of subproblems solved in the lasso path in
- // RunFirstRapporStep. It must be a positive integer.
- int num_lasso_steps_;
- // l1_max_to_l1_min_ratio_ is the ratio between the largest (initial) and
- // smallest (final) penalty in the lasso path. It must be smaller than 1.0.
- double l1_max_to_l1_min_ratio_;
- // use_linear_path_ specifies if linear path should be used (instead of
- // logarithmic),
- bool use_linear_path_;
-};
-
-} // namespace rappor
-} // namespace cobalt
-
-#endif // COBALT_SRC_ALGORITHMS_RAPPOR_LASSO_RUNNER_H_
diff --git a/src/algorithms/rappor/lasso_runner_test.cc b/src/algorithms/rappor/lasso_runner_test.cc
deleted file mode 100644
index 2dc8df6..0000000
--- a/src/algorithms/rappor/lasso_runner_test.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/rappor/lasso_runner_test.h"
-
-namespace cobalt::rappor {
-
-using cobalt_lossmin::InstanceSet;
-using cobalt_lossmin::LabelSet;
-using cobalt_lossmin::Weights;
-
-void LassoRunnerTest::SetLassoRunner(const InstanceSet* matrix) {
- lasso_runner_ = std::make_unique<LassoRunner>(matrix);
-}
-
-void LassoRunnerTest::CheckFirstRapporStepCorrectness(const LabelSet& right_hand_side,
- const Weights& results) {
- // Get the penalty paramaters.
- const double l1 = lasso_runner_->minimizer_data_.l1;
- const double l2 = lasso_runner_->minimizer_data_.l2;
- const double zero_threshold = lasso_runner_->minimizer_data_.zero_threshold;
- // Reference to the problem matrix.
- const Eigen::SparseMatrix<double, Eigen::RowMajor>* A_matrix(lasso_runner_->matrix_);
-
- // Check that the minimizer converged and that minimizer info makes sense.
- EXPECT_EQ(lasso_runner_->minimizer_data_.converged, true);
- EXPECT_GE(l1, 0);
- EXPECT_GE(l2, 0);
- EXPECT_GT(lasso_runner_->minimizer_data_.convergence_threshold, 1e-16);
- EXPECT_GE(lasso_runner_->minimizer_data_.convergence_threshold,
- lasso_runner_->min_convergence_threshold_);
- EXPECT_GT(lasso_runner_->minimizer_data_.num_epochs_run,
- 0); // this check assumes that the initial guess was not exact
- if (l1 > 0) {
- EXPECT_GT(l1, l2);
- }
-
- // Check dimensions correctness and compute the
- // gradient = A^T * (A * x - y) / N + l2 * x.
- EXPECT_EQ(right_hand_side.size(), A_matrix->rows());
- EXPECT_EQ(results.size(), A_matrix->cols());
- Eigen::VectorXd residual = *A_matrix * results;
- residual -= right_hand_side;
- Eigen::VectorXd gradient = A_matrix->transpose() * residual;
- // Scale regression part of the gradient
- gradient /= A_matrix->rows();
- gradient += l2 * results;
-
- // Compute the KKT condition violation.
- Eigen::VectorXd estimate_error;
- estimate_error = ((results.array() > zero_threshold).select(gradient.array() + l1, 0)).matrix();
- estimate_error += ((results.array() < -zero_threshold).select(gradient.array() - l1, 0)).matrix();
- estimate_error +=
- ((abs(results.array()) <= zero_threshold).select(abs(gradient.array()) - l1, 0).max(0))
- .matrix();
- // The correctness check is relatively lax because the algorithm could have
- // converged before reaching the solution. On the other hand, the convergence
- // thresholds should be such that this holds.
- EXPECT_LE(estimate_error.norm() / results.size(), 1e-3);
-}
-
-void LassoRunnerTest::CheckNonzeroCandidates(const std::vector<int>& nonzero_cols,
- const Weights& results) {
- for (int i = 0; i < results.size(); i++) {
- if (std::find(nonzero_cols.begin(), nonzero_cols.end(), i) != nonzero_cols.end()) {
- EXPECT_GE(results[i], lasso_runner_->zero_threshold_);
- } else {
- EXPECT_LE(results[i], lasso_runner_->zero_threshold_);
- }
- }
-}
-
-void LassoRunnerTest::CheckLassoRunnerParameters() {
- EXPECT_GT(lasso_runner_->zero_threshold_, 1e-15);
- EXPECT_LT(lasso_runner_->zero_threshold_, 1e-2);
- EXPECT_GT(lasso_runner_->alpha_, 0.0);
- EXPECT_LT(lasso_runner_->alpha_, 1.0);
- EXPECT_GT(lasso_runner_->min_convergence_threshold_, 1e-15);
- EXPECT_GE(lasso_runner_->l1_max_to_l1_min_ratio_, 0);
- EXPECT_LT(lasso_runner_->l1_max_to_l1_min_ratio_, 1e-2);
- EXPECT_GT(lasso_runner_->num_lasso_steps_, 10);
-}
-
-// Creates a random sparse m x n matrix with positive entries.
-InstanceSet LassoRunnerTest::RandomMatrix(const int m, const int n, const int num_nonzero_entries) {
- std::vector<Eigen::Triplet<double>> triplets(num_nonzero_entries);
- std::uniform_int_distribution<int> m_distribution(0, m - 1);
- std::uniform_int_distribution<int> n_distribution(0, n - 1);
- static const double min_entry = 1.0;
- static const double max_entry = 1.0;
- std::uniform_real_distribution<double> real_distribution(min_entry, max_entry);
- for (int k = 0; k < num_nonzero_entries; k++) {
- uint32_t i = m_distribution(random_dev_);
- uint32_t j = n_distribution(random_dev_);
- double entry = real_distribution(random_dev_);
- triplets.emplace_back(Eigen::Triplet<double>(i, j, entry));
- }
- InstanceSet matrix(m, n);
- matrix.setFromTriplets(triplets.begin(), triplets.end());
-
- return matrix;
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/lasso_runner_test.h b/src/algorithms/rappor/lasso_runner_test.h
deleted file mode 100644
index c718260..0000000
--- a/src/algorithms/rappor/lasso_runner_test.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef COBALT_SRC_ALGORITHMS_RAPPOR_LASSO_RUNNER_TEST_H_
-#define COBALT_SRC_ALGORITHMS_RAPPOR_LASSO_RUNNER_TEST_H_
-
-#include <memory>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include "src/algorithms/rappor/lasso_runner.h"
-#include "src/lib/lossmin/eigen-types.h"
-#include "third_party/googletest/googletest/include/gtest/gtest.h"
-
-#include "third_party/eigen/Eigen/SparseCore"
-
-namespace cobalt {
-namespace rappor {
-
-static const float kVerySmallPenalty = 1e-6;
-
-class LassoRunnerTest : public ::testing::Test {
- protected:
- // Set the matrix pointed to by lasso_runner_.
- void SetLassoRunner(const cobalt_lossmin::InstanceSet* matrix);
-
- // Checks correctness of the solution to a single lasso problem stored in
- // |results|. lasso_runner_ must store the minimizer data.
- // Also checks that values stored in minimizer_data_ are reasonable.
- //
- // The convergence is checked by verifying the KKT conditions directly.
- // First, we compute the gradient of the objective without l1 penalty:
- // grad = (1/N) * A^T (A * x - b) + l2 * x,
- // where A == lasso_runner_->matrix_
- // x == results, l2 = lasso_runner_->minimizer_data.l2
- // b = right_hand_side, N = A.rows().
- //
- // The KKT condition is necessary
- // and sufficient for |results| to be a minimizer:
- // If results[i] < 0 then grad[i] == l1
- // If results[i] > 0 then grad[i] == -l1
- // If results[i] == 0 then -l1 <= grad[i] <= l1,
- //
- // where l1 = lasso_runner_->minimizer_data_.l1.
- // The function checks whether the norm of the violations of the
- // KKT condition is within a certain bound.
- void CheckFirstRapporStepCorrectness(const cobalt_lossmin::LabelSet& right_hand_side,
- const cobalt_lossmin::Weights& results);
-
- // Ensures that the last penalty in the lasso path is very small.
- // Logarithmic path is more appropriate in this case.
- void MakeLastLassoStepExact() {
- lasso_runner_->use_linear_path_ = false;
- lasso_runner_->l1_max_to_l1_min_ratio_ = kVerySmallPenalty;
- }
-
- // Checks that the |nonzero_cols| contains exactly column ids corresponding to
- // nonzero entries.
- void CheckNonzeroCandidates(const std::vector<int>& nonzero_cols,
- const cobalt_lossmin::Weights& results);
-
- // Checks that the constants critical for the lasso path have reasonable
- // values.
- void CheckLassoRunnerParameters();
-
- // Creates a random sparse |m| x |n| matrix with positive entries.
- // The number of nonzero entries will approximately equal
- // |num_nonzero_entries|.
- cobalt_lossmin::InstanceSet RandomMatrix(int m, int n, int num_nonzero_entries);
-
- std::unique_ptr<LassoRunner> lasso_runner_; // NOLINT
-
- // Random device
- std::random_device random_dev_; // NOLINT
-};
-
-} // namespace rappor
-} // namespace cobalt
-
-#endif // COBALT_SRC_ALGORITHMS_RAPPOR_LASSO_RUNNER_TEST_H_
diff --git a/src/algorithms/rappor/lasso_runner_unit_tests.cc b/src/algorithms/rappor/lasso_runner_unit_tests.cc
deleted file mode 100644
index 65c7b35..0000000
--- a/src/algorithms/rappor/lasso_runner_unit_tests.cc
+++ /dev/null
@@ -1,307 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/rappor/lasso_runner_test.h"
-
-namespace cobalt::rappor {
-
-using cobalt_lossmin::InstanceSet;
-using cobalt_lossmin::LabelSet;
-using cobalt_lossmin::Weights;
-
-// Tests of the correctness of the first RAPPOR step.
-// Note: some of the tests are heuristic.
-
-// Test if zero right hand side gives a zero solution.
-TEST_F(LassoRunnerTest, ZeroSolution) {
- // Any matrix with zero right hand side and zero initial guess should give a
- // zero solution.
- // We will create a random m x n matrix with num_nonzero_entries.
- static const int m = 100;
- static const int n = 200;
- static const int num_nonzero_entries =
- 1000; // the actual number of non-zeros might be slightly different
- std::vector<Eigen::Triplet<double>> triplets(num_nonzero_entries);
-
- // Create a random m x n sparse matrix and zero right hand side.
- InstanceSet matrix = RandomMatrix(m, n, num_nonzero_entries);
- Weights zero_right_hand_side = Weights::Zero(m);
- LabelSet right_hand_side = zero_right_hand_side;
- Weights results = Weights::Zero(n);
-
- // The result should not depend on the values below.
- const int max_nonzero_coeffs = 1.0;
- const double max_solution_1_norm = 1.0;
- std::vector<int> second_step_cols;
-
- // Reset the runner and run first lasso step.
- lasso_runner_ = std::make_unique<LassoRunner>(&matrix);
- lasso_runner_->RunFirstRapporStep(max_nonzero_coeffs, max_solution_1_norm, right_hand_side,
- &results, &second_step_cols);
- // Check that the Lasso parameters make numerical sense.
- CheckLassoRunnerParameters();
- // Check that the result is zero.
- EXPECT_LE(results.norm(), 1e-12);
- EXPECT_EQ(second_step_cols.empty(), true);
-}
-
-// Checks that the result on a non-singular square matrix with full lasso path
-// is correct; it should be close to the actual solution.
-TEST_F(LassoRunnerTest, ExactSolution) {
- // We will create a random triangular n x n sparse matrix with ones on the
- // diagonal and additional num_nonzero_entries. Such matrix is nonsingular.
- static const int n = 50;
- static const int num_nonzero_entries =
- 50; // the actual number of non-zeros might be slightly different
- std::vector<Eigen::Triplet<double>> triplets(n + num_nonzero_entries);
-
- // If the absolute values of the additional random entries are at most 1.0,
- // the matrix will be well-conditioned.
- std::uniform_int_distribution<int> n_distribution(0, n - 1);
- std::uniform_real_distribution<double> real_distribution(0, 1.0);
-
- // Create the random matrix and a random solution.
- for (int k = 0; k < n; k++) {
- triplets.emplace_back(Eigen::Triplet<double>(k, k, 1.0));
- }
- int nonzero_entries_count = 0;
- while (nonzero_entries_count < num_nonzero_entries) {
- uint32_t i = n_distribution(random_dev_);
- uint32_t j = n_distribution(random_dev_);
- if (j > i) {
- double entry = real_distribution(random_dev_);
- triplets.emplace_back(Eigen::Triplet<double>(i, j, entry));
- nonzero_entries_count++;
- }
- }
- InstanceSet matrix(n, n);
- matrix.setFromTriplets(triplets.begin(), triplets.end());
-
- Weights random_solution(n);
- for (int k = 0; k < n; k++) {
- random_solution(k) = real_distribution(random_dev_);
- }
- Weights random_right_hand_side = matrix * random_solution;
- LabelSet right_hand_side = random_right_hand_side;
- Weights results = Weights::Zero(n);
-
- // Set the limits on the solution to make sure full lasso path is performed.
- const int max_nonzero_coeffs = n + 1;
- const double max_solution_1_norm = 10 * random_solution.lpNorm<1>();
- std::vector<int> second_step_cols;
-
- // Reset the lasso runner and perform the first step of RAPPOR.
- lasso_runner_ = std::make_unique<LassoRunner>(&matrix);
- MakeLastLassoStepExact();
- lasso_runner_->RunFirstRapporStep(max_nonzero_coeffs, max_solution_1_norm, right_hand_side,
- &results, &second_step_cols);
- // Check solution correctness.
- CheckLassoRunnerParameters();
- CheckFirstRapporStepCorrectness(right_hand_side, results);
- CheckNonzeroCandidates(second_step_cols, results);
- EXPECT_LE((results - random_solution).norm() / random_solution.norm(), 1e-3);
-}
-
-// Checks that the limit on the number of nonzero elements in the solution is
-// implemented correctly. Runs a bunch of examples and checks if the limit on
-// the number of nonzero coordinates is satisfied.
-TEST_F(LassoRunnerTest, MaxNonzeros) {
- // We will create m x n random sparse matrix and
- // repeat the test num_tests times. The matrix will have approximately
- // num_nonzero_entries.
- static const int num_tests = 10;
- static const int m = 100;
- static const int n = 50;
- static const int num_nonzero_entries =
- 500; // the actual number of nonzero entries might be slightly different
- static const int max_norm_factor = 100;
-
- std::uniform_real_distribution<double> real_distribution(-1.0, 1.0);
- std::uniform_int_distribution<int> n_distribution(1, n);
-
- for (int i = 0; i < num_tests; i++) {
- // Create a random matrix and right hand side.
- // Take random limit on the number of nonzero coordinates.
- // Solve and check if the limit is satisfied.
- // The other stopping criterion is purposely lax.
- InstanceSet matrix = RandomMatrix(m, n, num_nonzero_entries);
- Weights random_solution(n);
- for (int k = 0; k < n; k++) {
- random_solution(k) = real_distribution(random_dev_);
- }
- Weights random_right_hand_side = matrix * random_solution;
- LabelSet right_hand_side = random_right_hand_side;
-
- int max_nonzero_coeffs = n_distribution(random_dev_);
- double max_solution_1_norm = max_norm_factor * random_solution.lpNorm<1>();
-
- // Reset the lasso runner and run the first RAPPOR step.
- Weights results = Weights::Zero(n);
- std::vector<int> second_step_cols;
- lasso_runner_ = std::make_unique<LassoRunner>(&matrix);
- lasso_runner_->RunFirstRapporStep(max_nonzero_coeffs, max_solution_1_norm, right_hand_side,
- &results, &second_step_cols);
- // Check that parameters make numerical sense and that the solution is
- // correct.
- CheckLassoRunnerParameters();
- CheckNonzeroCandidates(second_step_cols, results);
- CheckFirstRapporStepCorrectness(right_hand_side, results);
-
- // Since the last step of lasso is computed with a different accuracy, we
- // relax the check. This is heuristic.
- EXPECT_LE(second_step_cols.size(), static_cast<uint32_t>(1.2 * max_nonzero_coeffs + 5));
- }
-}
-
-// Checks that the limit on the 1-norm of the solution is well-implemented.
-// This test is similar to MaxNonzeros. Run a bunch of examples and see if the
-// limit on the solution 1-norm is satisfied
-TEST_F(LassoRunnerTest, MaxNorm) {
- // We will create m x n random sparse matrix and repeat the test
- // num_tests times. The matrix will have approximately num_nonzero_entries.
- static const int num_tests = 10;
- static const int m = 100;
- static const int n = 50;
- static const int num_nonzero_entries =
- 500; // the actual number of nonzero entries might be slightly different
-
- std::uniform_real_distribution<double> real_distribution(0, 1.0);
- std::uniform_int_distribution<int> n_distribution(1, n);
-
- for (int i = 0; i < num_tests; i++) {
- // Create a random matrix and right hand side.
- // Take random limit on the 1-norm of the solution.
- // Solve and check if the limit is satisfied.
- // The other stopping criterion is purposely lax.
- InstanceSet matrix = RandomMatrix(m, n, num_nonzero_entries);
- Weights random_solution(n);
- for (int k = 0; k < n; k++) {
- random_solution(k) = real_distribution(random_dev_);
- }
- Weights random_right_hand_side = matrix * random_solution;
- LabelSet right_hand_side = random_right_hand_side;
-
- int max_nonzero_coeffs = n + 1;
- double max_solution_1_norm = real_distribution(random_dev_) * random_solution.lpNorm<1>() + 1.0;
-
- Weights results = Weights::Zero(n);
- std::vector<int> second_step_cols;
- lasso_runner_ = std::make_unique<LassoRunner>(&matrix);
- lasso_runner_->RunFirstRapporStep(max_nonzero_coeffs, max_solution_1_norm, right_hand_side,
- &results, &second_step_cols);
- // Check that parameters make numerical sense and that the solution is
- // correct.
- CheckLassoRunnerParameters();
- CheckNonzeroCandidates(second_step_cols, results);
- CheckFirstRapporStepCorrectness(right_hand_side, results);
-
- // Since the last step of lasso is computed with a different accuracy we
- // relax the check. This is heuristic.
- EXPECT_LE(results.lpNorm<1>(), static_cast<double>(1.5 * max_solution_1_norm));
- }
-}
-
-// Tests of the correctness of the second RAPPOR step.
-// Note: Some tests are inherently heuristic.
-
-// This test is similar to ExactSolution but checks exactness of the second
-// RAPPOR step. Also checks that zero standard errors of the label set give
-// zero standard errors of the result.
-TEST_F(LassoRunnerTest, SecondStepExactness) {
- // Create a random triangular m x n sparse matrix with ones on the diagonal
- // and additional num_nonzero_entries.
- static const int m = 50;
- static const int n = 50;
- static const int num_nonzero_entries =
- 100; // the actual number of non-zeros might be slightly different
- static const double l1_penalty = 1e-8;
- std::vector<Eigen::Triplet<double>> triplets(n + num_nonzero_entries);
-
- std::uniform_int_distribution<int> n_distribution(0, n - 1);
- std::uniform_int_distribution<int> m_distribution(0, m - 1);
- std::uniform_real_distribution<double> real_distribution(0.0, 1.0);
-
- // Create the random matrix and a random solution.
- Weights random_solution(n);
- for (int k = 0; k < n; k++) {
- triplets.emplace_back(Eigen::Triplet<double>(k, k, 1.0));
- random_solution(k) = real_distribution(random_dev_);
- }
- int nonzero_entries_count = 0;
- while (nonzero_entries_count < num_nonzero_entries) {
- uint32_t i = m_distribution(random_dev_);
- uint32_t j = n_distribution(random_dev_);
- if (j < i) {
- double entry = real_distribution(random_dev_);
- triplets.emplace_back(Eigen::Triplet<double>(i, j, entry));
- nonzero_entries_count++;
- }
- }
- InstanceSet matrix(m, n);
- matrix.setFromTriplets(triplets.begin(), triplets.end());
- Weights random_right_hand_side = matrix * random_solution;
- LabelSet right_hand_side = random_right_hand_side;
-
- // Set standard errors to zero and prepare the minimizer input.
- std::vector<double> standard_errs(m, 0.0);
- Weights initial_guess = Weights::Constant(n, 0.75); // NOLINT
- Weights results = Weights::Zero(n);
- Weights estimated_errs = Weights::Zero(n);
-
- // Reset the lasso runner and run the second RAPPOR step.
- lasso_runner_ = std::make_unique<LassoRunner>(&matrix);
- lasso_runner_->GetExactValuesAndStdErrs(l1_penalty, initial_guess, standard_errs, matrix,
- right_hand_side, &results, &estimated_errs);
-
- // Check the correctness of the solution and that the estimated errors are
- // zero.
- EXPECT_LE((results - random_solution).norm() / random_solution.norm(), 1e-3);
- EXPECT_LE(estimated_errs.norm(), 1e-12);
-}
-
-// Check the exactness of the standard error estimates.
-// We run the Second RAPPOR step on an identity matrix and check that
-// the estimated solution and errors are reasonable.
-// Note: The tests are inherently heuristic because of randomness in the
-// algorithm.
-TEST_F(LassoRunnerTest, SecondStepErrors) {
- static const int n = 50;
- static const double l1_penalty = 1e-8;
- std::vector<Eigen::Triplet<double>> triplets;
- std::uniform_real_distribution<double> real_distribution(0.5, 1.0); // NOLINT
-
- // Create the identity matrix and a random solution.
- Weights random_solution(n);
- for (int k = 0; k < n; k++) {
- random_solution(k) = real_distribution(random_dev_);
- triplets.push_back(Eigen::Triplet<double>(k, k, 1.0)); // NOLINT
- }
-
- InstanceSet matrix(n, n);
- matrix.setFromTriplets(triplets.begin(), triplets.end());
- LabelSet right_hand_side = random_solution;
-
- // Set standard errors to 10% of the entry.
- std::vector<double> standard_errs(n);
- Weights errors_to_compare(n);
- for (int k = 0; k < n; k++) {
- standard_errs[k] = 0.1 * random_solution(k); // NOLINT
- errors_to_compare(k) = standard_errs[k];
- }
-
- // Run the second RAPPOR step.
- Weights initial_guess = Weights::Constant(n, 0.75); // NOLINT
- Weights results = Weights::Zero(n);
- Weights estimated_errs = Weights::Zero(n);
- lasso_runner_ = std::make_unique<LassoRunner>(&matrix);
- lasso_runner_->GetExactValuesAndStdErrs(l1_penalty, initial_guess, standard_errs, matrix,
- right_hand_side, &results, &estimated_errs);
-
- // The solution should be close but we should not count on exactness.
- EXPECT_LE((results - random_solution).norm() / random_solution.norm(), 1e-1);
- // Check that the error estimates are reasonable.
- EXPECT_LE((estimated_errs - errors_to_compare).norm() / errors_to_compare.norm(), 0.5);
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_analyzer.cc b/src/algorithms/rappor/rappor_analyzer.cc
deleted file mode 100644
index 148101e..0000000
--- a/src/algorithms/rappor/rappor_analyzer.cc
+++ /dev/null
@@ -1,508 +0,0 @@
-// Copyright 2017 The Fuchsia Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/algorithms/rappor/rappor_analyzer.h"
-
-#include <algorithm>
-#include <cmath>
-#include <random>
-
-#include <glog/logging.h>
-
-#include "src/algorithms/rappor/rappor_encoder.h"
-#include "src/lib/crypto_util/hash.h"
-#include "src/lib/lossmin/eigen-types.h"
-#include "src/lib/lossmin/minimizers/gradient-evaluator.h"
-#include "src/lib/lossmin/minimizers/parallel-boosting-with-momentum.h"
-#include "src/lib/util/log_based_metrics.h"
-
-using cobalt_lossmin::InstanceSet;
-using cobalt_lossmin::LabelSet;
-using cobalt_lossmin::Weights;
-
-namespace cobalt::rappor {
-
-// Stackdriver metric contants
-namespace {
-constexpr char kAnalyzeFailure[] = "rappor-analyzer-analyze-failure";
-} // namespace
-
-using crypto::byte;
-
-RapporAnalyzer::RapporAnalyzer(const RapporConfig& config, const RapporCandidateList* candidates)
- : bit_counter_(config), config_(bit_counter_.config()) {
- candidate_map_.candidate_list = candidates;
- // candidate_map_.candidate_cohort_maps remains empty for now. It
- // will be populated by BuildCandidateMap.
-}
-
-bool RapporAnalyzer::AddObservation(const RapporObservation& obs) {
- VLOG(5) << "RapporAnalyzer::AddObservation() cohort=" << obs.cohort();
- return bit_counter_.AddObservation(obs);
-}
-
-grpc::Status RapporAnalyzer::Analyze(std::vector<CandidateResult>* results_out) {
- CHECK(results_out);
-
- // TODO(rudominer) Consider inserting here an analysis of the distribution
- // of the number of Observations over the set of cohorts. The mathematics
- // of our algorithm below assumes that this distribution is uniform. If
- // it is not uniform in practice this may indicate a problem with client-
- // side code and we may wish to take some corrective action.
-
- auto status = BuildCandidateMap();
- if (!status.ok()) {
- return status;
- }
-
- // est_bit_count_ratios is the right-hand side vector b from the equation Ax =
- // b that we are estimating, and est_std_errors are the corresponding standard
- // errors of entries of b. See comments on the declaration of
- // ExtractEstimatedBitCountRatiosAndStdErrors() for a description of these
- // vectors.
- Eigen::VectorXd est_bit_count_ratios;
- std::vector<double> est_std_errors;
- ExtractEstimatedBitCountRatiosAndStdErrors(&est_bit_count_ratios, &est_std_errors);
-
- // Note(rudominer) The cobalt_lossmin's GradientEvaluator constructor takes a
- // const LabelSet& parameter but est_bit_count_ratios is a
- // VectorXf. These types are different:
- // LabelSet = Matrix<float, Dynamic, Dynamic, RowMajor>
- // VectorXf = Matrix<float, Dynamic, 1>
- // These are different in two ways: VectorXf has a static number of columns
- // and VectorXf uses ColumnMajor order.
- // Here we copy est_bit_count_ratios to a label set before passing it to
- // the GradientEvaluator. This works fine. Some notes about this:
- // (1) Eigen defines copy constructors that do the right thing.
- // (2) There is no compilation error if we pass est_bit_count_ratios directly
- // to the GradientEvaluator constructor. Somehow this works--I'm not sure
- // why. But doing this leads to some unknown memory corruption which
- // causes very bad non-deterministic runtime behavior. Be careful not
- // to do this.
- // (3) We could avoid doing a copy here by just declaring est_bit_count_ratios
- // as a LabelSet to begin with. But I don't like doing this because it
- // makes the code less understandable to define a known column vector
- // as a matrix with a dynamic number of columns in RowMajor order.
- LabelSet as_label_set = est_bit_count_ratios;
- LassoRunner lasso_runner(&candidate_matrix_);
-
- // In the first step, we compute the lasso path. That is,
- // we compute the solutions to a sequence of lasso subproblems
- // with decreasing values of l1 penalty. See the description of
- // RunFirstRapporStep for details.
- //
- // Set the parameters for RunFirstRapporStep.
- // Note:
- // 1. The minimizer-specific constants are defined in the LassoRuner class and
- // in the implementations of RunFirstRapporStep and GetExactValuesAndStdErrs.
- // It is a good idea to be familiar with them as some of them may need to be
- // adjusted.
- // 2. You can change kMaxNonzeroCoefficients and kColumns2RowsRatioSecondStep
- // depending on how many largest coefficients you care about; smaller numbers
- // mean fewer iterations (shorter lasso path in RunFirstRapporStep). See the
- // descriptions below. However, note that the algorithm may behave differently
- // when these constants are changed, i.e. they affect the entire algorithm
- // in RunFirstRapporStep, not only what is returned.
- // 3. Depending on the application, the value of kMaxSolution1Norm much lower
- // than 1.0 may be better.
- //
- // kMaxNonzeroCoefficients and kColumns2RowsRatioSecondStep will determine the
- // maximum expected number of nonzero coefficients in the solution. The final
- // subproblem of the lasso path will be entered when the number of nonzero
- // coefficients (i.e. coefficients larger than lasso_runner.zero_threshold())
- // identified, is at least min(kMaxNonzeroCoefficients,
- // kColumns2RowsRatioSecondStep * config_->num_bits() *
- // config_->num_cohorts()); otherwise the algorithm will perform all
- // kNumLassoSteps (unless it reaches the maximum number of epochs). Read below
- // for the explanation of the expression involving
- // kColumns2RowsRatioSecondStep.
- // Note: the actual number of identified nonzero
- // coefficients can be slightly larger or smaller.
- //
- // kColumns2RowsRatioSecondStep is the maximum ratio of the number of columns
- // to the number of rows of the matrix in the second step of RAPPOR. We will
- // stop the lasso computations if the number of identified nonzero
- // candidates reaches kColumns2RowsRatioSecondStep * M, where M ==
- // config_->num_bits() * config_->num_cohorts(); the number of columns of
- // the matrix in the second step is then at most kColumns2RowsRatioSecondStep
- // "times" the number of rows. Since we would like this matrix to have
- // linearly independent columns (in other words, such matrix has full column
- // rank), we must have kColumns2RowsRatioSecondStep < 1.0. Simulations on
- // random matrices suggest that it can be as high as 0.9 but the smaller the
- // value the higher the probability that the matrix will have full column
- // rank.
- // Note(bazyli): (mironov) says that this should not be too high because
- // of possible false positives but testing suggests that higher values may
- // actually give better results, and there are some arguments for that.
- //
- // Note(bazyli): Notice that if you want to detect
- // candidates that account for p proportion of the observations, you will look
- // for at most 1/p candidates. For example, if you only care about candidates
- // that account for at least 1% of observations, you may want to set
- // kMaxNonzeroCoefficients = 100. However, it does affect the whole algorithm
- // so be careful.
- static const int kMaxNonzeroCoefficients = 500;
- static const double kColumns2RowsRatioSecondStep = 0.7;
- // We expect the real underlying solution to have 1-norm equal to 1.0, and
- // even more so, the solution of the penalized problem should have norm
- // smaller than 1.0. This is specified by kMaxSolution1Norm.
- // Thus, we also want to stop the lasso computations before
- // the 1-norm of the current solution vector (est_candidate_weights)
- // approaches 1.0. This is closely related to the standard representation of
- // the lasso problem. If we stop when the 1-norm equals 1.0, the lasso
- // solution also solves the quadratic program: min || A x - y ||_2 subject to
- // || x ||_2 <= 1.0. However, note that if there is an exact solution to Ax =
- // y with || x ||_2 = 1.0, it will not be found in the lasso step because of
- // penalty. It may be found in the second RAPPOR step where the penalty is
- // insignificant.
- static const double kMaxSolution1Norm = 0.9;
- // kL1FirstToSecondStep is the ratio of the l1 penalty used in the last lasso
- // subproblem to the l1 penalty used in GetExactValuesAndStdErrs. This second
- // step of RAPPOR is conceptually a least squares problem. However, we
- // introduce a tiny bit of penalty for stability
- // (see below). This number should probably not be larger than 1e-3.
- static const double kL1FirstToSecondStep = 1e-3;
-
- // Perform initializations based on the chosen parameters.
- const int num_candidates = candidate_matrix_.cols();
- const uint32_t num_bits = config_->num_bits();
- const uint32_t num_cohorts = config_->num_cohorts();
- const uint32_t num_hashes = config_->num_hashes();
- // In the first step identify at most kColumns2RowsRatioSecondStep * M
- // coefficients, where M is the number of rows of the candidate matrix (but no
- // more than kMaxNonzeroCoefficients). This is a heuristic to ensure that the
- // matrix in the second step is full column rank.
- // TODO(bazyli): Add "bogus candidates" stopping criterion.
- const int max_nonzero_coeffs =
- std::min(num_candidates,
- std::min(static_cast<int>(kColumns2RowsRatioSecondStep * num_cohorts * num_bits),
- kMaxNonzeroCoefficients));
-
- // We will need to construct the matrix for the second step. This matrix
- // is composed of columns corresponding to identified nonzero candidates
- // stored in second_step_cols.
- std::vector<int> second_step_cols;
- // Initialize the solution vector to zero vector for the lasso path.
- Weights est_candidate_weights = Weights::Zero(num_candidates);
- // Run the first step of RAPPOR to get potential nonzero candidates.
- lasso_runner.RunFirstRapporStep(max_nonzero_coeffs, kMaxSolution1Norm, as_label_set,
- &est_candidate_weights, &second_step_cols);
-
- // Build the matrix for the second step of RAPPOR.
- const uint32_t second_step_num_candidates = second_step_cols.size();
- InstanceSet candidate_submatrix_second_step(candidate_matrix_.rows(), second_step_num_candidates);
- PrepareSecondRapporStepMatrix(&candidate_submatrix_second_step, second_step_cols,
- candidate_matrix_, num_cohorts, num_hashes);
-
- // We can now run the second step of RAPPOR.
- // ************************************************************************
- // Note(bazyli) We will use parallel boosting with
- // momentum again, with very small l1 and l2 penalties. We could use a
- // standard least squares solver here (e.g. based on QR decomposition).
- // However, we still cannot technically guarantee that the columns are
- // linearly independent, and so a tiny penalty will prevent the
- // coefficients from behaving ''wildly'' (this is a better-safe-than-sorry
- // choice); also, we have a very good initial point from the lasso
- // computations so we expect to converge fast (and the problem is smaller
- // than before). Conceptually, we are just solving the least squares
- // problem.
- //
- // The values from lasso computations are a bit biased (likely lower)
- // because of penalty, so the solution after this step should be closer to
- // the underlying distribution. Also, we want to compute standard errors.
- //
- // The RAPPOR paper performs least squares computation at this point; it
- // has closed-form expression for standard errors of coefficients (more
- // precisely, the covariance matrix), which we could use. However, this
- // expression involves the matrix (A_s^T A_s)^(-1), where A_s ==
- // candidate_submatrix_second_step; a priori the inverse might not exist
- // (or A_s^T A_s can be numerically singular or very ill-conditioned).
- // Although unlikely, we do not want to check that. Even if it is not the
- // case, computing the inverse might be computationally (too) heavy.
- // Therefore, we will simulate the behavior of the coefficients to
- // approximate the standard errors.
- //
- // The standard deviations in the linear regression problem min || A * x -
- // b||_2 are computed with the assumption that y_i = < a_i,x_i > + err_i,
- // where err_i are i.i.d. Gaussian errors (so technically speaking, these
- // they are not entirely correct in our case). Although we do not
- // compute the theoretical standard errors, we nevertheless make the same
- // assumption that the noise of detecting y_i (in our case, this is the
- // normalized count of bit i), is Gaussian with standard deviation equal
- // to the standard error of y_i (stored in the est_std_errors). We assume
- // these errors to be independent. We thus simulate the standard errors of
- // the coefficients by introducing this noise directly in y (as_label_set)
- // num_runs time, each time re-running the minimizer with the same input
- // (see the description of GetExactValuesAndStdErrs for details).
- //
- // In any event, the standard errors are conditioned on the choice made by
- // the lasso step (as are standard errors from least squares used in
- // RAPPOR paper). They do not account for anything random that happens in
- // the first RAPPOR step. Other possible choices to get the p-values
- // include: 1. bootstrap on the pairs (a_i,y_i) -- but this could give
- // zero columns. 2. p-values for the particular choice of final penalty in
- // lasso -- described in "Statistical Learning with Sparsity: The Lasso
- // and Generalizations" pp. 150 -- but this probably would account only
- // for the first step.
- // ************************************************************************
-
- // Prepare initial guess for the second step of RAPPOR.
- Weights est_candidate_weights_second_step = Weights(second_step_num_candidates);
- for (uint32_t i = 0; i < second_step_num_candidates; i++) {
- int first_step_col_num = second_step_cols[i];
- est_candidate_weights_second_step[i] = est_candidate_weights[first_step_col_num];
- }
-
- // Initialize the input. We will use kL1FirstToSecondStep fraction of the
- // penalty used in the last subproblem of lasso path.
- const double l1_second_step = kL1FirstToSecondStep * lasso_runner.minimizer_data().l1;
- Weights exact_candidate_weights_second_step(num_candidates);
- Weights est_candidate_errors_second_step(num_candidates);
- // Run the second step of RAPPOR to obtain better estimates of candidate
- // values, and estimates of standard errors.
- lasso_runner.GetExactValuesAndStdErrs(l1_second_step, est_candidate_weights_second_step,
- est_std_errors, candidate_submatrix_second_step,
- as_label_set, &exact_candidate_weights_second_step,
- &est_candidate_errors_second_step);
-
- // Prepare the final solution vector.
- results_out->resize(num_candidates);
- for (int i = 0; i < num_candidates; i++) {
- results_out->at(i).count_estimate = 0;
- results_out->at(i).std_error = 0;
- }
-
- // Write the results from the second step of RAPPOR.
- const uint32_t num_observations = bit_counter_.num_observations();
- for (uint32_t i = 0; i < second_step_num_candidates; i++) {
- int first_step_col_num = second_step_cols[i];
- results_out->at(first_step_col_num).count_estimate =
- exact_candidate_weights_second_step[i] * num_observations;
- results_out->at(first_step_col_num).std_error =
- est_candidate_errors_second_step[i] * num_observations;
- }
-
- // Note: the errors below probably indicate that the problem was too difficult
- // for the given parameters and the limit on the number of epochs.
- if (!lasso_runner.minimizer_data().converged) {
- std::string message = "The last lasso subproblem did not converge.";
- LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAnalyzeFailure) << message;
- return grpc::Status(grpc::DEADLINE_EXCEEDED, message);
- }
-
- if (!lasso_runner.minimizer_data().reached_last_lasso_subproblem) {
- std::string message = "The lasso path did not reach the last subproblem.";
- LOG_STACKDRIVER_COUNT_METRIC(ERROR, kAnalyzeFailure) << message;
- return grpc::Status(grpc::DEADLINE_EXCEEDED, message);
- }
-
- return grpc::Status::OK;
-}
-
-grpc::Status RapporAnalyzer::ExtractEstimatedBitCountRatiosAndStdErrors(
- Eigen::VectorXd* est_bit_count_ratios, std::vector<double>* est_std_errors) {
- VLOG(5) << "RapporAnalyzer::ExtractEstimatedBitCountRatiosAndStdErrors()";
- CHECK(est_bit_count_ratios);
-
- if (!config_->valid()) {
- return grpc::Status(grpc::INVALID_ARGUMENT, "Invalid RapporConfig passed to constructor.");
- }
-
- if (candidate_map_.candidate_list == nullptr ||
- candidate_map_.candidate_list->candidates_size() == 0) {
- return grpc::Status(grpc::INVALID_ARGUMENT,
- "Cannot perform RAPPOR analysis because no candidate "
- "list was specified.");
- }
-
- const uint32_t num_bits = config_->num_bits();
- const uint32_t num_cohorts = config_->num_cohorts();
-
- est_bit_count_ratios->resize(num_cohorts * num_bits);
- est_std_errors->resize(num_cohorts * num_bits);
-
- const std::vector<CohortCounts>& estimated_counts = bit_counter_.EstimateCounts();
- CHECK(estimated_counts.size() == num_cohorts);
-
- int cohort_block_base = 0;
- for (auto& cohort_data : estimated_counts) {
- CHECK(cohort_data.count_estimates.size() == num_bits);
- for (size_t bit_index = 0; bit_index < num_bits; bit_index++) {
- // |bit_index| is an index "from the right".
- size_t bloom_index = num_bits - 1 - bit_index;
- (*est_bit_count_ratios)(cohort_block_base + bloom_index) =
- cohort_data.count_estimates[bit_index] /
- static_cast<double>(cohort_data.num_observations);
- (*est_std_errors)[cohort_block_base + bloom_index] =
- cohort_data.std_errors[bit_index] / static_cast<double>(cohort_data.num_observations);
- }
- cohort_block_base += num_bits;
- }
- return grpc::Status::OK;
-}
-
-grpc::Status RapporAnalyzer::BuildCandidateMap() {
- VLOG(5) << "RapporAnalyzer::BuildCandidateMap()";
- if (!config_->valid()) {
- return grpc::Status(grpc::FAILED_PRECONDITION, "Invalid RapporConfig passed to constructor.");
- }
-
- if (candidate_map_.candidate_list == nullptr ||
- candidate_map_.candidate_list->candidates_size() == 0) {
- return grpc::Status(grpc::INVALID_ARGUMENT,
- "Cannot perform RAPPOR analysis because no candidate "
- "list was specified.");
- }
-
- // TODO(rudominer) We should cache candidate_matrix_ rather than recomputing
- // candidate_map_ and candidate_matrix_ each time.
-
- const uint32_t num_bits = config_->num_bits();
- const uint32_t num_cohorts = config_->num_cohorts();
- const uint32_t num_hashes = config_->num_hashes();
- const uint32_t num_candidates = candidate_map_.candidate_list->candidates_size();
-
- if (VLOG_IS_ON(4)) {
- VLOG(4) << "RapporAnalyzer: Start list of " << num_candidates << " candidates:";
- for (const std::string& candidate : candidate_map_.candidate_list->candidates()) {
- VLOG(4) << "RapporAnalyzer: candidate: " << candidate;
- }
- VLOG(4) << "RapporAnalyzer: End list of " << num_candidates << " candidates.";
- }
-
- candidate_matrix_.resize(num_cohorts * num_bits, num_candidates);
- std::vector<Eigen::Triplet<double>> sparse_matrix_triplets;
- sparse_matrix_triplets.reserve(num_candidates * num_cohorts * num_hashes);
-
- int column = 0;
- for (const std::string& candidate : candidate_map_.candidate_list->candidates()) {
- // In rappor_encoder.cc it is not std::strings that are encoded but rather
- // |ValuePart|s. So here we want to take the candidate as a string and
- // convert it into a serialized |ValuePart|.
- ValuePart candidate_as_value_part;
- candidate_as_value_part.set_string_value(candidate);
- std::string serialized_candidate;
- candidate_as_value_part.SerializeToString(&serialized_candidate);
-
- // Append a CohortMap for this candidate.
- candidate_map_.candidate_cohort_maps.emplace_back();
- CohortMap& cohort_map = candidate_map_.candidate_cohort_maps.back();
-
- // Iterate through the cohorts.
- int row_block_base = 0;
- for (size_t cohort = 0; cohort < num_cohorts; cohort++) {
- // Append an instance of |Hashes| for this cohort.
- cohort_map.cohort_hashes.emplace_back();
- Hashes& hashes = cohort_map.cohort_hashes.back();
-
- // Form one big hashed value of the serialized_candidate. This will be
- // used to obtain multiple bit indices.
- byte hashed_value[crypto::hash::DIGEST_SIZE];
- if (!RapporEncoder::HashValueAndCohort(serialized_candidate, cohort, num_hashes,
- hashed_value)) {
- return grpc::Status(grpc::INTERNAL, "Hash operation failed unexpectedly.");
- }
-
- // bloom_filter is indexed "from the left". That is bloom_filter[0]
- // corresponds to the most significant bit of the first byte of the
- // Bloom filter.
- std::vector<bool> bloom_filter(num_bits, false);
-
- // Extract one bit index for each of the hashes in the Bloom filter.
- for (size_t hash_index = 0; hash_index < num_hashes; hash_index++) {
- uint32_t bit_index = RapporEncoder::ExtractBitIndex(hashed_value, hash_index, num_bits);
- hashes.bit_indices.push_back(bit_index);
- // |bit_index| is an index "from the right".
- bloom_filter[num_bits - 1 - bit_index] = true;
- }
-
- // Add triplets to the sparse matrix representation. For the current
- // column and the current block of rows we add a 1 into the row
- // corresponding to the index of each set bit in the Bloom filter.
- for (size_t bloom_index = 0; bloom_index < num_bits; bloom_index++) {
- if (bloom_filter[bloom_index]) {
- int row = row_block_base + static_cast<int>(bloom_index);
- sparse_matrix_triplets.emplace_back(row, column, 1.0);
- }
- }
-
- // In our sparse matrix representation each cohort corresponds to a block
- // of |num_bits| rows.
- row_block_base += num_bits;
- }
- // In our sparse matrix representation a column corresponds to a candidate.
- column++;
- row_block_base = 0;
- }
-
- candidate_matrix_.setFromTriplets(sparse_matrix_triplets.begin(), sparse_matrix_triplets.end());
-
- return grpc::Status::OK;
-}
-
-} // namespace cobalt::rappor
-
-/*
-
-Justification for the formula used in ExtractEstimatedBitCountRatiosAndStdErrors
--------------------------------------------------------------------
-See the comments at the declaration of the method
-ExtractEstimatedBitCountRatiosAndStdErrors() in rappor_analyzer.h for the
-context and the definitions of the symbols used here.
-
-Here we justify the use of the formula
-
- est_bit_count_ratios[i*k +j] = est_count_i_j / n_i.
-
-Let A be the binary sparse matrix produced by the method BuildCandidateMap()
-and stored in candidate_matrix_. Let b be the column vector produced by
-the method ExtractEstimatedBitCountRatiosAndStdErrors() and stored in the
-variable est_bit_count_ratios. In RapporAnalyzer::Analyze() we compute an
-estimate of a solution to the equation Ax = b. The question we want to address
-here is how do we know we are using the correct value of b? In particular, why
-is it appropriate to divide each entry by n_i, the number of observations from
-cohort i?
-
-The assumption that underlies the justifcation is that the probability of
-a given candidate string occurring is the same in each cohort. That is, there
-is a probability distribution vector x_0 of length s = # of candidates such
-that for each cohort i < m, and each candidate index r < s,
-x_0[r] =
- (number of true observations of candidate r in cohort i) /
- (number of observations from cohort i)
-
-Assume such an x_0 exists. Now let n_i = (number of observations from cohort i).
-Then consider the vector b_i = A (n_i) x_0. We are only concerned with the
-entries in b_i corresponding to cohort i, that is the entries
-i*k + j for 0 <= j < k. Fix such a j and note that
-b_i[i*k + j] = "the true count of 1's for bit j in cohort i". That is, the
-count of 1's for bit j in cohort i prior to flipping bits for randomized
-response. In other words, the count of 1's if we use p = 0, q = 1.
-
-Dividing both sides of the equation A (n_i) x_0 = b_i by n_i and focusing
-only on cohort i we get
- A x_0 [i*k + j] = "the true count of 1's for bit j in cohort i" / n_i
-
-Let b* = A x_0. Then we have:
-
-(i) x_0 is a solution to the equation Ax = b*
-(ii) b*[i*k + j] = "the true count of 1's for bit j in cohort i" / n_i
-
-This justifies our use of the vector b. We have
- b[i*k + j] = "the estimated count of 1's for bit j in cohort i" / n_i
-
- and we seek an estimate to an x such that Ax = b. Such an x may therefore
- naturally be considered to be an estimate of x_0.
-
-*/
diff --git a/src/algorithms/rappor/rappor_analyzer.h b/src/algorithms/rappor/rappor_analyzer.h
deleted file mode 100644
index 644fb43..0000000
--- a/src/algorithms/rappor/rappor_analyzer.h
+++ /dev/null
@@ -1,195 +0,0 @@
-// Copyright 2017 The Fuchsia Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_H_
-#define COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_H_
-
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
-
-#include "src/algorithms/rappor/bloom_bit_counter.h"
-#include "src/algorithms/rappor/lasso_runner.h"
-#include "src/algorithms/rappor/rappor_analyzer_utils.h"
-#include "src/algorithms/rappor/rappor_config_validator.h"
-#include "src/pb/observation.pb.h"
-#include "src/registry/report_definition.pb.h"
-
-#include "grpc++/grpc++.h"
-#include "third_party/eigen/Eigen/SparseCore"
-
-namespace cobalt {
-namespace rappor {
-
-// A string RAPPOR analysis result for a single candidate. The method
-// RapporAnalyzer::Analyze() populates a vector of CandidateResults, one for
-// each candidate.
-struct CandidateResult {
- double count_estimate;
-
- double std_error;
-};
-
-// A RapporAnalyzer is constructed for the purpose of performing a single
-// string RAPPOR analysis.
-//
-// (1) Construct a RapporAnalyzer passing in a RapporConfig and a
-// RapporCandidateList.
-//
-// (2) Repeatedly invoke AddObservation() to add the set of observations to
-// be analyzed. The observations must all be for the same metric part and
-// must have been encoded using the same encoding configuration. More
-// precisely this means they must be associated with the same customer_id,
-// project_id, metric_id, encoding_config_id and metric_part_name.
-//
-// (3) Invoke Analyze() to perform the string RAPPOR analysis and obtain the
-// results.
-//
-// (4) Optionally examine the underlying BloomBitCounter via the bit_counter()
-// accessor.
-class RapporAnalyzer {
- public:
- // Constructs a RapporAnalyzer for the given config and candidates. All of the
- // observations added via AddObservation() must have been encoded using this
- // config. If the config is not valid then all calls to AddObservation()
- // will return false.
- // Does not take ownership of |candidates|.
- //
- // If |candidates| is NULL or empty then AddObservation() may still succeed
- // but Analyze() will return INVALID_ARGUMENT.
- //
- // TODO(rudominer) Enhance this API to also accept DP release parameters.
- explicit RapporAnalyzer(const RapporConfig& config, const RapporCandidateList* candidates);
-
- // Adds an additional observation to be analyzed. The observation must have
- // been encoded using the RapporConfig passed to the constructor.
- //
- // Returns true to indicate the observation was added without error.
- bool AddObservation(const RapporObservation& obs);
-
- // Performs the string RAPPOR analysis and writes the results to
- // |results_out|. Return OK for success or an error status.
- //
- // |results_out| will initially be cleared and, upon success of the alogrithm,
- // will contain a vector of size candidates->size() where |candidates| is the
- // argument to the constructor. |results_out| will be in the same order
- // as |candidates|. More precisely, the CandidateResult in
- // (*results_out)[i] will be the result for the candidate in (*candidates)[i].
- grpc::Status Analyze(std::vector<CandidateResult>* results_out);
-
- // Gives access to the underlying BloomBitCounter.
- const BloomBitCounter& bit_counter() { return bit_counter_; }
-
- private:
- friend class RapporAnalyzerTest;
-
- // Builds the RAPPOR CandidateMap and the associated sparse matrix based on
- // the data passed to the constructor.
- grpc::Status BuildCandidateMap();
-
- // An instance of Hashes is implicitly associated with a given
- // (candidate, cohort) pair and gives the list of hash values for that pair
- // under each of several hash functions. Each of the hash values is a
- // bit index in a Bloom filter.
- struct Hashes {
- // This vector has size h = num_hashes from the RapporConfig passed
- // to the RapporAnalyzer constructor. bit_indices[i] contains the value of
- // the ith hash function applied to the implicitly associated
- // (candidate, cohort) pair. bit_indices[i] is a bit index in the range
- // [0, k) where k = num_bloom_bits from the RapporConfig passed to the
- // RapporAnalyzer constructor.
- //
- // IMPORTANT: We index bits "from the right." This means that bit number
- // zero is the least significant bit of the last byte of the Bloom filter.
- std::vector<uint16_t> bit_indices;
- };
-
- // An instance of CohortMap is implicitly associated with a given
- // candidate string S and gives the Hashes for the pairs (S, cohort)
- // for each cohort in the range [0, num_cohorts).
- struct CohortMap {
- // This vector has size m = num_cohorts from the RapporConfig passed to
- // the RapporAnalyzer constructor. cohort_hashes[i] contains the Hashes
- // for cohort i.
- std::vector<Hashes> cohort_hashes;
- };
-
- // CandidateMap stores the list of all candidates and a parallel list of
- // CohortMaps for each candidate.
- struct CandidateMap {
- // Contains the list of all candidates. (pointer not owned)
- const RapporCandidateList* candidate_list;
-
- // This vector has size equal to the number of candidates in
- // |candidate_list|. candidate_cohort_maps[i] contains the CohortMap for
- // the ith candidate.
- std::vector<CohortMap> candidate_cohort_maps;
- };
-
- // Computes the column vector est_bit_count_ratios as well as a vector
- // est_std_errors of the corresponding standard errors. This method should be
- // invoked after all Observations have been added via AddObservation().
- //
- // est_bit_count_ratios is a column vector of length m * k where
- // m = # of cohorts
- // k = # of Bloom filter bits per cohort.
- //
- // For i < m, j < k, est_bit_count_ratios[i*k +j] = est_count_i_j / n_i
- // where
- // est_count_i_j = the estimate of the true number of times that bit j was
- // set in cohort i.
- // n_i = the number of observations from cohort i
- //
- // Likewise, est_std_errors[i*k + j] = std_error_i_j / n_i
- // where
- // std_error_i_j = the unbiased sample estimate of the standard deviation of
- // est_count_i_j, and n_i is the same as above.
- //
- // These values are extracted from the BloomBitCounter.
- //
- // See the note at the bottom of rappor_anlayzer.cc for a justification of
- // the formulas.
- grpc::Status ExtractEstimatedBitCountRatiosAndStdErrors(Eigen::VectorXd* est_bit_count_ratios,
- std::vector<double>* est_std_errors);
-
- BloomBitCounter bit_counter_;
-
- std::shared_ptr<RapporConfigValidator> config_;
-
- CandidateMap candidate_map_;
-
- // candidate_matrix_ is a representation of candidate_map_ as a sparse matrix.
- // It is an (m * k) X s sparse binary matrix, where
- // m = # of cohorts
- // k = # of Bloom filter bits per cohort
- // s = # of candidates
- // and for i < m, j < k, r < s candidate_matrix_[i*k + j, r] = 1 iff
- // candidate_map_.candidate_cohort_maps[r].cohort_hashes[i].bit_indices[g] =
- // k - j
- // for at least one g < h where h = # of hashes.
- //
- // In other words, if one of the hash functions for cohort i hashes candidate
- // r to bit j (indexed from the left) then we put a 1 in column r, row
- // i*k + j.
- //
- // The expression (k - j) above is due to the fact that
- // candidate_map_ indexes bits from the right instead of from the left.
- Eigen::SparseMatrix<double, Eigen::RowMajor> candidate_matrix_;
-};
-
-} // namespace rappor
-} // namespace cobalt
-
-#endif // COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_H_
diff --git a/src/algorithms/rappor/rappor_analyzer_manual_tests.cc b/src/algorithms/rappor/rappor_analyzer_manual_tests.cc
deleted file mode 100644
index 9ec0ab8..0000000
--- a/src/algorithms/rappor/rappor_analyzer_manual_tests.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/rappor/rappor_analyzer_test.h"
-
-namespace cobalt::rappor {
-
-// Comparison of Analyze and simple least squares.
-// It invokes Analyze() in a few very simple cases, checks that the the
-// algorithm converges and that the result vector has the correct size. For each
-// case, it also computes the least squares solution using QR for exactly the
-// same system and prints both solutions (note that the least squares solution
-// is not always unique).
-TEST_F(RapporAnalyzerTest, CompareAnalyzeToRegression) {
- static const uint32_t kNumCandidates = 10;
- static const uint32_t kNumCohorts = 2;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 8;
-
- std::vector<int> candidate_indices(100, 5); // NOLINT
- std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0}; // NOLINT
- CompareAnalyzeToSimpleRegression("p=0, q=1, only candidate 5", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts);
-
- candidate_indices = std::vector<int>(20, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 20, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 60, 9); // NOLINT
- true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60}; // NOLINT
- CompareAnalyzeToSimpleRegression("p=0, q=1, several candidates", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts);
-
- prob_0_becomes_1_ = 0.1; // NOLINT
- prob_1_stays_1_ = 0.9; // NOLINT
-
- candidate_indices = std::vector<int>(100, 5); // NOLINT
- true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0}; // NOLINT
- CompareAnalyzeToSimpleRegression("p=0.1, q=0.9, only candidate 5", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts);
-
- candidate_indices = std::vector<int>(20, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 20, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 60, 9); // NOLINT
- true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60}; // NOLINT
- CompareAnalyzeToSimpleRegression("p=0.1, q=0.9, several candidates", kNumCandidates,
- kNumBloomBits, kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts);
-}
-
-// Runs LongExperimentWithAnalyze; the true candidate counts are
-// distributed according to the power law; we specify the number of
-// observations and the exponent parameter of the power law. The ids are then
-// shuffled so that it is not true that large ids are more frequent.
-// Note: encoding observations is time consuming so large tests may take long.
-TEST_F(RapporAnalyzerTest, PowerLawExperiment) {
- static const uint32_t kNumCandidates = 20000;
- static const uint32_t kNumCohorts = 128;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 128;
- static const uint32_t kNumObservations = 1e+6;
- static const bool print_estimates = false;
- const double exponent = 30;
- const auto max_id = static_cast<const int>(kNumCandidates - 1);
-
- std::vector<int> candidate_indices(kNumObservations);
- std::vector<int> true_candidate_counts(kNumCandidates, 0);
-
- // Create a "map" of shuffled ids to randomize the observed id values
- std::vector<int> candidate_ids_list_shuffled(kNumCandidates);
- std::iota(candidate_ids_list_shuffled.begin(), candidate_ids_list_shuffled.end(), 0);
- std::mt19937 g(random_dev_());
- std::shuffle(candidate_ids_list_shuffled.begin(), candidate_ids_list_shuffled.end(), g);
-
- // Generate observations from the power law distribution on
- // [0,kNumCandidates-1]
- const double left = 0.0;
- const auto right = static_cast<const double>(max_id);
- for (uint32_t i = 0; i < kNumObservations; i++) {
- double random_power_law_number = GenerateNumberFromPowerLaw(left, right, exponent);
- auto observed_candidate_id = static_cast<int>(random_power_law_number);
- observed_candidate_id = std::min(observed_candidate_id, max_id);
- observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
- candidate_indices[i] = observed_candidate_id;
- true_candidate_counts[observed_candidate_id]++;
- }
-
- prob_0_becomes_1_ = 0.05; // NOLINT
- prob_1_stays_1_ = 0.95; // NOLINT
-
- LongExperimentWithAnalyze("p=0.05, q=0.95, power-law distribution", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-
- prob_0_becomes_1_ = 0.25; // NOLINT
- prob_1_stays_1_ = 0.75; // NOLINT
-
- LongExperimentWithAnalyze("p=0.25, q=0.75, power-law distribution", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-}
-
-// This is the same as PowerLawExperiment but the distribution of observations
-// is exponential.
-TEST_F(RapporAnalyzerTest, ExponentialExperiment) {
- static const uint32_t kNumCandidates = 300;
- static const uint32_t kNumCohorts = 2;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 128;
- static const uint32_t kNumObservations = 1e+6;
- static const bool print_estimates = true;
- static const double lambda = 1.0; // the support of pdf for lambda == 1.0 ...
- static const double approximate_max_generated_num = 6.0; // ... is roughly [0, 6.0]
- const auto max_id = static_cast<const int>(kNumCandidates - 1);
-
- std::vector<int> candidate_indices(kNumObservations);
- std::vector<int> true_candidate_counts(kNumCandidates, 0);
-
- // Create a "map" of shuffled ids to randomize the observed id values
- std::vector<int> candidate_ids_list_shuffled = GenerateRandomMapOfIds(kNumCandidates);
-
- // Generate observations from the exponential distribution on
- // [0,kNumCandidates-1]
- std::exponential_distribution<double> exp_distribution(lambda);
- for (uint32_t i = 0; i < kNumObservations; i++) {
- double random_exponential_number = exp_distribution(random_dev_);
- random_exponential_number /= approximate_max_generated_num;
- random_exponential_number *= static_cast<double>(kNumCandidates);
- auto observed_candidate_id = static_cast<int>(random_exponential_number);
- observed_candidate_id = std::min(observed_candidate_id, max_id);
- observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
- candidate_indices[i] = observed_candidate_id;
- true_candidate_counts[observed_candidate_id]++;
- }
-
- prob_0_becomes_1_ = 0.05; // NOLINT
- prob_1_stays_1_ = 0.95; // NOLINT
-
- LongExperimentWithAnalyze("p=0.05, q=0.95, exponential distribution", kNumCandidates,
- kNumBloomBits, kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts, print_estimates);
-
- prob_0_becomes_1_ = 0.25; // NOLINT
- prob_1_stays_1_ = 0.75; // NOLINT
-
- LongExperimentWithAnalyze("p=0.25, q=0.75, exponential distribution", kNumCandidates,
- kNumBloomBits, kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts, print_estimates);
-}
-
-// This is the same as PowerLawExperiment but the distribution of observations
-// comes from normal distribution.
-TEST_F(RapporAnalyzerTest, NormalDistExperiment) {
- static const uint32_t kNumCandidates = 100;
- static const uint32_t kNumCohorts = 2;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 32;
- static const uint32_t kNumObservations = 1e+5;
- static const bool print_estimates = true;
- const auto mean = static_cast<double>(kNumCandidates) / 2.0;
- // Most probablility weight is within +/- 3 standard deviations.
- const auto sd = static_cast<const double>(mean / 10);
- const auto max_id = static_cast<const int>(kNumCandidates - 1);
-
- std::vector<int> candidate_indices(kNumObservations);
- std::vector<int> true_candidate_counts(kNumCandidates, 0);
-
- // Create a "map" of shuffled ids to randomize the observed id values.
- std::vector<int> candidate_ids_list_shuffled = GenerateRandomMapOfIds(kNumCandidates);
-
- // Generate observations from the normal distribution.
- std::normal_distribution<double> nrm_distribution(mean, sd);
- for (uint32_t i = 0; i < kNumObservations; i++) {
- double random_normal_number = nrm_distribution(random_dev_);
- int observed_candidate_id = static_cast<int>(random_normal_number);
- observed_candidate_id = std::max(0, std::min(observed_candidate_id, max_id));
- observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
- candidate_indices[i] = observed_candidate_id;
- true_candidate_counts[observed_candidate_id]++;
- }
-
- prob_0_becomes_1_ = 0.05; // NOLINT
- prob_1_stays_1_ = 0.95; // NOLINT
-
- LongExperimentWithAnalyze("p=0.05, q=0.95, normal distribution", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-
- prob_0_becomes_1_ = 0.25; // NOLINT
- prob_1_stays_1_ = 0.75; // NOLINT
-
- LongExperimentWithAnalyze("p=0.25, q=0.75, normal distribution", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-}
-
-// This is the same as PowerLawExperiment but the observations come from a
-// uniform distribution supported on some small set of candidates.
-TEST_F(RapporAnalyzerTest, KOutOfNExperiment) {
- // For this test to be meaningful we should have kNumObservedCandidates <<
- // kNumCandidates
- static const uint32_t kNumCandidates = 2000;
- static const uint32_t kNumObservedCandidates = 10;
- static const uint32_t kNumCohorts = 50;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 32;
- static const uint32_t kNumObservations = 1e+5;
- static const bool print_estimates = true;
- const int max_id = static_cast<const int>(kNumCandidates - 1);
-
- std::vector<int> candidate_indices(kNumObservations);
- std::vector<int> true_candidate_counts(kNumCandidates, 0);
-
- // Create a "map" of shuffled ids to randomize the observed id values
- std::vector<int> candidate_ids_list_shuffled = GenerateRandomMapOfIds(kNumCandidates);
-
- // Generate observations
- for (uint32_t i = 0; i < kNumObservations; i++) {
- int observed_candidate_id = static_cast<int>(i % kNumObservedCandidates);
- observed_candidate_id = std::min(observed_candidate_id, max_id);
- observed_candidate_id = candidate_ids_list_shuffled[observed_candidate_id];
- candidate_indices[i] = observed_candidate_id;
- true_candidate_counts[observed_candidate_id]++;
- }
-
- prob_0_becomes_1_ = 0.05; // NOLINT
- prob_1_stays_1_ = 0.95; // NOLINT
-
- LongExperimentWithAnalyze("p=0.05, q=0.95, k out of N distribution", kNumCandidates,
- kNumBloomBits, kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts, print_estimates);
-
- prob_0_becomes_1_ = 0.25; // NOLINT
- prob_1_stays_1_ = 0.75; // NOLINT
-
- LongExperimentWithAnalyze("p=0.25, q=0.75, k out of N distribution", kNumCandidates,
- kNumBloomBits, kNumCohorts, kNumHashes, candidate_indices,
- true_candidate_counts, print_estimates);
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_analyzer_test.cc b/src/algorithms/rappor/rappor_analyzer_test.cc
deleted file mode 100644
index f32ada5..0000000
--- a/src/algorithms/rappor/rappor_analyzer_test.cc
+++ /dev/null
@@ -1,421 +0,0 @@
-// Copyright 2016 The Fuchsia Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "src/algorithms/rappor/rappor_analyzer_test.h"
-
-namespace cobalt::rappor {
-
-using system_data::ClientSecret;
-
-void RapporAnalyzerTest::SetAnalyzer(uint32_t num_candidates, uint32_t num_bloom_bits,
- uint32_t num_cohorts, uint32_t num_hashes) {
- PopulateRapporCandidateList(num_candidates, &candidate_list_);
- config_ = Config(num_bloom_bits, num_cohorts, num_hashes, prob_0_becomes_1_, prob_1_stays_1_);
- analyzer_ = std::make_unique<RapporAnalyzer>(config_, &candidate_list_);
-}
-
-void RapporAnalyzerTest::BuildCandidateMap() {
- EXPECT_EQ(grpc::OK, analyzer_->BuildCandidateMap().error_code());
-
- const uint32_t num_candidates = analyzer_->candidate_map_.candidate_list->candidates_size();
- const uint32_t num_cohorts = analyzer_->config_->num_cohorts();
- const uint32_t num_hashes = analyzer_->config_->num_hashes();
- const uint32_t num_bits = analyzer_->config_->num_bits();
-
- // Expect the number of candidates to be correct,
- EXPECT_EQ(num_candidates, analyzer_->candidate_map_.candidate_cohort_maps.size());
-
- // and for each candidate...
- for (size_t candidate = 0; candidate < num_candidates; candidate++) {
- // expect the number of cohorts to be correct,
- EXPECT_EQ(num_cohorts,
- analyzer_->candidate_map_.candidate_cohort_maps[candidate].cohort_hashes.size());
-
- // and for each cohort...
- for (size_t cohort = 0; cohort < num_cohorts; cohort++) {
- // expect the number of hashes to be correct,
- EXPECT_EQ(num_hashes, analyzer_->candidate_map_.candidate_cohort_maps[candidate]
- .cohort_hashes[cohort]
- .bit_indices.size());
-
- // and for each hash...
- for (size_t hash = 0; hash < num_hashes; hash++) {
- // Expect the bit index to be in the range [0, num_bits).
- auto bit_index = GetCandidateMapValue(candidate, cohort, hash);
- EXPECT_GE(bit_index, 0u);
- EXPECT_LT(bit_index, num_bits);
- }
- }
- }
-
- // Validate the associated sparse matrix.
- EXPECT_EQ(num_candidates, candidate_matrix().cols());
- EXPECT_EQ(num_cohorts * num_bits, candidate_matrix().rows());
- EXPECT_LE(num_candidates * num_cohorts, candidate_matrix().nonZeros());
- EXPECT_GE(num_candidates * num_cohorts * num_hashes, candidate_matrix().nonZeros());
-}
-
-uint16_t RapporAnalyzerTest::GetCandidateMapValue(uint16_t candidate_index, uint16_t cohort_index,
- uint16_t hash_index) {
- EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps.size(), candidate_index);
- EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps[candidate_index].cohort_hashes.size(),
- cohort_index);
- EXPECT_GT(analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
- .cohort_hashes[cohort_index]
- .bit_indices.size(),
- hash_index);
- return analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
- .cohort_hashes[cohort_index]
- .bit_indices[hash_index];
-}
-
-std::string RapporAnalyzerTest::BuildBitString(uint16_t candidate_index, uint16_t cohort_index) {
- return BuildBinaryString(analyzer_->config_->num_bits(),
- analyzer_->candidate_map_.candidate_cohort_maps[candidate_index]
- .cohort_hashes[cohort_index]
- .bit_indices);
-}
-
-void RapporAnalyzerTest::AddObservation(uint32_t cohort, const std::string& binary_string) {
- EXPECT_TRUE(analyzer_->AddObservation(RapporObservationFromString(cohort, binary_string)));
-}
-
-void RapporAnalyzerTest::ExtractEstimatedBitCountRatios(Eigen::VectorXd* est_bit_count_ratios) {
- std::vector<double> est_std_errors_not_used;
- EXPECT_TRUE(analyzer_
- ->ExtractEstimatedBitCountRatiosAndStdErrors(est_bit_count_ratios,
- &est_std_errors_not_used)
- .ok());
-}
-
-void RapporAnalyzerTest::ExtractEstimatedBitCountRatiosAndStdErrors(
- Eigen::VectorXd* est_bit_count_ratios, std::vector<double>* est_std_errors) {
- EXPECT_TRUE(
- analyzer_->ExtractEstimatedBitCountRatiosAndStdErrors(est_bit_count_ratios, est_std_errors)
- .ok());
-}
-
-void RapporAnalyzerTest::AddObservationsForCandidates(const std::vector<int>& candidate_indices) {
- for (auto index : candidate_indices) {
- // Construct a new encoder with a new ClientSecret so that a random
- // cohort is selected.
- RapporEncoder encoder(config_, ClientSecret::GenerateNewSecret());
-
- // Encode the current candidate string using |encoder|.
- ValuePart value_part;
- value_part.set_string_value(CandidateString(index));
- RapporObservation observation;
- encoder.Encode(value_part, &observation);
- EXPECT_TRUE(analyzer_->AddObservation(observation));
- }
-}
-
-int RapporAnalyzerTest::GenerateNumberFromPowerLaw(const double left, const double right,
- const double exponent) {
- // double precision must be used because of potentially large powers taken
- std::uniform_real_distribution<double> uniform_0_1_distribution(0.0, 1.0);
- double random_between_0_1 = uniform_0_1_distribution(random_dev_);
- double left_to_exponent_plus_1 = std::pow(left, exponent + 1);
- double random_power_law_number =
- (std::pow(right, exponent + 1) - left_to_exponent_plus_1) * random_between_0_1 +
- left_to_exponent_plus_1;
- random_power_law_number = std::pow(random_power_law_number, 1.0f / (exponent + 1));
- return static_cast<int>(random_power_law_number);
-}
-
-std::vector<int> RapporAnalyzerTest::GenerateRandomMapOfIds(const int num_candidates) {
- // Create a "map" of shuffled ids to randomize the observed id values
- std::vector<int> candidate_ids_list_shuffled(num_candidates);
- std::iota(candidate_ids_list_shuffled.begin(), candidate_ids_list_shuffled.end(), 0);
- std::mt19937 g(random_dev_());
- std::shuffle(candidate_ids_list_shuffled.begin(), candidate_ids_list_shuffled.end(), g);
- return candidate_ids_list_shuffled;
-}
-
-std::vector<int> RapporAnalyzerTest::CountsEstimatesFromResults(
- const std::vector<CandidateResult>& results) {
- uint32_t num_candidates = results.size();
- std::vector<int> count_estimates(num_candidates);
- for (size_t i = 0; i < num_candidates; i++) {
- count_estimates[i] = static_cast<int>(round(results[i].count_estimate));
- }
- return count_estimates;
-}
-
-Eigen::VectorXd RapporAnalyzerTest::VectorFromCounts(const std::vector<int>& counts) {
- uint32_t num_candidates = counts.size();
- Eigen::VectorXd counts_vector(num_candidates);
- for (size_t i = 0; i < num_candidates; i++) {
- counts_vector(i) = static_cast<double>(counts[i]);
- }
- return counts_vector;
-}
-
-void RapporAnalyzerTest::CheckExactSolution(const std::vector<int>& exact_candidate_counts) {
- Eigen::VectorXd est_bit_count_ratios;
- ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
- Eigen::VectorXd exact_count_vector = VectorFromCounts(exact_candidate_counts);
- EXPECT_EQ(analyzer_->candidate_matrix_.cols(),
- static_cast<const int64_t>(exact_candidate_counts.size()));
- Eigen::VectorXd rhs = analyzer_->candidate_matrix_ * exact_count_vector;
- rhs /= exact_count_vector.sum();
- Eigen::VectorXd difference = rhs - est_bit_count_ratios;
- LOG(ERROR) << "How well does the exact solution reproduce the right hand side?"
- << difference.norm() / rhs.norm();
-}
-
-void RapporAnalyzerTest::PrintTrueCountsAndEstimates(
- const std::string& case_label, uint32_t num_candidates,
- const std::vector<CandidateResult>& results, const std::vector<int>& true_candidate_counts) {
- std::vector<int> count_estimates = CountsEstimatesFromResults(results);
- std::ostringstream true_stream;
- for (size_t i = 0; i < num_candidates; i++) {
- if (true_candidate_counts[i] > 0) {
- true_stream << "beta(" << i << ") == " << true_candidate_counts[i] << std::endl;
- }
- }
- LOG(ERROR) << "-------------------------------------";
- LOG(ERROR) << case_label;
- LOG(ERROR) << "True counts: " << true_stream.str();
- std::ostringstream estimate_stream;
- for (size_t i = 0; i < num_candidates; i++) {
- if (count_estimates[i] > 0) {
- estimate_stream << "beta(" << i << ") == " << count_estimates[i] << std::endl;
- }
- }
- LOG(ERROR) << " Estimates: " << estimate_stream.str();
-}
-
-void RapporAnalyzerTest::AssessUtility(const std::vector<CandidateResult>& results,
- const std::vector<int>& true_candidate_counts) {
- // Get the estimates vector as well as the number of nonzero estimates
- int num_candidates = results.size();
- std::vector<int> estimates_vector = CountsEstimatesFromResults(results);
- int how_many_nonzeros = std::count_if(estimates_vector.begin(), estimates_vector.end(),
- [](const int a) { return a > 0; });
-
- // Sort candidate ids in an ascending order according to their real and
- // estimated values
- std::vector<int> estimated_id_order(num_candidates);
- std::iota(estimated_id_order.begin(), estimated_id_order.end(), 0);
- std::vector<int> true_id_order = estimated_id_order;
-
- std::sort(estimated_id_order.begin(), estimated_id_order.end(),
- [&estimates_vector](const int a, const int b) {
- return estimates_vector[a] > estimates_vector[b];
- });
-
- std::sort(true_id_order.begin(), true_id_order.end(),
- [&true_candidate_counts](const int a, const int b) {
- return true_candidate_counts[a] > true_candidate_counts[b];
- });
-
- // Compute the false positive rates for a grid of values
- LOG(ERROR) << "Identified " << how_many_nonzeros << " nonzero estimates.";
- LOG(ERROR) << "The measure of false positives for identified top n hitters:";
- const std::vector<int> top_hitters_analyzed = {10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000};
- for (auto num_hitters : top_hitters_analyzed) {
- if (num_hitters > num_candidates) {
- break;
- }
- float false_positives = 0;
- auto after_nth_element_it = true_id_order.begin() + num_hitters;
- for (int i = 0; i < num_hitters; i++) {
- auto where_ith_estimated_element =
- std::find(true_id_order.begin(), after_nth_element_it, estimated_id_order[i]);
- if (where_ith_estimated_element == after_nth_element_it) {
- false_positives += 1.0;
- }
- }
- LOG(ERROR) << "The false positive rate at n = " << num_hitters << " is "
- << false_positives / num_hitters;
- }
-}
-
-grpc::Status RapporAnalyzerTest::ComputeLeastSquaresFitQR(
- const Eigen::VectorXd& est_bit_count_ratios, std::vector<CandidateResult>* results) {
- // cast from smaller to larger type for comparisons
- const auto num_candidates = static_cast<const size_t>(analyzer_->candidate_matrix_.cols());
- EXPECT_EQ(results->size(), num_candidates);
- // define the QR solver and perform the QR decomposition followed by
- // least squares solve
- Eigen::SparseQR<Eigen::SparseMatrix<double, Eigen::ColMajor>, Eigen::COLAMDOrdering<int>>
- qrsolver;
-
- EXPECT_EQ(analyzer_->candidate_matrix_.rows(), est_bit_count_ratios.size());
- EXPECT_GT(analyzer_->candidate_matrix_.rows(), 0);
- // explicitly construct Eigen::ColMajor matrix from candidate_matrix_
- // (the documentation for Eigen::SparseQR requires it)
- // compute() as well as Eigen::COLAMDOrdering require compressed
- // matrix
- Eigen::SparseMatrix<double, Eigen::ColMajor> candidate_matrix_col_major =
- analyzer_->candidate_matrix_;
- candidate_matrix_col_major.makeCompressed();
- qrsolver.compute(candidate_matrix_col_major);
- if (qrsolver.info() != Eigen::Success) {
- std::string message = "Eigen::SparseQR decomposition was unsuccessfull";
- return grpc::Status(grpc::INTERNAL, message);
- }
- Eigen::VectorXd result_vals = qrsolver.solve(est_bit_count_ratios);
- if (qrsolver.info() != Eigen::Success) {
- std::string message = "Eigen::SparseQR solve was unsuccessfull";
- return grpc::Status(grpc::INTERNAL, message);
- }
-
- // write to the results vector
- EXPECT_EQ(num_candidates, results->size());
- for (size_t i = 0; i < num_candidates; i++) {
- results->at(i).count_estimate = result_vals[i] * analyzer_->bit_counter_.num_observations();
- results->at(i).std_error = 0;
- }
- return grpc::Status::OK;
-}
-
-void RapporAnalyzerTest::RunSimpleLinearRegressionReference(
- const std::string& case_label, uint32_t num_candidates, uint32_t num_bloom_bits,
- uint32_t num_cohorts, uint32_t num_hashes, const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts) {
- SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
- AddObservationsForCandidates(candidate_indices);
-
- // set up the matrix
- auto status = analyzer_->BuildCandidateMap();
- if (!status.ok()) {
- EXPECT_EQ(grpc::OK, status.error_code());
- return;
- }
- // set up the right hand side of the equation
- Eigen::VectorXd est_bit_count_ratios;
- ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
-
- std::vector<CandidateResult> results(num_candidates);
- status = ComputeLeastSquaresFitQR(est_bit_count_ratios, &results);
- if (!status.ok()) {
- EXPECT_EQ(grpc::OK, status.error_code());
- return;
- }
-
- PrintTrueCountsAndEstimates(case_label, num_candidates, results, true_candidate_counts);
-}
-
-void RapporAnalyzerTest::ShortExperimentWithAnalyze(
- const std::string& case_label, uint32_t num_candidates, uint32_t num_bloom_bits,
- uint32_t num_cohorts, uint32_t num_hashes, const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts, const bool print_estimates) {
- SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto status = analyzer_->Analyze(&results);
- if (!status.ok()) {
- EXPECT_EQ(grpc::OK, status.error_code());
- return;
- }
-
- if (results.size() != num_candidates) {
- EXPECT_EQ(num_candidates, results.size());
- return;
- }
-
- if (print_estimates) {
- PrintTrueCountsAndEstimates(case_label, num_candidates, results, true_candidate_counts);
- }
-}
-
-void RapporAnalyzerTest::LongExperimentWithAnalyze(const std::string& case_label,
- uint32_t num_candidates, uint32_t num_bloom_bits,
- uint32_t num_cohorts, uint32_t num_hashes,
- const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts,
- const bool print_estimates) {
- SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto start_analyze_time = std::chrono::steady_clock::now();
- auto status = analyzer_->Analyze(&results);
- if (!status.ok()) {
- EXPECT_EQ(grpc::OK, status.error_code());
- return;
- }
-
- auto end_analyze_time = std::chrono::steady_clock::now();
- LOG(ERROR) << "Analyze() took "
- << std::chrono::duration_cast<std::chrono::seconds>(end_analyze_time -
- start_analyze_time)
- .count()
- << " seconds.";
-
- if (results.size() != num_candidates) {
- EXPECT_EQ(num_candidates, results.size());
- return;
- }
- CheckExactSolution(true_candidate_counts);
-
- if (print_estimates) {
- PrintTrueCountsAndEstimates(case_label, num_candidates, results, true_candidate_counts);
- }
- AssessUtility(results, true_candidate_counts);
-}
-
-void RapporAnalyzerTest::CompareAnalyzeToSimpleRegression(
- const std::string& case_label, uint32_t num_candidates, uint32_t num_bloom_bits,
- uint32_t num_cohorts, uint32_t num_hashes, const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts) {
- SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
- AddObservationsForCandidates(candidate_indices);
-
- // compute and print the results of Analyze()
- std::vector<CandidateResult> results_analyze;
- auto status = analyzer_->Analyze(&results_analyze);
-
- if (!status.ok()) {
- EXPECT_EQ(grpc::OK, status.error_code());
- return;
- }
-
- if (results_analyze.size() != num_candidates) {
- EXPECT_EQ(num_candidates, results_analyze.size());
- return;
- }
-
- std::string print_label = case_label + " analyze ";
- PrintTrueCountsAndEstimates(print_label, num_candidates, results_analyze, true_candidate_counts);
-
- // compute and print the results for simple linear regression
- Eigen::VectorXd est_bit_count_ratios;
- ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
-
- print_label = case_label + " least squares ";
- std::vector<CandidateResult> results_ls(num_candidates);
- status = ComputeLeastSquaresFitQR(est_bit_count_ratios, &results_ls);
- if (!status.ok()) {
- EXPECT_EQ(grpc::OK, status.error_code());
- return;
- }
- PrintTrueCountsAndEstimates(print_label, num_candidates, results_ls, true_candidate_counts);
-}
-
-RapporConfig config_;
-std::unique_ptr<RapporAnalyzer> analyzer_;
-
-RapporCandidateList candidate_list_;
-
-// By default this test uses p=0, q=1. Individual tests may override this.
-double prob_0_becomes_1_ = 0.0;
-double prob_1_stays_1_ = 1.0;
-
-// Random device
-std::random_device random_dev_;
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_analyzer_test.h b/src/algorithms/rappor/rappor_analyzer_test.h
deleted file mode 100644
index c45cfc7..0000000
--- a/src/algorithms/rappor/rappor_analyzer_test.h
+++ /dev/null
@@ -1,168 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_TEST_H_
-#define COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_TEST_H_
-
-#include <algorithm>
-#include <chrono>
-#include <memory>
-#include <random>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-
-#include "src/algorithms/rappor/rappor_analyzer.h"
-#include "src/algorithms/rappor/rappor_encoder.h"
-#include "src/algorithms/rappor/rappor_test_utils.h"
-#include "src/system_data/client_secret.h"
-#include "third_party/googletest/googletest/include/gtest/gtest.h"
-
-#include "third_party/eigen/Eigen/SparseQR"
-
-namespace cobalt {
-namespace rappor {
-
-class RapporAnalyzerTest : public ::testing::Test {
- protected:
- // Sets the member variable analyzer_ to a new RapporAnalyzer configured
- // with the given arguments and the current values of prob_0_becomes_1_,
- // prob_1_stays_1_.
- void SetAnalyzer(uint32_t num_candidates, uint32_t num_bloom_bits, uint32_t num_cohorts,
- uint32_t num_hashes);
-
- void BuildCandidateMap();
-
- // This should be invoked after BuildCandidateMap. It returns the bit index
- // within the CandidateMap for the given |candidate_index|, |cohort_index|,
- // and |hash_index|.
- uint16_t GetCandidateMapValue(uint16_t candidate_index, uint16_t cohort_index,
- uint16_t hash_index);
-
- // Builds and returns a bit string (i.e. a string of ASCII '0's and '1's)
- // representing the Bloom filter implicitly stored within the CandidateMap
- // for the given |candidate_index| and |cohort_index|.
- std::string BuildBitString(uint16_t candidate_index, uint16_t cohort_index);
-
- const Eigen::SparseMatrix<double, Eigen::RowMajor>& candidate_matrix() {
- return analyzer_->candidate_matrix_;
- }
-
- void AddObservation(uint32_t cohort, const std::string& binary_string);
-
- void ExtractEstimatedBitCountRatios(Eigen::VectorXd* est_bit_count_ratios);
-
- void ExtractEstimatedBitCountRatiosAndStdErrors(Eigen::VectorXd* est_bit_count_ratios,
- std::vector<double>* est_std_errors);
-
- void AddObservationsForCandidates(const std::vector<int>& candidate_indices);
-
- // Generate a random number from power law distribution on the interval
- // [|left|,|right|] with given |exponent|.
- int GenerateNumberFromPowerLaw(double left, double right, double exponent);
-
- // Generate a "map" of shuffled Ids, that is, a vector of size
- // |num_candidates| containing exactly the numbers 0,1,...,num_canidates - 1,
- // in a random order.
- std::vector<int> GenerateRandomMapOfIds(int num_candidates);
-
- std::vector<int> CountsEstimatesFromResults(const std::vector<CandidateResult>& results);
-
- Eigen::VectorXd VectorFromCounts(const std::vector<int>& exact_candidate_counts);
-
- // Checks how well the |exact_candidate_counts| reproduces the right hand side
- // of the equation solved by Analyze(). The function prints the value of
- // d == || A * x_s - b || / || b || where x_s == |exact_candidate_counts| /
- // num_observations, b is the vector of the estimated bit count ratios from
- // ExactEstimatedBitCountRatios(), and A == analyzer_->candidate_matrix_ (it
- // must be valid, the function does not build A).
- // The purpose of this function is to assess how much information is lost by
- // both encoding and the assumption that cohorts have equal ratios. d == 0
- // means that exact_candidate_count is among the solutions to the problem
- // being solved, so values much larger than 0 suggest that getting the right
- // counts in the Analyzer is difficult (if exact_candidate_counts is our a
- // priori knowledge of the solution). Note: d does not account for
- // non-uniqeness of the solution of A * x_s = b, just checks how well the
- // given vector solves it.
- void CheckExactSolution(const std::vector<int>& exact_candidate_counts);
-
- void PrintTrueCountsAndEstimates(const std::string& case_label, uint32_t num_candidates,
- const std::vector<CandidateResult>& results,
- const std::vector<int>& true_candidate_counts);
-
- // Assess utility of |results|. The informal measure suggested by mironov is:
- // "Largest n such that at most 10% of the n highest hitters are identified as
- // such incorrectly (are false positives)". Obviously, this makes sense only
- // for n in some range (it may unjusty suggest that the results are bad for n
- // too small, or for n too large, that they are good when in fact they are
- // not). Also, 10% is arbitrary. So instead, we just compute the false
- // positive rates for n in some grid set. We also print the total number of
- // nonzero estimates identified.
- void AssessUtility(const std::vector<CandidateResult>& results,
- const std::vector<int>& true_candidate_counts);
-
- // Computes the least squares fit on the candidate matrix using QR,
- // for the given rhs in |est_bit_count_ratios| and saves it to |results|
- grpc::Status ComputeLeastSquaresFitQR(const Eigen::VectorXd& est_bit_count_ratios,
- std::vector<CandidateResult>* results);
-
- // Runs a simple least squares problem for Ax = b on the candidate matrix
- // using QR algorithm from eigen library; this is to see the results
- // without penalty terms (note: in an overdetermined system the solution
- // is not unique so this is more a helper testing function to cross-check
- // the behavior of regression without penalty)
- void RunSimpleLinearRegressionReference(const std::string& case_label, uint32_t num_candidates,
- uint32_t num_bloom_bits, uint32_t num_cohorts,
- uint32_t num_hashes,
- const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts);
-
- // Invokes the Analyze() method using the given parameters. Checks that
- // the algorithms converges and that the result vector has the correct length.
- void ShortExperimentWithAnalyze(const std::string& case_label, uint32_t num_candidates,
- uint32_t num_bloom_bits, uint32_t num_cohorts,
- uint32_t num_hashes, const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts,
- bool print_estimates);
-
- // Same as ShortExperimentWithAnalyze() but also measures the time taken by
- // Analyze(), calls CheckExactSolution() to assess the loss of information,
- // and calls AssessUtility() to compare the results with true counts.
- void LongExperimentWithAnalyze(const std::string& case_label, uint32_t num_candidates,
- uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes,
- const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts,
- bool print_estimates);
-
- // Similar to ShortExperimentWithAnalyze except it also
- // computes the estimates for both Analyze and simple regression using QR,
- // which is computed on exactly the same system.
- void CompareAnalyzeToSimpleRegression(const std::string& case_label, uint32_t num_candidates,
- uint32_t num_bloom_bits, uint32_t num_cohorts,
- uint32_t num_hashes,
- const std::vector<int>& candidate_indices,
- const std::vector<int>& true_candidate_counts);
-
- private:
- RapporCandidateList candidate_list_;
-
- protected:
- RapporConfig config_; // NOLINT
- std::unique_ptr<RapporAnalyzer> analyzer_; // NOLINT
-
- // By default this test uses p=0, q=1. Individual tests may override this.
- float prob_0_becomes_1_ = 0.0; // NOLINT
- float prob_1_stays_1_ = 1.0; // NOLINT
-
- // Random device
- std::random_device random_dev_; // NOLINT
-};
-
-} // namespace rappor
-} // namespace cobalt
-
-#endif // COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_TEST_H_
diff --git a/src/algorithms/rappor/rappor_analyzer_unit_tests.cc b/src/algorithms/rappor/rappor_analyzer_unit_tests.cc
deleted file mode 100644
index fde224e..0000000
--- a/src/algorithms/rappor/rappor_analyzer_unit_tests.cc
+++ /dev/null
@@ -1,470 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/rappor/rappor_analyzer_test.h"
-
-namespace cobalt::rappor {
-
-using system_data::ClientSecret;
-
-// Tests the function BuildCandidateMap. We build one small CandidateMap and
-// then we explicitly check every value against a known value. We have not
-// independently verified the SHA-256 hash values and so rather than a test
-// of correctness this is firstly a sanity test: we can eyeball the values
-// and confirm they look sane, and secondly a regression test.
-TEST_F(RapporAnalyzerTest, BuildCandidateMapSmallTest) {
- static const uint32_t kNumCandidates = 5;
- static const uint32_t kNumCohorts = 3;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 8;
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
- BuildCandidateMap();
-
- // clang-format off
- int expected_bit_indices[kNumCandidates][kNumCohorts*kNumHashes] = {
- // cihj means cohort = i and hash-index = j.
- // c0h0 c0h1 c1h0 c1h1 c2h0 c2h2
- // NOLINTNEXTLINE
- {3, 5, 2, 6, 3, 6}, // candidate 0
- // NOLINTNEXTLINE
- {1, 5, 4, 7, 2, 0}, // candidate 1
- // NOLINTNEXTLINE
- {3, 0, 2, 0, 1, 4}, // candidate 2
- // NOLINTNEXTLINE
- {5, 1, 2, 4, 2, 4}, // candidate 3
- // NOLINTNEXTLINE
- {1, 4, 3, 1, 2, 6}, // candidate 4
- };
- // clang-format on
-
- for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
- for (size_t cohort = 0; cohort < kNumCohorts; cohort++) {
- for (size_t hash = 0; hash < kNumHashes; hash++) {
- EXPECT_EQ(expected_bit_indices[candidate][cohort * kNumHashes + hash],
- GetCandidateMapValue(candidate, cohort, hash))
- << "(" << candidate << "," << cohort * kNumHashes + hash << ")";
- }
- }
- }
-
- // Check the associated sparse matrix.
- std::ostringstream stream;
- stream << candidate_matrix().block(0, 0, kNumCohorts * kNumBloomBits, kNumCandidates);
- const char* kExpectedMatrixString =
- "0 0 0 0 0 \n"
- "0 0 0 0 0 \n"
- "1 1 0 1 0 \n"
- "0 0 0 0 1 \n"
- "1 0 1 0 0 \n"
- "0 0 0 0 0 \n"
- "0 1 0 1 1 \n"
- "0 0 1 0 0 \n"
- "0 1 0 0 0 \n"
- "1 0 0 0 0 \n"
- "0 0 0 0 0 \n"
- "0 1 0 1 0 \n"
- "0 0 0 0 1 \n"
- "1 0 1 1 0 \n"
- "0 0 0 0 1 \n"
- "0 0 1 0 0 \n"
- "0 0 0 0 0 \n"
- "1 0 0 0 1 \n"
- "0 0 0 0 0 \n"
- "0 0 1 1 0 \n"
- "1 0 0 0 0 \n"
- "0 1 0 1 1 \n"
- "0 0 1 0 0 \n"
- "0 1 0 0 0 \n";
- EXPECT_EQ(kExpectedMatrixString, stream.str());
-}
-
-// This test is identical to the previous test except that kNumBloomBits = 4
-// instead of 8. The purpose of this test is to force the situation in which
-// the two hash functions for a given cohort and a given candidate give the
-// same value. For example below we see that for candidate 0, cohort 1, both
-// hash functions yielded a 2. We want to test that the associated sparse
-// matrix has a "1" in the corresponding position (in this case that is
-// row 5, column 0) and does not have a "2" in that position. In other words
-// we want to test that we correctly added only one entry to the list of
-// triples that defined the sparse matrix and not two entries.
-TEST_F(RapporAnalyzerTest, BuildCandidateMapSmallTestWithDuplicates) {
- static const uint32_t kNumCandidates = 5;
- static const uint32_t kNumCohorts = 3;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 4;
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
- BuildCandidateMap();
-
- // clang-format off
- int expected_bit_indices[kNumCandidates][kNumCohorts*kNumHashes] = {
- // cihj means cohort = i and hash-index = j.
- // c0h0 c0h1 c1h0 c1h1 c2h0 c2h2
- {3, 1, 2, 2, 3, 2}, // candidate 0
- {1, 1, 0, 3, 2, 0}, // candidate 1
- {3, 0, 2, 0, 1, 0}, // candidate 2
- {1, 1, 2, 0, 2, 0}, // candidate 3
- {1, 0, 3, 1, 2, 2}, // candidate 4
- };
- // clang-format on
-
- for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
- for (size_t cohort = 0; cohort < kNumCohorts; cohort++) {
- for (size_t hash = 0; hash < kNumHashes; hash++) {
- EXPECT_EQ(expected_bit_indices[candidate][cohort * kNumHashes + hash],
- GetCandidateMapValue(candidate, cohort, hash))
- << "(" << candidate << "," << cohort * kNumHashes + hash << ")";
- }
- }
- }
-
- // Check the associated sparse matrix.
- std::ostringstream stream;
- stream << candidate_matrix().block(0, 0, kNumCohorts * kNumBloomBits, kNumCandidates);
- const char* kExpectedMatrixString =
- "1 0 1 0 0 \n"
- "0 0 0 0 0 \n"
- "1 1 0 1 1 \n"
- "0 0 1 0 1 \n"
- "0 1 0 0 1 \n"
- "1 0 1 1 0 \n"
- "0 0 0 0 1 \n"
- "0 1 1 1 0 \n"
- "1 0 0 0 0 \n"
- "1 1 0 1 1 \n"
- "0 0 1 0 0 \n"
- "0 1 1 1 0 \n";
- EXPECT_EQ(kExpectedMatrixString, stream.str());
-}
-
-// Tests the function BuildCandidateMap. We build many different CandidateMaps
-// with many different parameters. We are testing firstly that the procedure
-// completes without error, secondly that the shape of the produced
-// data structure is correct and thirdly that the bit indexes are in the range
-// [0, num_bloom_bits). The latter two checks occur inside of
-// BuildCandidateMap.
-TEST_F(RapporAnalyzerTest, BuildCandidateMapSmokeTest) {
- for (auto num_candidates : {11, 51, 99}) {
- for (auto num_cohorts : {23, 45}) {
- for (auto num_hashes : {2, 6, 7}) {
- for (auto num_bloom_bits : {16, 128}) {
- SetAnalyzer(num_candidates, num_bloom_bits, num_cohorts, num_hashes);
- BuildCandidateMap();
- }
- }
- }
- }
-}
-
-// Tests the function BuildCandidateMap. We test that the map that is built
-// is consistent with the Bloom filters that are built by an encoder.
-TEST_F(RapporAnalyzerTest, BuildCandidateMapCompareWithEncoder) {
- static const uint32_t kNumCandidates = 10;
- static const uint32_t kNumCohorts = 20;
- static const uint32_t kNumHashes = 5;
- static const uint32_t kNumBloomBits = 64;
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
- BuildCandidateMap();
-
- for (size_t candidate = 0; candidate < kNumCandidates; candidate++) {
- // Construct a new encoder with a new ClientSecret so that a random
- // cohort is selected.
- RapporEncoder encoder(config_, ClientSecret::GenerateNewSecret());
-
- // Encode the current candidate string using |encoder|.
- ValuePart value_part;
- value_part.set_string_value(CandidateString(candidate));
- RapporObservation observation;
- encoder.Encode(value_part, &observation);
-
- // Since p=0 and q=1 the RapporObservation contains the raw Bloom filter
- // with no noise added. Confirm that the BloomFilter is the same as
- // the one implied by the CandidateMap at the appropriate candidate
- // and cohort.
- EXPECT_EQ(BuildBitString(candidate, encoder.cohort()), DataToBinaryString(observation.data()));
- }
-}
-
-// Tests the function ExtractEstimatedBitCountRatios(). We build one small
-// estimated bit count ratio vector and explicitly check its values. We
-// use no-randomness: p = 0, q = 1 so that the estimated bit counts are
-// identical to the true bit counts.
-TEST_F(RapporAnalyzerTest, ExtractEstimatedBitCountRatiosSmallNonRandomTest) {
- static const uint32_t kNumCandidates = 10;
- static const uint32_t kNumCohorts = 3;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 8;
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
- AddObservation(0, "00001010");
- AddObservation(0, "00010010");
- AddObservation(1, "00001010");
- AddObservation(1, "00010010");
- AddObservation(1, "00100010");
- AddObservation(2, "00001010");
- AddObservation(2, "00010010");
- AddObservation(2, "00010010");
- AddObservation(2, "00100010");
-
- Eigen::VectorXd est_bit_count_ratios;
- ExtractEstimatedBitCountRatios(&est_bit_count_ratios);
-
- std::ostringstream stream;
- stream << est_bit_count_ratios.block(0, 0, kNumCohorts * kNumBloomBits, 1);
-
- const char* kExpectedVectorString =
- " 0\n"
- " 0\n"
- " 0\n"
- " 0.5\n"
- " 0.5\n"
- " 0\n"
- " 1\n"
- " 0\n"
- " 0\n"
- " 0\n"
- "0.333333\n"
- "0.333333\n"
- "0.333333\n"
- " 0\n"
- " 1\n"
- " 0\n"
- " 0\n"
- " 0\n"
- " 0.25\n"
- " 0.5\n"
- " 0.25\n"
- " 0\n"
- " 1\n"
- " 0";
- EXPECT_EQ(kExpectedVectorString, stream.str());
-}
-
-// This test invokes Analyze() in a few very simple cases, checks that the the
-// algorithm converges and that the result vector has the correct size.
-TEST_F(RapporAnalyzerTest, SimpleAnalyzeTest) {
- static const uint32_t kNumCandidates = 10;
- static const uint32_t kNumCohorts = 3;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 8;
- static const bool print_estimates = false;
-
- std::vector<int> candidate_indices(100, 5); // NOLINT
- std::vector<int> true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0}; // NOLINT
- ShortExperimentWithAnalyze("p=0, q=1, only candidate 5", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-
- candidate_indices = std::vector<int>(20, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 20, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 60, 9); // NOLINT
- true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60}; // NOLINT
- ShortExperimentWithAnalyze("p=0, q=1, several candidates", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-
- prob_0_becomes_1_ = 0.1; // NOLINT
- prob_1_stays_1_ = 0.9; // NOLINT
-
- candidate_indices = std::vector<int>(100, 5); // NOLINT
- true_candidate_counts = {0, 0, 0, 0, 0, 100, 0, 0, 0, 0}; // NOLINT
- ShortExperimentWithAnalyze("p=0.1, q=0.9, only candidate 5", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-
- candidate_indices = std::vector<int>(20, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 20, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 60, 9); // NOLINT
- true_candidate_counts = {0, 20, 0, 0, 20, 0, 0, 0, 0, 60}; // NOLINT
- ShortExperimentWithAnalyze("p=0.1, q=0.9, several candidates", kNumCandidates, kNumBloomBits,
- kNumCohorts, kNumHashes, candidate_indices, true_candidate_counts,
- print_estimates);
-}
-
-// Runs Analyze() on the corner-case of one candidate and one cohort.
-// The result should be exact.
-TEST_F(RapporAnalyzerTest, SingleCandidateTest) {
- static const uint32_t kNumCandidates = 1;
- static const uint32_t kNumCohorts = 1;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 8;
-
- // No noise introduced
- prob_0_becomes_1_ = 0.0;
- prob_1_stays_1_ = 1.0;
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
-
- std::vector<int> candidate_indices(100, 0); // NOLINT
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto status = analyzer_->Analyze(&results);
-
- EXPECT_EQ(results.size(), kNumCandidates);
- EXPECT_GE(results[0].count_estimate, 99);
- EXPECT_LE(results[0].count_estimate, 101);
-}
-
-// Runs Analyze with one cohort and the number of bits much larger than the
-// number of candidates. No noise is introduced. The heavy hitter should be
-// reproduced nearly exactly (assuming second step of RAPPOR is "on" and
-// working). Note(bazyli): this test is based on the assumption that the number
-// of bits in the cohort is so large that no or almost no of the candidates will
-// share a non-zero bit with the heavy hitter.
-TEST_F(RapporAnalyzerTest, ExactSolutionTest) {
- // The probability that the heaviest hitter will not share a nonzero bit with
- // any other candidate when kNumCandidates = 10, kNumCohorts = 1, kNumHashes =
- // 2, kNumBloomBits = 128, is (125 * 126 / (127 * 128))^ 9 = 0.75. In that
- // case, the heaviest hitter must be solved exactly. Even if it does share a
- // bit or two with some other candidate(s), this is unlikely to alter the
- // result significantly.
- static const uint32_t kNumCandidates = 10;
- static const uint32_t kNumCohorts = 1;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 128;
-
- // No noise introduced.
- prob_0_becomes_1_ = 0.0;
- prob_1_stays_1_ = 1.0;
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
-
- std::vector<int> candidate_indices(10, 0); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 2); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 3); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 9); // NOLINT
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto status = analyzer_->Analyze(&results);
-
- // Check that the heavy hitter is reproduced almost exactly
- EXPECT_GE(results[9].count_estimate, 95);
- EXPECT_LE(results[9].count_estimate, 105);
-
- // Candidates that are not observed should have zero counts
- EXPECT_EQ(results[5].count_estimate, 0);
- EXPECT_EQ(results[6].count_estimate, 0);
- EXPECT_EQ(results[7].count_estimate, 0);
- EXPECT_EQ(results[8].count_estimate, 0);
-
- // All standard errors should be numerically 0
- for (uint32_t i = 0; i < kNumCandidates; i++) {
- EXPECT_LE(results[i].std_error, 1e-12);
- }
-}
-
-// Runs Analyzer with two cohorts and large number of bits. Very small noise is
-// introduced. The heaviest hitter should be reproduced very well (assuming
-// second step of RAPPOR is "on" and working). This test is based on similar
-// assumptions that ExactSolutionTest but the check is relaxed because of noise
-// and multiple cohorts.
-TEST_F(RapporAnalyzerTest, HighAccuracySolutionTest) {
- static const uint32_t kNumCandidates = 10;
- static const uint32_t kNumCohorts = 2;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 64;
-
- // Small noise introduced.
- prob_0_becomes_1_ = 0.1; // NOLINT
- prob_1_stays_1_ = 0.9; // NOLINT
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
-
- std::vector<int> candidate_indices(10, 0); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 2); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 3); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 10, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 9); // NOLINT
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto status = analyzer_->Analyze(&results);
-
- EXPECT_GE(results[9].count_estimate, 70);
- EXPECT_LE(results[9].count_estimate, 120);
-}
-
-// Runs Analyzer on a moderately under-determined system (number of rows <
-// number of columns) with more than one cohort. Small noise is introduced.
-// There are a number of (equal) heavy hitters, which should be identified.
-// TODO(bazyli): this test is heuristic.
-TEST_F(RapporAnalyzerTest, KHeavyHittersTest) {
- static const uint32_t kNumCandidates = 100;
- static const uint32_t kNumCohorts = 4;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 16;
-
- // Small noise introduced
- prob_0_becomes_1_ = 0.1; // NOLINT
- prob_1_stays_1_ = 0.9; // NOLINT
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
-
- std::vector<int> candidate_indices;
- candidate_indices.insert(candidate_indices.end(), 100, 1); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 2); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 4); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 8); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 16); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 32); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 100, 64); // NOLINT
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto status = analyzer_->Analyze(&results);
-
- // Check if the heavy hitters are identified
- EXPECT_GE(results[1].count_estimate, 50);
- EXPECT_LE(results[1].count_estimate, 150);
- EXPECT_GE(results[2].count_estimate, 50);
- EXPECT_LE(results[2].count_estimate, 150);
- EXPECT_GE(results[4].count_estimate, 50);
- EXPECT_LE(results[4].count_estimate, 150);
- EXPECT_GE(results[8].count_estimate, 50);
- EXPECT_LE(results[8].count_estimate, 150);
- EXPECT_GE(results[16].count_estimate, 50);
- EXPECT_LE(results[16].count_estimate, 150);
- EXPECT_GE(results[32].count_estimate, 50);
- EXPECT_LE(results[32].count_estimate, 150);
-}
-
-// Runs a small instance of RAPPOR with large noise.
-// The standard errors of identified heavy hitters should be nonzero.
-// Note(bazyli): this test is heuristic test.
-TEST_F(RapporAnalyzerTest, StandardErrorTest) {
- static const uint32_t kNumCandidates = 50;
- static const uint32_t kNumCohorts = 2;
- static const uint32_t kNumHashes = 2;
- static const uint32_t kNumBloomBits = 32;
-
- // Large noise introduced.
- prob_0_becomes_1_ = 0.4; // NOLINT
- prob_1_stays_1_ = 0.6; // NOLINT
-
- SetAnalyzer(kNumCandidates, kNumBloomBits, kNumCohorts, kNumHashes);
-
- std::vector<int> candidate_indices(1000, 0); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 1000, 10); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 1000, 20); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 1000, 30); // NOLINT
- candidate_indices.insert(candidate_indices.end(), 1000, 40); // NOLINT
- AddObservationsForCandidates(candidate_indices);
-
- std::vector<CandidateResult> results;
- auto status = analyzer_->Analyze(&results);
-
- EXPECT_GT(results[0].std_error, 0);
- EXPECT_GT(results[10].std_error, 0);
- EXPECT_GT(results[20].std_error, 0);
- EXPECT_GT(results[30].std_error, 0);
- EXPECT_GT(results[40].std_error, 0);
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_analyzer_utils.cc b/src/algorithms/rappor/rappor_analyzer_utils.cc
deleted file mode 100644
index e25116d..0000000
--- a/src/algorithms/rappor/rappor_analyzer_utils.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/rappor/rappor_analyzer_utils.h"
-
-#include <vector>
-
-using cobalt_lossmin::InstanceSet;
-
-namespace cobalt::rappor {
-
-void PrepareSecondRapporStepMatrix(InstanceSet* second_step_matrix,
- const std::vector<int>& second_step_cols,
- const InstanceSet& full_matrix, int num_cohorts,
- int num_hashes) {
- // We will construct the matrix from triplets which is simple and efficient.
- const uint32_t second_step_num_candidates = second_step_cols.size();
- const int nonzero_matrix_entries =
- num_cohorts * num_hashes * static_cast<int>(second_step_num_candidates);
- std::vector<Eigen::Triplet<double>> second_step_matrix_triplets;
- second_step_matrix_triplets.reserve(nonzero_matrix_entries);
-
- // Construct a ColMajor copy of candidate_matrix_ format to iterate over
- // columns efficiently, and form the triplets.
- Eigen::SparseMatrix<double, Eigen::ColMajor> candidate_matrix_col_major = full_matrix;
- for (uint32_t col_i = 0; col_i < second_step_num_candidates; col_i++) {
- // Iterate over column corresponding to this candidate and update
- // triplets.
- for (Eigen::SparseMatrix<double, Eigen::ColMajor>::InnerIterator it(candidate_matrix_col_major,
- second_step_cols[col_i]);
- it; ++it) {
- second_step_matrix_triplets.emplace_back(Eigen::Triplet<double>(it.row(), col_i, it.value()));
- }
- }
-
- // Build the second step matrix.
- second_step_matrix->setFromTriplets(second_step_matrix_triplets.begin(),
- second_step_matrix_triplets.end());
-}
-
-} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_analyzer_utils.h b/src/algorithms/rappor/rappor_analyzer_utils.h
deleted file mode 100644
index 35d885b..0000000
--- a/src/algorithms/rappor/rappor_analyzer_utils.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2018 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_UTILS_H_
-#define COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_UTILS_H_
-
-#include <vector>
-
-#include "src/lib/lossmin/eigen-types.h"
-
-#include "third_party/eigen/Eigen/SparseCore"
-
-namespace cobalt::rappor {
-
-// Constructs a submatrix of |full_matrix| composed of columns corresponding to
-// indices in |second_step_cols|. |num_cohorts| and |num_hashes| are needed to
-// determine the number of nonzero entries of the matrix.
-void PrepareSecondRapporStepMatrix(cobalt_lossmin::InstanceSet* second_step_matrix,
- const std::vector<int>& second_step_cols,
- const cobalt_lossmin::InstanceSet& full_matrix, int num_cohorts,
- int num_hashes);
-
-} // namespace cobalt::rappor
-
-#endif // COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_ANALYZER_UTILS_H_
diff --git a/src/algorithms/rappor/rappor_config.h b/src/algorithms/rappor/rappor_config.h
index d592c89..e24f3da 100644
--- a/src/algorithms/rappor/rappor_config.h
+++ b/src/algorithms/rappor/rappor_config.h
@@ -14,36 +14,6 @@
namespace cobalt::rappor {
-struct RapporConfig {
- // k = the number of Bloom filter bits. This must be a power of 2 and
- // it must satisfy:
- // 1 <= h < k <= 1024, where h = num_hashes.
- uint32_t num_bloom_bits = 0;
-
- // h = the number of hashes. This must satisfy
- // 1 <= h <= 8
- uint32_t num_hashes = 0;
-
- // m = the number of cohorts. This must satisfy
- // 1 <= m <= 1024
- uint32_t num_cohorts = 0;
-
- // All probabilities below must be in the range [0.0, 1.0]
-
- // prob_0_becomes_1 MAY NOT BE EQUAL to prob_1_stays_1.
-
- // p = p(a zero bit is changed to a one bit in the IRR)
- float prob_0_becomes_1 = 0.0;
-
- // q = p(a one bit remains a one bit in the IRR)
- float prob_1_stays_1 = 0.0;
-
- // f = p(a bit is randomly assigned a value in the PRR)
- // Note: Not Implemented in version 0.1 of Cobalt.
- // This value must not be set to a non-zero value.
- float prob_rr = 0.0;
-};
-
struct BasicRapporConfig {
// All probabilities below must be in the range [0.0, 1.0].
diff --git a/src/algorithms/rappor/rappor_config_helper.cc b/src/algorithms/rappor/rappor_config_helper.cc
index 6aef8dc..483d73b 100644
--- a/src/algorithms/rappor/rappor_config_helper.cc
+++ b/src/algorithms/rappor/rappor_config_helper.cc
@@ -19,11 +19,9 @@
constexpr float RapporConfigHelper::kProbRR = 0.0;
//////////////////////////// p, q and f //////////////////////////////////
-
// This is relevant to Basic RAPPOR (via the SIMPLE_OCCURRENCE_COUNT
-// Report type) and String RAPPOR (via the HIGH_FREQUENCY_STRING_COUNTS Report
-// type.)
-
+// and UNIQUE_N_DAY_ACTIVES Report types).
+//
// The mapping from different values of LocalPrivacyNoiseLevel to
// RAPPOR parameters p and q. The user sets LocalPrivacyNoiseLevel on
// a per-report basis. We always use q = 1 - p and so we don't need to
@@ -42,77 +40,17 @@
// LARGE
static const float kLocalPrivacyLargeProbBitFlip = 0.25;
-//////////////////////// m = num_cohorts ///////////////////////////////
-
-// This is relevant to String RAPPOR (via the HIGH_FREQUENCY_STRING_COUNTS
-// Report type.)
-
-// |expected_population_size| is the user's estimate of the number of Fuchsia
-// devices from which data is collected.
-
-// The mapping from expected_population_size to num_cohorts
-
-// This is the number of cohorts to use if expected_population_size is not set.
-static const size_t kDefaultNumCohorts = 50;
-
-// For estimated_population_size < 100 we use 5 cohorts.
-static const size_t kTinyPopulationSize = 100;
-static const size_t kTinyNumCohorts = 5;
-
-// For 100 <= estimated_population_size < 1000 we use 10 cohorts.
-static const size_t kSmallPopulationSize = 1000;
-static const size_t kSmallNumCohorts = 10;
-
-// For 1,000 <= estimated_population_size < 10,000 we use 50 cohorts.
-static const size_t kMediumPopulationSize = 10000;
-static const size_t kMediumNumCohorts = 50;
-
-// For estimated_population_size >= 10,000 we use 100 cohorts
-static const size_t kLargeNumCohorts = 100;
-
-//////////////////////// k = num_bloom_bits //////////////////////////////
-
-// This is relevant to String RAPPOR (via the HIGH_FREQUENCY_STRING_COUNTS
-// Report type.)
-
-// |expected_string_set_size| is the user's estimate of the number of candidate
-// strings. This cannot be changed once data collection begins.
-
-// The mapping from expected_string_set_size to num_bloom_bits
-
-// This is the number of bits to use if expected_string_set_size is not set.
-static const size_t kDefaultNumBits = 32;
-
-// For estimated_string_set_size < 100 we use 8 bits.
-static const size_t kTinyNumCandidates = 100;
-static const size_t kTinyNumBits = 8;
-
-// For 100 <= estimated_string_set_size < 1000 we use 16 bits.
-static const size_t kSmallNumCandidates = 1000;
-static const size_t kSmallNumBits = 16;
-
-// For 1,000 <= estimated_string_set_size < 10,000 we use 64 bits.
-static const size_t kMediumNumCandidates = 10000;
-static const size_t kMediumNumBits = 64;
-
-// For estimated_string_set_size >= 10,000 we use 128 bits
-static const size_t kLargeNumBits = 128;
-
float RapporConfigHelper::ProbBitFlip(const ReportDefinition& report_definition,
const std::string& metric_debug_name) {
switch (report_definition.local_privacy_noise_level()) {
case ReportDefinition::NONE:
return kLocalPrivacyNoneProbBitFlip;
-
case ReportDefinition::SMALL:
return kLocalPrivacySmallProbBitFlip;
-
case ReportDefinition::MEDIUM:
return kLocalPrivacyMediumProbBitFlip;
-
case ReportDefinition::LARGE:
return kLocalPrivacyLargeProbBitFlip;
-
default:
LOG(ERROR) << "Invalid Cobalt config: Report " << report_definition.report_name()
<< " from metric " << metric_debug_name
@@ -139,44 +77,4 @@
return 0;
}
-size_t RapporConfigHelper::StringRapporNumCohorts(const ReportDefinition& report_definition) {
- if (report_definition.expected_population_size() == 0) {
- return kDefaultNumCohorts;
- }
-
- if (report_definition.expected_population_size() < kTinyPopulationSize) {
- return kTinyNumCohorts;
- }
-
- if (report_definition.expected_population_size() < kSmallPopulationSize) {
- return kSmallNumCohorts;
- }
-
- if (report_definition.expected_population_size() < kMediumPopulationSize) {
- return kMediumNumCohorts;
- }
-
- return kLargeNumCohorts;
-}
-
-size_t RapporConfigHelper::StringRapporNumBloomBits(const ReportDefinition& report_definition) {
- if (report_definition.expected_string_set_size() == 0) {
- return kDefaultNumBits;
- }
-
- if (report_definition.expected_string_set_size() < kTinyNumCandidates) {
- return kTinyNumBits;
- }
-
- if (report_definition.expected_string_set_size() < kSmallNumCandidates) {
- return kSmallNumBits;
- }
-
- if (report_definition.expected_string_set_size() < kMediumNumCandidates) {
- return kMediumNumBits;
- }
-
- return kLargeNumBits;
-}
-
} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_config_helper.h b/src/algorithms/rappor/rappor_config_helper.h
index b0e2925..d24ecc5 100644
--- a/src/algorithms/rappor/rappor_config_helper.h
+++ b/src/algorithms/rappor/rappor_config_helper.h
@@ -23,9 +23,6 @@
// We do not support RAPPOR's PRR in Cobalt.
static const float kProbRR; // = 0
- // For String RAPPOR's number of hashes, always use h = 2.
- static const size_t kNumHashes = 2;
-
// Returns the probability of flipping a bit in the RAPPOR encoding,
// or kInvalidProbability.
//
@@ -38,12 +35,6 @@
// Returns the number of categories to use for the Basic RAPPOR encoding.
// This is the same as the number of bits.
static size_t BasicRapporNumCategories(const MetricDefinition& metric_definition);
-
- // Returns the number of cohorts to use for the String RAPPOR encoding.
- static size_t StringRapporNumCohorts(const ReportDefinition& report_definition);
-
- // Returns the number of bits to use for the String RAPPOR encoding.
- static size_t StringRapporNumBloomBits(const ReportDefinition& report_definition);
};
} // namespace rappor
diff --git a/src/algorithms/rappor/rappor_config_helper_test.cc b/src/algorithms/rappor/rappor_config_helper_test.cc
index 63a3062..ecd9081 100644
--- a/src/algorithms/rappor/rappor_config_helper_test.cc
+++ b/src/algorithms/rappor/rappor_config_helper_test.cc
@@ -45,78 +45,4 @@
EXPECT_EQ(0u, RapporConfigHelper::BasicRapporNumCategories(metric_definition));
}
-TEST(RapporConfigHelperTest, StringRapporNumCohorts) {
- ReportDefinition report_definition;
- EXPECT_EQ(50u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(99);
- EXPECT_EQ(5u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(100);
- EXPECT_EQ(10u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(999);
- EXPECT_EQ(10u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(1000);
- EXPECT_EQ(50u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(9999);
- EXPECT_EQ(50u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(10000);
- EXPECT_EQ(100u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(10001);
- EXPECT_EQ(100u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_population_size(100000);
- EXPECT_EQ(100u, RapporConfigHelper::StringRapporNumCohorts(report_definition));
-}
-
-TEST(RapporConfigHelperTest, StringRapporNumBloomBits) {
- ReportDefinition report_definition;
- EXPECT_EQ(32u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(99);
- EXPECT_EQ(8u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(100);
- EXPECT_EQ(16u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(999);
- EXPECT_EQ(16u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(1000);
- EXPECT_EQ(64u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(9999);
- EXPECT_EQ(64u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(10000);
- EXPECT_EQ(128u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(10001);
- EXPECT_EQ(128u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-
- // NOLINTNEXTLINE readability-magic-numbers
- report_definition.set_expected_string_set_size(100000);
- EXPECT_EQ(128u, RapporConfigHelper::StringRapporNumBloomBits(report_definition));
-}
-
} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_config_validator.cc b/src/algorithms/rappor/rappor_config_validator.cc
index a5c8afd..fbb0879 100644
--- a/src/algorithms/rappor/rappor_config_validator.cc
+++ b/src/algorithms/rappor/rappor_config_validator.cc
@@ -17,14 +17,11 @@
#include <string>
#include <vector>
-#include "src/lib/crypto_util/hash.h"
#include "src/logging.h"
#include "src/pb/observation.pb.h"
namespace cobalt::rappor {
-using crypto::hash::DIGEST_SIZE;
-
namespace {
// Factors out some common validation logic.
bool CommonValidate(float prob_0_becomes_1, float prob_1_stays_1, float prob_rr) {
@@ -113,57 +110,8 @@
return v + 1;
}
-constexpr uint32_t max_num_hashes = 8;
-// Constructor for String RAPPOR
-RapporConfigValidator::RapporConfigValidator(const RapporConfig& config)
- : prob_0_becomes_1_(config.prob_0_becomes_1),
- prob_1_stays_1_(config.prob_1_stays_1),
- num_bits_(config.num_bloom_bits),
- num_hashes_(config.num_hashes),
- num_cohorts_(config.num_cohorts),
- // num_cohorts_2_power_ is computed below.
- num_cohorts_2_power_(0) {
- valid_ = false;
- if (!CommonValidate(prob_0_becomes_1_, prob_1_stays_1_, config.prob_rr)) {
- return;
- }
- if (num_bits_ <= 1 || num_bits_ > max_num_categories) {
- LOG(ERROR) << "For k = num_bits we require 1 < k <= 1024.";
- return;
- }
- if ((num_bits_ & (num_bits_ - 1)) != 0) {
- LOG(ERROR) << "k = num_bits must be a power of 2.";
- return;
- }
- if (num_hashes_ < 1 || num_hashes_ > max_num_hashes || num_hashes_ >= num_bits_) {
- LOG(ERROR) << "For k = num_bits and h = num_hashes we require 1 <= h <= 8 "
- "and h < k.";
- return;
- }
- // We consume 2 bytes of the digest per hash.
- if (num_hashes_ * 2 > DIGEST_SIZE) {
- // This should not happen unless DIGEST_SIZE is changed to a value that is
- // too small.
- LOG(ERROR) << "DIGEST_SIZE too small for number of hashes: " << DIGEST_SIZE;
- return;
- }
- if (num_cohorts_ < 1 || num_cohorts_ > max_num_categories) {
- LOG(ERROR) << "For m = num_cohorts we require 1 <= m <= 1024.";
- return;
- }
- num_cohorts_2_power_ = MinPower2Above((uint16_t(num_cohorts_)));
- CHECK_GT(num_cohorts_2_power_, 0u);
- CHECK_LE(num_cohorts_2_power_, 1024u);
- valid_ = true;
-}
-
-// Constructor for Basic RAPPOR
RapporConfigValidator::RapporConfigValidator(const BasicRapporConfig& config)
- : prob_0_becomes_1_(config.prob_0_becomes_1),
- prob_1_stays_1_(config.prob_1_stays_1),
- num_bits_(0),
- num_hashes_(0),
- num_cohorts_(1) {
+ : prob_0_becomes_1_(config.prob_0_becomes_1), prob_1_stays_1_(config.prob_1_stays_1) {
valid_ = false;
if (!CommonValidate(prob_0_becomes_1_, prob_1_stays_1_, config.prob_rr)) {
return;
@@ -188,8 +136,7 @@
}
// Returns the bit-index of |category| or -1 if |category| is not one of the
-// basic RAPPOR categories (or if this object was not initialized with a
-// BasicRapporConfig.)
+// basic RAPPOR categories.
int RapporConfigValidator::bit_index(const ValuePart& category) {
std::string serialized_value;
category.SerializeToString(&serialized_value);
diff --git a/src/algorithms/rappor/rappor_config_validator.h b/src/algorithms/rappor/rappor_config_validator.h
index bb2bfae..904fbaf 100644
--- a/src/algorithms/rappor/rappor_config_validator.h
+++ b/src/algorithms/rappor/rappor_config_validator.h
@@ -28,9 +28,6 @@
class RapporConfigValidator {
public:
- // Constructor for String RAPPOR
- explicit RapporConfigValidator(const RapporConfig& config);
-
// Constructor for Basic RAPPOR
explicit RapporConfigValidator(const BasicRapporConfig& config);
@@ -43,13 +40,9 @@
bool valid() { return valid_; }
uint32_t num_bits() { return num_bits_; }
- uint32_t num_hashes() { return num_hashes_; }
- uint32_t num_cohorts() { return num_cohorts_; }
- uint32_t num_cohorts_2_power() { return num_cohorts_2_power_; }
// Returns the bit-index of |category| or -1 if |category| is not one of the
- // basic RAPPOR categories (or if this object was not initialized with a
- // BasicRapporConfig.)
+ // basic RAPPOR categories.
int bit_index(const ValuePart& category);
// Gives access to the vector of Categories if this object was initialized
@@ -67,12 +60,6 @@
float prob_1_stays_1_;
uint32_t num_bits_;
- // Used only in string RAPPOR
- uint32_t num_hashes_;
- uint32_t num_cohorts_;
- // This is the least power of 2 greater than or equal to num_cohorts_.
- uint32_t num_cohorts_2_power_;
-
// Used only in Basic RAPPOR. |categories_| is the list of all
// categories. The keys to |category_to_bit_index_| are serialized
// ValueParts.
diff --git a/src/algorithms/rappor/rappor_encoder.cc b/src/algorithms/rappor/rappor_encoder.cc
index dc3e59d..efd5665 100644
--- a/src/algorithms/rappor/rappor_encoder.cc
+++ b/src/algorithms/rappor/rappor_encoder.cc
@@ -19,8 +19,6 @@
#include <memory>
#include <vector>
-#include "src/lib/crypto_util/hash.h"
-#include "src/lib/crypto_util/mac.h"
#include "src/lib/crypto_util/random.h"
#include "src/logging.h"
#include "src/tracing.h"
@@ -28,7 +26,6 @@
namespace cobalt::rappor {
using crypto::byte;
-using crypto::hmac::HMAC;
using system_data::ClientSecret;
namespace {
@@ -83,143 +80,6 @@
} // namespace
-RapporEncoder::RapporEncoder(const RapporConfig& config, ClientSecret client_secret)
- : config_(new RapporConfigValidator(config)),
- random_(new crypto::Random()),
- client_secret_(std::move(client_secret)),
- cohort_num_(DeriveCohortFromSecret()) {}
-
-bool RapporEncoder::HashValueAndCohort(const std::string& serialized_value, uint32_t cohort_num,
- uint32_t num_hashes,
- byte hashed_value[crypto::hash::DIGEST_SIZE]) {
- // We append the cohort to the value before hashing.
- std::vector<byte> hash_input(serialized_value.size() + sizeof(cohort_num_));
- std::memcpy(hash_input.data(), &serialized_value[0], serialized_value.size());
- std::memcpy(hash_input.data() + serialized_value.size(), &cohort_num, sizeof(cohort_num_));
-
- // Now we hash |hash_input| into |hashed_value|.
- // We are going to use two bytes of |hashed_value| for each hash in the Bloom
- // filter so we need DIGEST_SIZE to be at least num_hashes*2. This should have
- // already been checked at config validation time.
- CHECK(crypto::hash::DIGEST_SIZE >= num_hashes * 2);
- return crypto::hash::Hash(hash_input.data(), hash_input.size(), hashed_value);
-}
-
-uint32_t RapporEncoder::ExtractBitIndex(byte const hashed_value[crypto::hash::DIGEST_SIZE],
- size_t hash_index, uint32_t num_bits) {
- // Each bloom filter consumes two bytes of |hashed_value|. Note that
- // num_bits is required to be a power of 2 (this is checked in the
- // constructor of RapporConfigValidator) so that the mod operation below
- // preserves the uniform distribution of |hashed_value|.
- return (*reinterpret_cast<const uint16_t*>(&hashed_value[hash_index * 2])) % num_bits;
-}
-
-std::string RapporEncoder::MakeBloomBits(const ValuePart& value) {
- uint32_t num_bits = config_->num_bits();
- uint32_t num_bytes = (num_bits + kBitsPerByte - 1) / kBitsPerByte;
- uint32_t num_hashes = config_->num_hashes();
-
- std::string serialized_value;
- value.SerializeToString(&serialized_value);
-
- byte hashed_value[crypto::hash::DIGEST_SIZE];
- if (!HashValueAndCohort(serialized_value, cohort_num_, num_hashes, hashed_value)) {
- VLOG(1) << "Hash() failed";
- return "";
- }
-
- // Initialize data to a string of all zero bytes.
- // (The C++ Protocol Buffer API uses string to represent an array of bytes.)
- std::string data(num_bytes, static_cast<char>(0));
- for (size_t hash_index = 0; hash_index < num_hashes; hash_index++) {
- uint32_t bit_index = ExtractBitIndex(hashed_value, hash_index, num_bits);
-
- // Indexed from the right, i.e. the least-significant bit.
- uint32_t byte_index = bit_index / kBitsPerByte;
- uint32_t bit_in_byte_index = bit_index % kBitsPerByte;
- // Set the appropriate bit.
- data[num_bytes - (byte_index + 1)] |= 1 << bit_in_byte_index;
- }
-
- return data;
-}
-
-// We use HMAC as a PRF and compute
-// HMAC_{client_secret}(attempt_number) % num_cohorts_2_power
-uint32_t RapporEncoder::AttemptDeriveCohortFromSecret(size_t attempt_number) {
- if (!config_->valid()) {
- VLOG(1) << "config is not valid";
- return UINT32_MAX;
- }
- if (!client_secret_.valid()) {
- VLOG(1) << "client_secret is not valid";
- return UINT32_MAX;
- }
-
- // Invoke HMAC.
- byte hashed_value[crypto::hmac::TAG_SIZE];
- if (!HMAC(client_secret_.data(), ClientSecret::kNumSecretBytes,
- reinterpret_cast<byte*>(&attempt_number), sizeof(attempt_number), hashed_value)) {
- VLOG(1) << "HMAC() failed!";
- return UINT32_MAX;
- }
-
- // Interpret the first two bytes of hashed_value as an unsigned integer
- // and mod by num_cohorts_2_power.
- CHECK_GT(config_->num_cohorts_2_power(), 0u);
- return *(reinterpret_cast<uint16_t*>(hashed_value)) % config_->num_cohorts_2_power();
-}
-
-uint32_t RapporEncoder::DeriveCohortFromSecret() {
- size_t attempt_number = 0;
- // Each invocation of AttemptDeriveCohortFromSecret() has probability > 1/2
- // of returning a value < num_cohorts so the probability that this loop
- // will execute more than n times is less than 1/(2^n).
- while (true) {
- uint32_t cohort = AttemptDeriveCohortFromSecret(attempt_number++);
- if (cohort == UINT32_MAX) {
- // Derivation failed.
- return UINT32_MAX;
- }
- if (cohort < config_->num_cohorts()) {
- return cohort;
- }
- }
-}
-
-Status RapporEncoder::Encode(const ValuePart& value, RapporObservation* observation_out) {
- if (!config_->valid()) {
- return kInvalidConfig;
- }
- if (!client_secret_.valid()) {
- LOG(ERROR) << "client_secret is not valid";
- return kInvalidConfig;
- }
- if (cohort_num_ == UINT32_MAX) {
- LOG(ERROR) << "Unable to derive cohort from client_secret.";
- return kInvalidConfig;
- }
-
- std::string data = MakeBloomBits(value);
- if (data.empty()) {
- LOG(ERROR) << "MakeBloomBits failed on input: " << DebugString(value);
- return kInvalidInput;
- }
-
- // TODO(rudominer) Consider supporting prr in future versions of Cobalt.
-
- // Randomly flip some of the bits based on the probabilities p and q.
- auto status =
- FlipBits(config_->prob_0_becomes_1(), config_->prob_1_stays_1(), random_.get(), &data);
- if (status != kOK) {
- return status;
- }
-
- observation_out->set_cohort(cohort_num_);
- observation_out->set_data(data);
- return kOK;
-}
-
BasicRapporEncoder::BasicRapporEncoder(const BasicRapporConfig& config, ClientSecret client_secret)
: config_(new RapporConfigValidator(config)),
random_(new crypto::Random()),
diff --git a/src/algorithms/rappor/rappor_encoder.h b/src/algorithms/rappor/rappor_encoder.h
index 9b62c88..4ce4fc3 100644
--- a/src/algorithms/rappor/rappor_encoder.h
+++ b/src/algorithms/rappor/rappor_encoder.h
@@ -20,7 +20,6 @@
#include <utility>
#include "src/algorithms/rappor/rappor_config_validator.h"
-#include "src/lib/crypto_util/hash.h"
#include "src/lib/crypto_util/random.h"
#include "src/pb/observation.pb.h"
#include "src/system_data/client_secret.h"
@@ -33,80 +32,6 @@
kInvalidInput,
};
-// Performs String RAPPOR encoding.
-class RapporEncoder {
- public:
- // Constructor.
- // The |client_secret| is used to determine the cohort and the PRR.
- RapporEncoder(const RapporConfig& config, system_data::ClientSecret client_secret);
- virtual ~RapporEncoder() = default;
-
- // Encodes |value| using RAPPOR encoding. Returns kOK on success, or
- // kInvalidConfig if the |config| passed to the constructor is not valid.
- Status Encode(const ValuePart& value, RapporObservation* observation_out);
-
- [[nodiscard]] uint32_t cohort() const { return cohort_num_; }
-
- private:
- friend class StringRapporEncoderTest;
- friend class RapporAnalyzer;
-
- // Allows Friend classess to set a special RNG for use in tests.
- void SetRandomForTesting(std::unique_ptr<crypto::Random> random) { random_ = std::move(random); }
-
- // Computes a hash of the given |serialized value| and |cohort_num| and writes
- // the result to |hashed_value|. This plus ExtractBitIndex() are used by
- // MakeBloomBits() to form the Bloom filter. These two functions have been
- // extracted from MakeBloomBits() so that they can be shared by RaporAnalyzer.
- //
- // |num_hashes| indicates the the upper bound for the values of |hash_index|
- // that will be passed to ExtractBitIndex() after this method returns.
- //
- // Returns true for success or false if the hash operation fails for any
- // reason.
- static bool HashValueAndCohort(const std::string& serialized_value, uint32_t cohort_num,
- uint32_t num_hashes,
- crypto::byte hashed_value[crypto::hash::DIGEST_SIZE]);
-
- // Extracts a bit index from the given |hashed_value| for the given
- // |hash_index|. This plus HashValueAndCohort are used by MakeBloomBits()
- // to form the Bloom filter. These two functions have been extracted from
- // MakeBloomBits() so that they can be shared by RaporAnalyzer.
- //
- // IMPORTANT: We index bits "from the right." This means that bit number zero
- // is the least significant bit of the last byte of the Bloom filter.
- static uint32_t ExtractBitIndex(crypto::byte const hashed_value[crypto::hash::DIGEST_SIZE],
- size_t hash_index, uint32_t num_bits);
-
- // Generates the array of bloom bits derived from |value|. Returns the
- // empty string on error.
- std::string MakeBloomBits(const ValuePart& value);
-
- // Derives an integer in the range [0, config_.num_cohorts_2_power_) from
- // |client_secret_| and |attempt_number|. The distribution of values in this
- // range will be (approximately) uniform as the Client Secret and
- // |attempt_number| vary uniformly.
- //
- // This method is invoked iteratively from DeriveCohortFromSecret() with
- // increasing attempt_numbers until the returned value is less than
- // config_.num_cohorts_.
- //
- // Returns UINT32_MAX to indicate failure.
- uint32_t AttemptDeriveCohortFromSecret(size_t attempt_number);
-
- // Derives an integer in the range [0, config_.num_cohorts_) from
- // |client_secret_|. The distribution of values in this range will be
- // (approximately) uniform as the Client Secret varies uniformly.
- //
- // Returns UINT32_MAX to indicate failure.
- uint32_t DeriveCohortFromSecret();
-
- std::unique_ptr<RapporConfigValidator> config_;
- std::unique_ptr<crypto::Random> random_;
- system_data::ClientSecret client_secret_;
- uint32_t cohort_num_;
-};
-
// Performs encoding for Basic RAPPOR, a.k.a Categorical RAPPOR. No cohorts
// are used and the list of all candidates must be pre-specified as part
// of the BasicRapporConfig.
diff --git a/src/algorithms/rappor/rappor_encoder_test.cc b/src/algorithms/rappor/rappor_encoder_test.cc
index 749d3f8..782e733 100644
--- a/src/algorithms/rappor/rappor_encoder_test.cc
+++ b/src/algorithms/rappor/rappor_encoder_test.cc
@@ -25,213 +25,6 @@
using system_data::ClientSecret;
-TEST(RapporConfigValidatorTest, TestMinPower2Above) {
- EXPECT_EQ(1u, RapporConfigValidator::MinPower2Above(0));
- EXPECT_EQ(1u, RapporConfigValidator::MinPower2Above(1));
- EXPECT_EQ(2u, RapporConfigValidator::MinPower2Above(2));
- EXPECT_EQ(4u, RapporConfigValidator::MinPower2Above(3));
- EXPECT_EQ(4u, RapporConfigValidator::MinPower2Above(4));
- EXPECT_EQ(8u, RapporConfigValidator::MinPower2Above(5));
- EXPECT_EQ(8u, RapporConfigValidator::MinPower2Above(6));
- EXPECT_EQ(8u, RapporConfigValidator::MinPower2Above(7));
- EXPECT_EQ(8u, RapporConfigValidator::MinPower2Above(8));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(9));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(10));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(11));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(12));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(13));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(14));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(15));
- EXPECT_EQ(16u, RapporConfigValidator::MinPower2Above(16));
- EXPECT_EQ(32u, RapporConfigValidator::MinPower2Above(17));
-}
-
-TEST(RapporConfigValidatorTest, TestConstructor) {
- RapporConfig config;
- config.prob_0_becomes_1 = 0.3;
- config.prob_1_stays_1 = 0.7;
- config.num_bloom_bits = 64;
- config.num_hashes = 5;
-
- config.num_cohorts = 100;
- auto validator = RapporConfigValidator(config);
- EXPECT_EQ(128u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 200;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(256u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 300;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(512u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 400;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(512u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 500;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(512u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 600;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(1024u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 1024 - 1;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(1024u, validator.num_cohorts_2_power());
-
- config.num_cohorts = 1024;
- validator = RapporConfigValidator(config);
- EXPECT_EQ(1024u, validator.num_cohorts_2_power());
-}
-
-// Constructs a RapporEncoder with the given |config|, invokes
-// Encode() with a dummy string and EncodeNullObservation(), and checks that the
-// returned statuses are either kOK or kInvalidConfig, whichever is expected.
-void TestRapporConfig(const RapporConfig& config, Status expected_status, int caller_line_number) {
- // Make a ClientSecret once and statically store the token.
- static const std::string kClientSecretToken = ClientSecret::GenerateNewSecret().GetToken();
- // Each time this function is invoked reconstitute the secret from the token.
- RapporEncoder encoder(config, ClientSecret::FromToken(kClientSecretToken));
- RapporObservation obs;
- ValuePart value;
- value.set_string_value("dummy");
- EXPECT_EQ(expected_status, encoder.Encode(value, &obs))
- << "Invoked from line number: " << caller_line_number;
-}
-
-// A macro to invoke testRapporConfig and pass it the current line number.
-#define TEST_RAPPOR_CONFIG(config, expected_status) \
- (TestRapporConfig(config, expected_status, __LINE__))
-
-// Tests the validation of config for String RAPPOR.
-TEST(RapporEncoderTest, StringRapporConfigValidation) {
- // Empty config: Invalid
- RapporConfig config;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Add two probabilities, still Invalid
- config.prob_0_becomes_1 = 0.3;
- config.prob_1_stays_1 = 0.7;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_bloom_bits, still Invalid
- config.num_bloom_bits = 8;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_hashes, still Invalid
- config.num_hashes = 2;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // set num_cohorts: Valid
- config.num_cohorts = 20;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Explicitly set PRR to 0: Valid.
- config.prob_rr = 0.0;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Explicitly set PRR to non-zero: Invalid.
- config.prob_rr = 0.1;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Explicitly set PRR back to zero: Valid.
- config.prob_rr = 0.0;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set one of the probabilities to negative: Invalid
- config.prob_0_becomes_1 = -0.3;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set one of the probabilities to greater than 1: Invalid
- config.prob_0_becomes_1 = 1.3;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Fix the probability: Valid
- config.prob_0_becomes_1 = 0.3;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set the other probability to negative: Invalid
- config.prob_1_stays_1 = -0.7;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set the other probability to greater than 1: Invalid
- config.prob_1_stays_1 = 1.7;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Fix the probability: Valid
- config.prob_1_stays_1 = 0.7;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set num_bloom_bits to negative: Invalid
- config.num_bloom_bits = -8;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_bloom_bits to 0: Invalid
- config.num_bloom_bits = 0;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_bloom_bits back to positive: Valid
- config.num_bloom_bits = 8;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set num_hashes to negative: Invalid
- config.num_hashes = -2;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_hashes to 0: Invalid
- config.num_hashes = 0;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_hashes to 8: Invalid
- config.num_hashes = 8;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_hashes back to positive: Valid
- config.num_hashes = 2;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set num_cohorts to negative: Invalid
- config.num_cohorts = -20;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_cohorts to 0: Invalid
- config.num_cohorts = 0;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_cohorts to 1025: Invalid
- config.num_cohorts = 1025;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_cohorts to 1024: Valid
- config.num_cohorts = 1024;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set num_cohorts back to positive: Valid
- config.num_cohorts = 20;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set num_bloom_bits to equal num_hashes: Invalid
- config.num_bloom_bits = 2;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Set num_bloom_bits to greater than num_hashes and a power of 2: Valid
- config.num_bloom_bits = 4;
- TEST_RAPPOR_CONFIG(config, kOK);
-
- // Set num_bloom_bits to greater than num_hashes but not a power of 2: Invalid
- config.num_bloom_bits = 3;
- TEST_RAPPOR_CONFIG(config, kInvalidConfig);
-
- // Test with an invalid ClientSecret
- RapporEncoder encoder(config, ClientSecret::FromToken("Invalid Token"));
- RapporObservation obs;
- ValuePart value;
- value.set_string_value("dummy");
- EXPECT_EQ(kInvalidConfig, encoder.Encode(value, &obs));
-}
-
// Constructs a BasicRapporEncoder with the given |config|, invokes
// Encode() with a dummy string and EncodeNullObservation(), and checks that the
// returned statuses are either kOK or kInvalidConfig, whichever is expected.
@@ -690,304 +483,4 @@
EXPECT_EQ(kInvalidInput, encoder.Encode(value, &obs));
}
-class StringRapporEncoderTest : public ::testing::Test {
- protected:
- uint32_t AttemptDeriveCohortFromSecret(size_t attempt_number) {
- return encoder_->AttemptDeriveCohortFromSecret(attempt_number);
- }
-
- uint32_t DeriveCohortFromSecret() { return encoder_->DeriveCohortFromSecret(); }
-
- void SetNewEncoder(const RapporConfig& config, const system_data::ClientSecret& secret) {
- encoder_ = std::make_unique<RapporEncoder>(config, secret);
- }
-
- std::string MakeBloomBits(const ValuePart& value) { return encoder_->MakeBloomBits(value); }
-
- // Using the given parameters, and using the fixed input string
- // "www.google.com" and a fixed cohort (i.e. a fixed client secret),
- // this test generates a String RAPPOR observation 1000 times, counts
- // the number of resulting 1's and 0's in two bit positions, and performs
- // Pearson's Chi-squared test to check for goodness of fit to a binomial
- // distribution with the appropriate parameter. Fails if
- // chi-squared >= |chi_squared_threshold|.
- //
- // First we examine the Bloom filter with no bits flipped and we find one
- // index of a set bit and one index of an unset bit. We perform the
- // Chi-squared test twice: once for each of these two indices.
- //
- // Uses DeterministicRandom in order to ensure reproducibility.
- void DoChiSquaredTest(float prob_0_becomes_1, float prob_1_stays_1, int num_bits, int num_hashes,
- double chi_squared_threshold) {
- // Build the encoder.
- RapporConfig config;
- config.prob_0_becomes_1 = prob_0_becomes_1;
- config.prob_1_stays_1 = prob_1_stays_1;
- config.num_bloom_bits = num_bits;
- config.num_hashes = num_hashes;
- // This value will not be used but it needs to be something valid.
- config.num_cohorts = 100;
- // We use a fixed client secret so this test is deterministic.
- static const char kClientSecret[] = "4b4BxKq253TTCWIXFhLDTg==";
- SetNewEncoder(config, ClientSecret::FromToken(kClientSecret));
- // Give the encoder a deterministic RNG.
- encoder_->SetRandomForTesting(
- std::unique_ptr<crypto::Random>(new crypto::DeterministicRandom()));
-
- // Build the input value. We use a fixed input string so this test is
- // deterministic.
- ValuePart value;
- value.set_string_value("www.google.com");
-
- // Capture the indices of one bit that is set and one bit that is unset in
- // the bloom filter for the input value. It doesn't matter which two bits
- // we capture. We will do two chi-squared tests, one on each of the two
- // bits.
- int index_of_set_bit = -1;
- int index_of_unset_bit = -1;
- auto bloom_bits = MakeBloomBits(value);
- int num_bits_set = 0;
- for (int bit_index = 0; bit_index < num_bits; bit_index++) {
- if (IsSet(bloom_bits, bit_index)) {
- num_bits_set++;
- index_of_set_bit = bit_index;
- } else {
- index_of_unset_bit = bit_index;
- }
- }
- ASSERT_GT(num_bits_set, 0);
- ASSERT_LE(num_bits_set, num_hashes);
- // This heuristic for a minimum reasonable number of bits that should be
- // set was found by experimentation to allow the test to pass.
- int expected_min_num_bits_set = std::min(num_hashes - 2, num_bits / 4);
- ASSERT_GE(num_bits_set, expected_min_num_bits_set)
- << " num_bits=" << num_bits << " num_hashes=" << num_hashes;
- ASSERT_GE(index_of_set_bit, 0);
- ASSERT_LT(index_of_set_bit, num_bits);
- ASSERT_GE(index_of_unset_bit, 0);
- ASSERT_LT(index_of_unset_bit, num_bits);
- ASSERT_NE(index_of_set_bit, index_of_unset_bit);
- ASSERT_TRUE(IsSet(bloom_bits, index_of_set_bit));
- ASSERT_FALSE(IsSet(bloom_bits, index_of_unset_bit));
-
- // Encode the input value 1000 times, tallying the counts for the two bits.
- static const int kNumTrials = 1000;
- int set_bit_count = 0;
- int unset_bit_count = 0;
- for (size_t i = 0; i < kNumTrials; i++) {
- RapporObservation obs;
- EXPECT_EQ(kOK, encoder_->Encode(value, &obs));
- if (IsSet(obs.data(), index_of_set_bit)) {
- set_bit_count++;
- }
- if (IsSet(obs.data(), index_of_unset_bit)) {
- unset_bit_count++;
- }
- }
-
- // This is the expected number of ones and zeroes for a bit that is set
- // in the Bloom filter.
- const double expected_1_set = static_cast<double>(kNumTrials) * prob_1_stays_1;
- const double expected_0_set = static_cast<double>(kNumTrials) - expected_1_set;
-
- // This is the expected number of ones and zeroes for a bit that is
- // unset in the Bloom filter.
- const double expected_1_unset = static_cast<double>(kNumTrials) * prob_0_becomes_1;
- const double expected_0_unset = static_cast<double>(kNumTrials) - expected_1_unset;
-
- // Perform Chi-squared test twice, once for the set bit, once for the
- // unset bit.
- for (int i = 0; i < 2; i++) {
- double exp_0 = (i == 0 ? expected_0_set : expected_0_unset);
- double exp_1 = (i == 0 ? expected_1_set : expected_1_unset);
- int count = (i == 0 ? set_bit_count : unset_bit_count);
-
- // Difference between actual 1 count and expected 1 count.
- double delta_1 = static_cast<double>(count) - exp_1;
-
- // Difference between actual 0 count and expected 0 count.
- double delta_0 = static_cast<double>(kNumTrials - count) - exp_0;
-
- // Compute and check the Chi-Squared value.
- double chi_squared = delta_1 * delta_1 / exp_1 + delta_0 * delta_0 / exp_0;
-
- EXPECT_LT(chi_squared, chi_squared_threshold)
- << " delta_0=" << delta_0 << " delta_1=" << delta_1 << " num_bits=" << num_bits
- << " num_hashes=" << num_hashes << " i=" << i << " prob_0_becomes_1=" << prob_0_becomes_1
- << " prob_1_stays_1=" << prob_1_stays_1;
- }
- }
-
- public:
- std::unique_ptr<RapporEncoder> encoder_;
-};
-
-// We invoke AttemptDeriveCohortFromSecret() 1000 times using a fixed
-// ClientSecret and increasing values for |attempt_number|. We use 16 buckets
-// (i.e. num_cohorts_2_power = 16). The outputs should be approximately
-// uniformly distributed over the integers in [0, 15].
-TEST_F(StringRapporEncoderTest, AttemptDeriveCohortFromSecret) {
- RapporConfig config;
- // These config values are not relevant but need to be something valid.
- config.prob_0_becomes_1 = 0.3;
- config.prob_1_stays_1 = 0.7;
- config.num_bloom_bits = 64;
- config.num_hashes = 5;
-
- // We set num_cohorts to 10 so num_cohorts_2_power will be 16.
- config.num_cohorts = 10;
-
- // We use a fixed client secret so this test is deterministic.
- static std::string kClientSecret = "4b4BxKq253TTCWIXFhLDTg==";
- SetNewEncoder(config, ClientSecret::FromToken(kClientSecret));
-
- // Initialize counts to all zeroes.
- int counts[16] = {0};
-
- // Invoke AttemptDeriveCohortFromSecret() 1000 times with successive
- // attempt indices. Accumulate the results.
- for (int i = 0; i < 1000; i++) {
- counts[AttemptDeriveCohortFromSecret(i)]++;
- }
-
- // These are the counts we found we get. 1000/16 = 62.5 is the expected value
- // for each count.
- //
- // Each of the results for buckets 10, 11 12, 13, 14 and 15 would have been
- // discarded and another attempt would have been made. That happened with
- // probability (75 + 71 + 56 + 58 + 44 + 53) / 1000 = 0.357.
- const int expected_counts[] = {58, 71, 69, 62, 70, 64, 76, 57, 48, 68, 75, 71, 56, 58, 44, 53};
-
- for (size_t i = 0; i < sizeof(expected_counts) / sizeof(expected_counts[0]); i++) {
- EXPECT_EQ(expected_counts[i], counts[i]) << "i=" << i;
- }
-}
-
-// We invoke DeriveCohortFromSecret() 1000 times using a varying ClientSecret.
-// (We use a deterministic PRNG so the test is deterministic.)
-// We use 10 buckets (i.e. num_cohorts = 10). The outputs should be
-// approximately uniformly distributed over the integers in [0, 9].
-TEST_F(StringRapporEncoderTest, DeriveCohortFromSecret) {
- RapporConfig config;
- // These config values are not relevant but need to be valid.
- config.prob_0_becomes_1 = 0.3;
- config.prob_1_stays_1 = 0.7;
- config.num_bloom_bits = 64;
- config.num_hashes = 5;
-
- // We set num_cohorts to 10.
- config.num_cohorts = 10;
-
- // Initialize counts to all zeroes.
- int counts[10] = {0};
-
- crypto::DeterministicRandom deterministic_random;
-
- // Invoke DeriveCohortFromSecret() 1000 times. Accumulate the results.
- for (int i = 0; i < 1000; i++) {
- SetNewEncoder(config, ClientSecret::GenerateNewSecret(&deterministic_random));
- // The constructor should have already invoked DeriveCohortFromSecret
- // and set cohort to that value.
- EXPECT_EQ(encoder_->cohort(), DeriveCohortFromSecret());
- counts[encoder_->cohort()]++;
- }
-
- // These are the counts we found we get. 1000/10 = 100 is the expected value
- // for each count.
- const int expected_counts[] = {85, 98, 104, 93, 113, 93, 89, 99, 103, 123};
-
- for (int i = 0; i < 10; i++) {
- EXPECT_EQ(expected_counts[i], counts[i]) << "i=" << i;
- }
-}
-
-// We invoke MakeBloomBits 1000 times with a fixed cohort
-// (i.e. a fixed ClientSecret) and varying input strings. We use 10 different
-// initial segments of 100 different randomly generated strings. (We use a
-// deterministic PRNG so this test is deterministic.) We use the values
-// num_hashes = 2, num_bloom_bits = 16. We accumulate the counts of the number
-// of times each bit is set. The counts should be approximately uniformly
-// distributed over the integers in [0, 15].
-TEST_F(StringRapporEncoderTest, MakeBloomBits) {
- RapporConfig config;
- // These config values are not relevant but need to be valid.
- config.prob_0_becomes_1 = 0.3;
- config.prob_1_stays_1 = 0.7;
- config.num_cohorts = 10;
-
- // Set the number of bloom bits to 16
- static const int kNumBloomBits = 16;
- config.num_bloom_bits = kNumBloomBits;
-
- // Set the number of hashes to 2.
- config.num_hashes = 2;
-
- // We use a fixed client secret so this test is deterministic.
- static const char kClientSecret[] = "4b4BxKq253TTCWIXFhLDTg==";
- SetNewEncoder(config, ClientSecret::FromToken(kClientSecret));
-
- // Initialize counts to all zeroes.
- int counts[kNumBloomBits] = {0};
-
- crypto::DeterministicRandom prng;
-
- // We invoke MakeBloomBits() 1000 times and accumulate the results.
-
- // Generate 100 random strings of length 100.
- for (int i = 0; i < 100; i++) {
- crypto::byte random_bits[100];
- prng.RandomBytes(random_bits, sizeof(random_bits));
- // Use 10 progressively longer initial segments of |random_bits|.
- for (int size = 10; size <= 100; size += 10) {
- ValuePart value;
- auto blob_value = std::string(reinterpret_cast<char*>(random_bits), size);
- value.set_blob_value(blob_value);
- auto bloom_bits = MakeBloomBits(value);
- // Capture which bits were set.
- int num_set = 0;
- for (int bit_index = 0; bit_index < kNumBloomBits; bit_index++) {
- if (IsSet(bloom_bits, bit_index)) {
- num_set++;
- counts[bit_index]++;
- }
- }
- // Since we are using 2 hashes the number of bits set should be 1 or 2.
- EXPECT_TRUE(num_set == 1 || num_set == 2);
- }
- }
-
- // These are the counts we found we get. 2000/16 = 125 is the
- // expected value for each count.
- const int expected_counts[] = {114, 139, 100, 113, 117, 118, 119, 122,
- 137, 129, 114, 134, 116, 109, 137, 123};
-
- for (int i = 0; i < 16; i++) {
- EXPECT_EQ(expected_counts[i], counts[i]) << "i=" << i;
- }
-}
-
-// For various numbers of bits, and hashes, and for various values of p and q
-// we invoke DoChiSquaredTest().
-TEST_F(StringRapporEncoderTest, ChiSquaredTest) {
- // Use num_bits = 4, 16, 64, 256, 1024
- for (int num_bits_exp = 2; num_bits_exp <= 10; num_bits_exp += 2) {
- int num_bits = 1 << num_bits_exp;
- // Use num_hashes = 2, 5 and 8.
- int max_num_hashes = std::min(8, num_bits - 1);
- for (int num_hashes = 2; num_hashes <= max_num_hashes; num_hashes += 3) {
- // The first two parameters are p and q.
- //
- // The last parameter is the chi-squared value to use. Notice that these
- // values were chosen by experimentation to be as small as possible so
- // that the test passes. They do not necessarily correspond to natural
- // confidence intervals for the chi-squared test.
- DoChiSquaredTest(0.01, 0.99, num_bits, num_hashes, 4.95);
- DoChiSquaredTest(0.1, 0.9, num_bits, num_hashes, 5.38);
- DoChiSquaredTest(0.2, 0.8, num_bits, num_hashes, 2.26);
- DoChiSquaredTest(0.25, 0.75, num_bits, num_hashes, 2.83);
- DoChiSquaredTest(0.3, 0.7, num_bits, num_hashes, 2.31);
- }
- }
-}
-
} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_test_utils.cc b/src/algorithms/rappor/rappor_test_utils.cc
index d6a87d7..950e495 100644
--- a/src/algorithms/rappor/rappor_test_utils.cc
+++ b/src/algorithms/rappor/rappor_test_utils.cc
@@ -81,36 +81,4 @@
std::string(index, other_char);
}
-std::string CandidateString(int i) { return std::string("candidate string") + std::to_string(i); }
-
-// Populates |candidate_list| with |num_candidates| candidates;
-void PopulateRapporCandidateList(uint32_t num_candidates, RapporCandidateList* candidate_list) {
- candidate_list->Clear();
- for (size_t i = 0; i < num_candidates; i++) {
- candidate_list->add_candidates(CandidateString(i));
- }
-}
-
-// Makes a RapporConfig with the given data.
-RapporConfig Config(uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes, float p,
- float q) {
- RapporConfig config;
- config.num_bloom_bits = num_bloom_bits;
- config.num_hashes = num_hashes;
- config.num_cohorts = num_cohorts;
- config.prob_0_becomes_1 = p;
- config.prob_1_stays_1 = q;
- return config;
-}
-
-// Given a string of "0"s and "1"s of length a multiple of 8, and a cohort,
-// returns a RapporObservation for the given cohort whose data is equal to the
-// bytes whose binary representation is given by the string.
-RapporObservation RapporObservationFromString(uint32_t cohort, const std::string& binary_string) {
- RapporObservation obs;
- obs.set_cohort(cohort);
- obs.set_data(BinaryStringToData(binary_string));
- return obs;
-}
-
} // namespace cobalt::rappor
diff --git a/src/algorithms/rappor/rappor_test_utils.h b/src/algorithms/rappor/rappor_test_utils.h
index 72ec32b..f00bafb 100644
--- a/src/algorithms/rappor/rappor_test_utils.h
+++ b/src/algorithms/rappor/rappor_test_utils.h
@@ -51,20 +51,6 @@
// REQUIRES: 0 <= index < num_bits.
std::string BuildBitPatternString(int num_bits, int index, char index_char, char other_char);
-std::string CandidateString(int i);
-
-// Populates |candidate_list| with |num_candidates| candidates;
-void PopulateRapporCandidateList(uint32_t num_candidates, RapporCandidateList* candidate_list);
-
-// Makes a RapporConfig with the given data.
-RapporConfig Config(uint32_t num_bloom_bits, uint32_t num_cohorts, uint32_t num_hashes, float p,
- float q);
-
-// Given a string of "0"s and "1"s of length a multiple of 8, and a cohort,
-// returns a RapporObservation for the given cohort whose data is equal to the
-// bytes whose binary representation is given by the string.
-RapporObservation RapporObservationFromString(uint32_t cohort, const std::string& binary_string);
-
} // namespace cobalt::rappor
#endif // COBALT_SRC_ALGORITHMS_RAPPOR_RAPPOR_TEST_UTILS_H_
diff --git a/src/registry/report_definition.proto b/src/registry/report_definition.proto
index 68aa074..f33665a 100644
--- a/src/registry/report_definition.proto
+++ b/src/registry/report_definition.proto
@@ -36,8 +36,8 @@
// An instance of ReportDefinition is always associated with an instance of
// MetricDefinition called the owning MetricDefinition.
message ReportDefinition {
- reserved 7;
- reserved "threshold";
+ reserved 5, 6, 7;
+ reserved "expected_population_size", "expected_string_set_size", "threshold";
// Next id: 17
// Unique name for this Report within its owning MetricDefinition.
@@ -284,7 +284,7 @@
// A level of random noise to use when encoding observations for RAPPOR.
enum LocalPrivacyNoiseLevel {
// local_privacy_noise_level must be explicitly set when using
- // Basic RAPPOR or String RAPPOR.
+ // Basic RAPPOR.
NOISE_LEVEL_UNSET = 0;
// p = 0.0, q = 1.0
@@ -301,16 +301,10 @@
}
// This field is used only with Report types SIMPLE_OCCURRENCE_COUNT and
- // HIGH_FREQUENCY_STRING_COUNTS. Its value must not be changed
- // after data collection has begun or the data will become corrupted.
+ // UniqueNDayActives. Its value must not be changed after data collection
+ // has begun or the data will become corrupted.
LocalPrivacyNoiseLevel local_privacy_noise_level = 4;
- // This field is not in use.
- uint32 expected_population_size = 5 [deprecated = true];
-
- // This field is not in use.
- uint32 expected_string_set_size = 6 [deprecated = true];
-
// Explicit list of known string values. Either this or |candidate_file|
// should be used, not both. Used for the hashed |component_name|
// field for several Report types.
@@ -508,8 +502,3 @@
LinearIntegerBuckets linear = 2;
}
}
-
-// A list of candidates to use during a string RAPPOR analysis.
-message RapporCandidateList {
- repeated string candidates = 1;
-}