[experimental] Implement single pass of RAPPOR over 2d representations of histograms.
Change-Id: I01a6a818f792eeb6dfa70298c013b87e0a52ed52
diff --git a/src/algorithms/experimental/histogram_encoder.cc b/src/algorithms/experimental/histogram_encoder.cc
index fc4e20c..421851a 100644
--- a/src/algorithms/experimental/histogram_encoder.cc
+++ b/src/algorithms/experimental/histogram_encoder.cc
@@ -142,4 +142,70 @@
return sum;
}
+SinglePassTwoDimRapporHistogramEncoder::SinglePassTwoDimRapporHistogramEncoder(
+ BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, uint64_t max_count, double p)
+ : gen_(gen), num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
+
+std::vector<std::pair<uint32_t, uint64_t>> SinglePassTwoDimRapporHistogramEncoder::Encode(
+ const std::vector<uint64_t>& histogram) {
+ std::vector<uint64_t> clipped_histogram(num_buckets_);
+ for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
+ clipped_histogram[bucket_index] = std::min(histogram[bucket_index], max_count_);
+ }
+ std::vector<std::pair<uint32_t, uint64_t>> encoded;
+ auto flip_bit = BernoulliDistribution(gen_, p_);
+ for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
+ for (uint64_t bucket_count = 1u; bucket_count <= max_count_; bucket_count++) {
+ if ((bucket_count == clipped_histogram[bucket_index]) != flip_bit.Sample()) {
+ encoded.emplace_back(std::pair<uint32_t, uint64_t>(bucket_index, bucket_count));
+ }
+ }
+ }
+
+ return encoded;
+}
+
+SinglePassTwoDimRapporHistogramSumEstimator::SinglePassTwoDimRapporHistogramSumEstimator(
+ uint32_t num_buckets, uint64_t max_count, double p)
+ : num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
+
+std::vector<double> SinglePassTwoDimRapporHistogramSumEstimator::ComputeSum(
+ const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
+ uint64_t num_participants) {
+ // Compute the number of occurrences of each bit ID in |encoded_histograms|.
+ std::map<std::pair<uint32_t, uint64_t>, uint64_t> raw_bit_counts;
+ for (const auto& hist : encoded_histograms) {
+ for (auto bit_id : hist) {
+ raw_bit_counts[bit_id]++;
+ }
+ }
+
+ std::vector<double> sum(num_buckets_);
+ for (uint32_t bucket_index = 0; bucket_index < num_buckets_; bucket_index++) {
+ double total_count_for_bucket = 0.0;
+ for (uint64_t bucket_count = 0; bucket_count <= max_count_; bucket_count++) {
+ // Estimate the true number of occurrences of the bit with ID (|bucket_count|, |bucket_index|)
+ // among the |num_participants| encoded bitvectors. (See Erlingsson, Pihur, Korolova, "RAPPOR:
+ // Randomized Aggregatable Privacy-Preserving Ordinal Response" section 4, taking f = 1.)
+ double estimated_bit_count =
+ (static_cast<double>(raw_bit_counts[{bucket_index, bucket_count}]) -
+ p_ * num_participants) /
+ (1.0 - 2 * p_);
+ // Clip the estimate into the range of possible true values.
+ if (estimated_bit_count < 0.0) {
+ estimated_bit_count = 0.0;
+ }
+ if (estimated_bit_count > num_participants) {
+ estimated_bit_count = num_participants;
+ }
+ // Compute the contribution of the (estimated number of) bits with true ID (|bucket_index|,
+ // |bucket_count|) to the total count for bucket |bucket_index|.
+ total_count_for_bucket += static_cast<double>(bucket_count) * estimated_bit_count;
+ }
+ sum[bucket_index] = total_count_for_bucket;
+ }
+
+ return sum;
+}
+
} // namespace cobalt
diff --git a/src/algorithms/experimental/histogram_encoder.h b/src/algorithms/experimental/histogram_encoder.h
index efbf623..16ed252 100644
--- a/src/algorithms/experimental/histogram_encoder.h
+++ b/src/algorithms/experimental/histogram_encoder.h
@@ -124,12 +124,65 @@
public:
// |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
// individual user to each bucket count, and |p| is a noise parameter. These should be equal to
- // the arguments that were used to construct the OccurrenceWiseHistogramEncoder which produced the
+ // the arguments that were used to construct the TwoDimRapporHistogramSumEstimator which produced the
// encoded histograms.
TwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
// |encoded_histograms| is a vector of encoded histograms produced by an
- // OccurrenceWiseHistogramEncoder, and |num_participants| is the number of users contributing to
+ // TwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users contributing to
+ // the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
+ // returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
+ // histograms.
+ std::vector<double> ComputeSum(
+ const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
+ uint64_t num_participants);
+
+ private:
+ uint32_t num_buckets_;
+ uint64_t max_count_;
+ double p_;
+};
+
+// An encoder that operates on histograms. The histogram is represented as a
+// bit vector of length L = |num_buckets| * (|max_count| + 1).
+// For each bucket, the kth bit is set to 1 where
+// k = |bucket_index| * |max_count| + |bucket_count|.
+// All other bits are set to 0. Then, the bit vector is encoded using Basic RAPPOR;
+// i.e., each bit is flipped with probability |p|. The encoder then returns
+// the set of pairs (bucket_index, bucket_count) corresponding to the "1" bits
+// in the encoded vector. Pairs of the form (bucket_index, 0) are not included in the result.
+class SinglePassTwoDimRapporHistogramEncoder {
+ public:
+ // |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum
+ // count that should be reported for any bucket, and |p| is the probability of a bit flip during
+ // Basic RAPPOR encoding.
+ SinglePassTwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets,
+ uint64_t max_count, double p);
+
+ // |histogram| is a vector of length |num_buckets|, where the n-th element is the count for that
+ // bucket. Counts should be in the range [0, |max_count|] inclusive; larger bucket counts are
+ // clipped to |max_count| before encoding.
+ std::vector<std::pair<uint32_t, uint64_t>> Encode(const std::vector<uint64_t>& histogram);
+
+ private:
+ BitGeneratorInterface<uint32_t>* gen_;
+ uint32_t num_buckets_;
+ uint64_t max_count_;
+ double p_;
+};
+
+// Estimates the true sum of a collection of histograms, each of which was encoded with a
+// SinglePassTwoDimRapporHistogramEncoder.
+class SinglePassTwoDimRapporHistogramSumEstimator {
+ public:
+ // |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
+ // individual user to each bucket count, and |p| is a noise parameter. These should be equal to
+ // the arguments that were used to construct SinglePassTwoDimRapporHistogramSumEstimator the which produced the
+ // encoded histograms.
+ SinglePassTwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
+
+ // |encoded_histograms| is a vector of encoded histograms produced by an
+ // SinglePassTwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users contributing to
// the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
// returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
// histograms.
diff --git a/src/algorithms/experimental/histogram_encoder_test.cc b/src/algorithms/experimental/histogram_encoder_test.cc
index 76fc015..6942d38 100644
--- a/src/algorithms/experimental/histogram_encoder_test.cc
+++ b/src/algorithms/experimental/histogram_encoder_test.cc
@@ -212,4 +212,23 @@
EXPECT_THAT(estimate, Pointwise(FloatNear(0.001), expected_sums));
}
+TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramEncoderAllZero) {
+ auto encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 20, 50, 0.0002);
+ std::vector<uint64_t> histogram(20, 0);
+ auto encoded = encoder.Encode(histogram);
+ EXPECT_THAT(encoded, ElementsAre(Pair(4, 21)));
+}
+
+TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramSumEstimator) {
+ auto estimator = SinglePassTwoDimRapporHistogramSumEstimator(20, 50, 0.0002);
+ auto estimate = estimator.ComputeSum({{{2, 1}, {5, 10}, {6, 50}},
+ {{2, 1}, {7, 1}, {15, 3}, {17, 34}},
+ {{0, 1}, {2, 3}, {15, 6}, {19, 25}}},
+ 3);
+ std::vector<double> expected_sums = {0.999, 0.000, 4.999, 0.000, 0.000, 9.998, 49.990,
+ 0.999, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000,
+ 0.000, 8.998, 0.000, 33.993, 0.000, 24.995};
+ EXPECT_THAT(estimate, Pointwise(FloatNear(0.001), expected_sums));
+}
+
} // namespace cobalt