[experimental] Implement single pass of RAPPOR over 2d representations of histograms. Change-Id: I01a6a818f792eeb6dfa70298c013b87e0a52ed52

commit: 06608077f620746339ae7f9a8ecab78196f54470 [log] [tgz]
author: Alex Zani <azani@google.com> Tue Mar 24 19:06:10 2020 +0000
committer: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Tue Mar 24 19:06:10 2020 +0000
tree: 3b09aaffd09d479fa362d68006d5b0e799f9779c
parent: e6ce017326e774cd138c79fd5d587ebaa0166730 [diff]
diff --git a/src/algorithms/experimental/histogram_encoder.cc b/src/algorithms/experimental/histogram_encoder.cc
index fc4e20c..421851a 100644
--- a/src/algorithms/experimental/histogram_encoder.cc
+++ b/src/algorithms/experimental/histogram_encoder.cc

@@ -142,4 +142,70 @@
   return sum;
 }
 
+SinglePassTwoDimRapporHistogramEncoder::SinglePassTwoDimRapporHistogramEncoder(
+    BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, uint64_t max_count, double p)
+    : gen_(gen), num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
+
+std::vector<std::pair<uint32_t, uint64_t>> SinglePassTwoDimRapporHistogramEncoder::Encode(
+    const std::vector<uint64_t>& histogram) {
+  std::vector<uint64_t> clipped_histogram(num_buckets_);
+  for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
+    clipped_histogram[bucket_index] = std::min(histogram[bucket_index], max_count_);
+  }
+  std::vector<std::pair<uint32_t, uint64_t>> encoded;
+  auto flip_bit = BernoulliDistribution(gen_, p_);
+  for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
+    for (uint64_t bucket_count = 1u; bucket_count <= max_count_; bucket_count++) {
+      if ((bucket_count == clipped_histogram[bucket_index]) != flip_bit.Sample()) {
+        encoded.emplace_back(std::pair<uint32_t, uint64_t>(bucket_index, bucket_count));
+      }
+    }
+  }
+
+  return encoded;
+}
+
+SinglePassTwoDimRapporHistogramSumEstimator::SinglePassTwoDimRapporHistogramSumEstimator(
+    uint32_t num_buckets, uint64_t max_count, double p)
+    : num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
+
+std::vector<double> SinglePassTwoDimRapporHistogramSumEstimator::ComputeSum(
+    const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
+    uint64_t num_participants) {
+  // Compute the number of occurrences of each bit ID in |encoded_histograms|.
+  std::map<std::pair<uint32_t, uint64_t>, uint64_t> raw_bit_counts;
+  for (const auto& hist : encoded_histograms) {
+    for (auto bit_id : hist) {
+      raw_bit_counts[bit_id]++;
+    }
+  }
+
+  std::vector<double> sum(num_buckets_);
+  for (uint32_t bucket_index = 0; bucket_index < num_buckets_; bucket_index++) {
+    double total_count_for_bucket = 0.0;
+    for (uint64_t bucket_count = 0; bucket_count <= max_count_; bucket_count++) {
+      // Estimate the true number of occurrences of the bit with ID (|bucket_count|, |bucket_index|)
+      // among the |num_participants| encoded bitvectors. (See Erlingsson, Pihur, Korolova, "RAPPOR:
+      // Randomized Aggregatable Privacy-Preserving Ordinal Response" section 4, taking f = 1.)
+      double estimated_bit_count =
+          (static_cast<double>(raw_bit_counts[{bucket_index, bucket_count}]) -
+           p_ * num_participants) /
+          (1.0 - 2 * p_);
+      // Clip the estimate into the range of possible true values.
+      if (estimated_bit_count < 0.0) {
+        estimated_bit_count = 0.0;
+      }
+      if (estimated_bit_count > num_participants) {
+        estimated_bit_count = num_participants;
+      }
+      // Compute the contribution of the (estimated number of) bits with true ID (|bucket_index|,
+      // |bucket_count|) to the total count for bucket |bucket_index|.
+      total_count_for_bucket += static_cast<double>(bucket_count) * estimated_bit_count;
+    }
+    sum[bucket_index] = total_count_for_bucket;
+  }
+
+  return sum;
+}
+
 }  // namespace cobalt

diff --git a/src/algorithms/experimental/histogram_encoder.h b/src/algorithms/experimental/histogram_encoder.h
index efbf623..16ed252 100644
--- a/src/algorithms/experimental/histogram_encoder.h
+++ b/src/algorithms/experimental/histogram_encoder.h

@@ -124,12 +124,65 @@
  public:
   // |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
   // individual user to each bucket count, and |p| is a noise parameter. These should be equal to
-  // the arguments that were used to construct the OccurrenceWiseHistogramEncoder which produced the
+  // the arguments that were used to construct the TwoDimRapporHistogramSumEstimator which produced the
   // encoded histograms.
   TwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
 
   // |encoded_histograms| is a vector of encoded histograms produced by an
-  // OccurrenceWiseHistogramEncoder, and |num_participants| is the number of users contributing to
+  // TwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users contributing to
+  // the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
+  // returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
+  // histograms.
+  std::vector<double> ComputeSum(
+      const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
+      uint64_t num_participants);
+
+ private:
+  uint32_t num_buckets_;
+  uint64_t max_count_;
+  double p_;
+};
+
+// An encoder that operates on histograms. The histogram is represented as a
+// bit vector of length L = |num_buckets| * (|max_count| + 1).
+// For each bucket, the kth bit is set to 1 where
+// k = |bucket_index| * |max_count| + |bucket_count|.
+// All other bits are set to 0. Then, the bit vector is encoded using Basic RAPPOR;
+// i.e., each bit is flipped with probability |p|. The encoder then returns
+// the set of pairs (bucket_index, bucket_count) corresponding to the "1" bits
+// in the encoded vector. Pairs of the form (bucket_index, 0) are not included in the result.
+class SinglePassTwoDimRapporHistogramEncoder {
+ public:
+  // |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum
+  // count that should be reported for any bucket, and |p| is the probability of a bit flip during
+  // Basic RAPPOR encoding.
+  SinglePassTwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets,
+                                         uint64_t max_count, double p);
+
+  // |histogram| is a vector of length |num_buckets|, where the n-th element is the count for that
+  // bucket. Counts should be in the range [0, |max_count|] inclusive; larger bucket counts are
+  // clipped to |max_count| before encoding.
+  std::vector<std::pair<uint32_t, uint64_t>> Encode(const std::vector<uint64_t>& histogram);
+
+ private:
+  BitGeneratorInterface<uint32_t>* gen_;
+  uint32_t num_buckets_;
+  uint64_t max_count_;
+  double p_;
+};
+
+// Estimates the true sum of a collection of histograms, each of which was encoded with a
+// SinglePassTwoDimRapporHistogramEncoder.
+class SinglePassTwoDimRapporHistogramSumEstimator {
+ public:
+  // |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
+  // individual user to each bucket count, and |p| is a noise parameter. These should be equal to
+  // the arguments that were used to construct SinglePassTwoDimRapporHistogramSumEstimator the which produced the
+  // encoded histograms.
+  SinglePassTwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
+
+  // |encoded_histograms| is a vector of encoded histograms produced by an
+  // SinglePassTwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users contributing to
   // the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
   // returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
   // histograms.

diff --git a/src/algorithms/experimental/histogram_encoder_test.cc b/src/algorithms/experimental/histogram_encoder_test.cc
index 76fc015..6942d38 100644
--- a/src/algorithms/experimental/histogram_encoder_test.cc
+++ b/src/algorithms/experimental/histogram_encoder_test.cc

@@ -212,4 +212,23 @@
   EXPECT_THAT(estimate, Pointwise(FloatNear(0.001), expected_sums));
 }
 
+TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramEncoderAllZero) {
+  auto encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 20, 50, 0.0002);
+  std::vector<uint64_t> histogram(20, 0);
+  auto encoded = encoder.Encode(histogram);
+  EXPECT_THAT(encoded, ElementsAre(Pair(4, 21)));
+}
+
+TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramSumEstimator) {
+  auto estimator = SinglePassTwoDimRapporHistogramSumEstimator(20, 50, 0.0002);
+  auto estimate = estimator.ComputeSum({{{2, 1}, {5, 10}, {6, 50}},
+                                        {{2, 1}, {7, 1}, {15, 3}, {17, 34}},
+                                        {{0, 1}, {2, 3}, {15, 6}, {19, 25}}},
+                                       3);
+  std::vector<double> expected_sums = {0.999, 0.000, 4.999, 0.000,  0.000, 9.998, 49.990,
+                                       0.999, 0.000, 0.000, 0.000,  0.000, 0.000, 0.000,
+                                       0.000, 8.998, 0.000, 33.993, 0.000, 24.995};
+  EXPECT_THAT(estimate, Pointwise(FloatNear(0.001), expected_sums));
+}
+
 }  // namespace cobalt
commit	06608077f620746339ae7f9a8ecab78196f54470	[log] [tgz]
author	Alex Zani <azani@google.com>	Tue Mar 24 19:06:10 2020 +0000
committer	CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>	Tue Mar 24 19:06:10 2020 +0000
tree	3b09aaffd09d479fa362d68006d5b0e799f9779c
parent	e6ce017326e774cd138c79fd5d587ebaa0166730 [diff]