blob: f1a90cb7a32cb2e3fec6e146bdd435dd360147b2 [file] [log] [blame]
#include "src/algorithms/experimental/archived/two_dim_rappor_histogram_encoder.h"
#include <algorithm>
#include <map>
#include "src/algorithms/random/distributions.h"
#include "src/algorithms/random/random.h"
namespace cobalt {
ArchivedTwoDimRapporHistogramEncoder::ArchivedTwoDimRapporHistogramEncoder(
// TODO(b/278930401): NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, uint64_t max_count, Probability p)
: gen_(gen), num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<std::pair<uint32_t, uint64_t>> ArchivedTwoDimRapporHistogramEncoder::Encode(
const std::vector<uint64_t>& histogram) {
std::vector<uint64_t> clipped_histogram(num_buckets_);
for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
clipped_histogram[bucket_index] = std::min(histogram[bucket_index], max_count_);
}
std::vector<std::pair<uint32_t, uint64_t>> encoded;
auto dist_if_absent = BinomialDistribution(gen_, num_buckets_, p_);
auto dist_0_to_1_if_present = BinomialDistribution(gen_, num_buckets_ - 1, p_);
auto dist_1_to_1_if_present = BernoulliDistribution(gen_, Probability(1.0) - p_);
for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
// We skip encoding and sending bucket counts of 0.
for (uint64_t bucket_count = 1u; bucket_count <= max_count_; bucket_count++) {
uint64_t encoded_count =
(bucket_count == clipped_histogram[bucket_index]
? dist_0_to_1_if_present.Sample() + dist_1_to_1_if_present.Sample()
: dist_if_absent.Sample());
for (uint64_t num_occurrences = 0u; num_occurrences < encoded_count; num_occurrences++) {
encoded.emplace_back(bucket_index, bucket_count);
}
}
}
return encoded;
}
ArchivedTwoDimRapporHistogramSumEstimator::ArchivedTwoDimRapporHistogramSumEstimator(
// TODO(b/278930401): NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
uint32_t num_buckets, uint64_t max_count, Probability p)
: num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<double> ArchivedTwoDimRapporHistogramSumEstimator::ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants) const {
// Compute the number of occurrences of each bit ID in |encoded_histograms|.
std::map<std::pair<uint32_t, uint64_t>, uint64_t> raw_bit_counts;
for (const auto& hist : encoded_histograms) {
for (auto bit_id : hist) {
raw_bit_counts[bit_id]++;
}
}
// The total number of encoded 1-hot bitvectors which contributed to |encoded_histograms|.
auto num_bitvectors = static_cast<double>(num_participants * num_buckets_);
std::vector<double> sum(num_buckets_);
for (uint32_t bucket_index = 0; bucket_index < num_buckets_; bucket_index++) {
double total_count_for_bucket = 0.0;
for (uint64_t bucket_count = 0; bucket_count <= max_count_; bucket_count++) {
// Estimate the true number of occurrences of the bit with ID (|bucket_count|, |bucket_index|)
// among the |num_bitvectors| encoded bitvectors. (See Erlingsson, Pihur, Korolova, "RAPPOR:
// Randomized Aggregatable Privacy-Preserving Ordinal Response" section 4, taking f = 1.)
double estimated_bit_count =
(static_cast<double>(raw_bit_counts[{bucket_index, bucket_count}]) -
p_.get() * num_bitvectors) /
(1.0 - 2 * p_.get());
// Clip the estimate into the range of possible true values.
if (estimated_bit_count < 0.0) {
estimated_bit_count = 0.0;
}
if (estimated_bit_count > num_bitvectors) {
estimated_bit_count = num_bitvectors;
}
// Compute the contribution of the (estimated number of) bits with true ID (|bucket_index|,
// |bucket_count|) to the total count for bucket |bucket_index|.
total_count_for_bucket += static_cast<double>(bucket_count) * estimated_bit_count;
}
sum[bucket_index] = total_count_for_bucket;
}
return sum;
}
} // namespace cobalt