blob: 3cb9d2a07beb99c120e513f029fac0c112f33b7e [file] [log] [blame]
#include "src/algorithms/experimental/histogram_encoder.h"
#include <algorithm>
#include <map>
#include "src/algorithms/experimental/integer_encoder.h"
#include "src/algorithms/experimental/randomized_response.h"
#include "src/algorithms/random/random.h"
namespace cobalt {
BucketWiseHistogramEncoder::BucketWiseHistogramEncoder(uint32_t num_buckets,
IntegerEncoder* integer_encoder)
: num_buckets_(num_buckets), integer_encoder_(integer_encoder) {}
std::vector<uint32_t> BucketWiseHistogramEncoder::Encode(const std::vector<int64_t>& histogram) {
std::vector<uint32_t> encoded(num_buckets_);
for (uint32_t index = 0; index < num_buckets_; index++) {
encoded[index] = integer_encoder_->Encode(histogram[index]);
}
return encoded;
}
BucketWiseHistogramSumEstimator::BucketWiseHistogramSumEstimator(
uint32_t num_buckets, IntegerSumEstimator* integer_sum_estimator)
: num_buckets_(num_buckets), integer_sum_estimator_(integer_sum_estimator) {}
std::vector<double> BucketWiseHistogramSumEstimator::ComputeSum(
const std::vector<std::vector<uint32_t>>& encoded_histograms) {
std::vector<double> decoded(num_buckets_);
for (uint32_t index = 0; index < num_buckets_; index++) {
std::vector<uint32_t> encoded_counts(encoded_histograms.size());
for (size_t k = 0; k < encoded_histograms.size(); k++) {
encoded_counts[k] = encoded_histograms[k][index];
}
decoded[index] = integer_sum_estimator_->ComputeSum(encoded_counts);
}
return decoded;
}
OccurrenceWiseHistogramEncoder::OccurrenceWiseHistogramEncoder(BitGeneratorInterface<uint32_t>* gen,
uint32_t num_buckets,
uint64_t max_count, double p)
: num_buckets_(num_buckets), max_count_(max_count) {
randomizer_ = std::make_unique<ResponseRandomizer>(gen, num_buckets - 1, p);
}
std::vector<uint64_t> OccurrenceWiseHistogramEncoder::Encode(
const std::vector<uint64_t>& histogram) {
std::vector<uint64_t> encoded_histogram(num_buckets_);
for (uint32_t bucket_index = 0u; bucket_index < histogram.size(); bucket_index++) {
uint64_t bucket_count = std::min(histogram[bucket_index], max_count_);
for (uint64_t i = 0u; i < bucket_count; i++) {
encoded_histogram[randomizer_->Encode(bucket_index)]++;
}
}
return encoded_histogram;
}
OccurrenceWiseHistogramSumEstimator::OccurrenceWiseHistogramSumEstimator(uint32_t num_buckets,
double p) {
frequency_estimator_ = std::make_unique<FrequencyEstimator>(num_buckets - 1, p);
}
std::vector<double> OccurrenceWiseHistogramSumEstimator::ComputeSum(
const std::vector<std::vector<uint64_t>>& encoded_histograms) {
return frequency_estimator_->GetFrequenciesFromHistograms(encoded_histograms);
}
TwoDimRapporHistogramEncoder::TwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen,
uint32_t num_buckets, uint64_t max_count,
double p)
: gen_(gen), num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<std::pair<uint32_t, uint64_t>> TwoDimRapporHistogramEncoder::Encode(
const std::vector<uint64_t>& histogram) {
std::vector<uint64_t> clipped_histogram(num_buckets_);
for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
clipped_histogram[bucket_index] = std::min(histogram[bucket_index], max_count_);
}
std::vector<std::pair<uint32_t, uint64_t>> encoded;
auto dist_if_absent = BinomialDistribution(gen_, num_buckets_, p_);
auto dist_0_to_1_if_present = BinomialDistribution(gen_, num_buckets_ - 1, p_);
auto dist_1_to_1_if_present = BernoulliDistribution(gen_, 1.0 - p_);
for (uint32_t bucket_index = 0u; bucket_index < num_buckets_; bucket_index++) {
// We skip encoding and sending bucket counts of 0.
for (uint64_t bucket_count = 1u; bucket_count <= max_count_; bucket_count++) {
uint64_t encoded_count =
(bucket_count == clipped_histogram[bucket_index]
? dist_0_to_1_if_present.Sample() + dist_1_to_1_if_present.Sample()
: dist_if_absent.Sample());
for (uint64_t num_occurrences = 0u; num_occurrences < encoded_count; num_occurrences++) {
encoded.emplace_back(std::pair<uint32_t, uint64_t>(bucket_index, bucket_count));
}
}
}
return encoded;
}
TwoDimRapporHistogramSumEstimator::TwoDimRapporHistogramSumEstimator(uint32_t num_buckets,
uint64_t max_count, double p)
: num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<double> TwoDimRapporHistogramSumEstimator::ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants) {
// Compute the number of occurrences of each bit ID in |encoded_histograms|.
std::map<std::pair<uint32_t, uint64_t>, uint64_t> raw_bit_counts;
for (const auto& hist : encoded_histograms) {
for (auto bit_id : hist) {
raw_bit_counts[bit_id]++;
}
}
// The total number of encoded 1-hot bitvectors which contributed to |encoded_histograms|.
auto num_bitvectors = static_cast<double>(num_participants * num_buckets_);
std::vector<double> sum(num_buckets_);
for (uint32_t bucket_index = 0; bucket_index < num_buckets_; bucket_index++) {
double total_count_for_bucket = 0.0;
for (uint64_t bucket_count = 0; bucket_count <= max_count_; bucket_count++) {
// Estimate the true number of occurrences of the bit with ID (|bucket_count|, |bucket_index|)
// among the |num_bitvectors| encoded bitvectors. (See Erlingsson, Pihur, Korolova, "RAPPOR:
// Randomized Aggregatable Privacy-Preserving Ordinal Response" section 4, taking f = 1.)
double estimated_bit_count =
(static_cast<double>(raw_bit_counts[{bucket_index, bucket_count}]) -
p_ * num_bitvectors) /
(1.0 - 2 * p_);
// Clip the estimate into the range of possible true values.
if (estimated_bit_count < 0.0) {
estimated_bit_count = 0.0;
}
if (estimated_bit_count > num_bitvectors) {
estimated_bit_count = num_bitvectors;
}
// Compute the contribution of the (estimated number of) bits with true ID (|bucket_index|,
// |bucket_count|) to the total count for bucket |bucket_index|.
total_count_for_bucket += static_cast<double>(bucket_count) * estimated_bit_count;
}
sum[bucket_index] = total_count_for_bucket;
}
return sum;
}
SinglePassTwoDimRapporHistogramEncoder::SinglePassTwoDimRapporHistogramEncoder(
BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, uint64_t max_count, double p)
: gen_(gen), num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<std::pair<uint32_t, uint64_t>> SinglePassTwoDimRapporHistogramEncoder::Encode(
const std::vector<std::pair<uint32_t, uint64_t>>& histogram) {
std::vector<std::pair<uint32_t, uint64_t>> clipped_histogram;
clipped_histogram.reserve(histogram.size());
for (std::pair<uint32_t, uint64_t> bucket_index_and_count : histogram) {
clipped_histogram.emplace_back(bucket_index_and_count.first,
std::min(bucket_index_and_count.second, max_count_));
}
if (p_ == 0.0) {
return clipped_histogram;
}
std::vector<std::pair<uint32_t, uint64_t>> encoded;
auto geom_dist = GeometricDistribution(gen_, p_);
// Current index into an enumeration of the set [0,...,|num_buckets|-1] x [1,...,|max_count|]. We
// will draw a random sample to decide whether to include the pair (bucket_index, count) at this
// index.
uint64_t current_histogram_index = 0;
// Next index to be flipped.
uint64_t flipped_index = geom_dist.Sample();
while (flipped_index < num_buckets_ * max_count_) {
auto flipped_bucket_index_and_count =
std::pair<uint32_t, uint64_t>(flipped_index / max_count_, 1 + (flipped_index % max_count_));
while (current_histogram_index < clipped_histogram.size() &&
clipped_histogram[current_histogram_index] < flipped_bucket_index_and_count) {
// The bit at clipped_histogram[current_histogram_index] is not flipped.
// Add it to the output.
encoded.emplace_back(clipped_histogram[current_histogram_index]);
current_histogram_index++;
}
if (current_histogram_index < clipped_histogram.size() &&
clipped_histogram[current_histogram_index] == flipped_bucket_index_and_count) {
// The bit at histogram[current_histogram_index] is flipped. Do not add it to the output.
current_histogram_index++;
} else {
// The bit at clipped_histogram[flipped_bucket_index_and_count] is flipped.
// Add it to the output.
encoded.emplace_back(flipped_bucket_index_and_count);
}
// Compute the next index to be flipped.
flipped_index += (geom_dist.Sample() + 1);
}
// Add the remaining pairs in the input histogram to the output, as they are not flipped.
while (current_histogram_index < clipped_histogram.size()) {
encoded.emplace_back(clipped_histogram[current_histogram_index]);
current_histogram_index++;
}
return encoded;
}
SinglePassTwoDimRapporHistogramSumEstimator::SinglePassTwoDimRapporHistogramSumEstimator(
uint32_t num_buckets, uint64_t max_count, double p)
: num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<double> SinglePassTwoDimRapporHistogramSumEstimator::ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants) {
// Compute the number of occurrences of each bit ID in |encoded_histograms|.
std::map<std::pair<uint32_t, uint64_t>, uint64_t> raw_bit_counts;
for (const auto& hist : encoded_histograms) {
for (auto bit_id : hist) {
raw_bit_counts[bit_id]++;
}
}
std::vector<double> sum(num_buckets_);
for (uint32_t bucket_index = 0; bucket_index < num_buckets_; bucket_index++) {
double total_count_for_bucket = 0.0;
for (uint64_t bucket_count = 0; bucket_count <= max_count_; bucket_count++) {
// Estimate the true number of occurrences of the bit with ID (|bucket_count|, |bucket_index|)
// among the |num_participants| encoded bitvectors. (See Erlingsson, Pihur, Korolova, "RAPPOR:
// Randomized Aggregatable Privacy-Preserving Ordinal Response" section 4, taking f = 1.)
double estimated_bit_count =
(static_cast<double>(raw_bit_counts[{bucket_index, bucket_count}]) -
p_ * static_cast<double>(num_participants)) /
(1.0 - 2 * p_);
// Clip the estimate into the range of possible true values.
if (estimated_bit_count < 0.0) {
estimated_bit_count = 0.0;
}
if (estimated_bit_count > static_cast<double>(num_participants)) {
estimated_bit_count = static_cast<double>(num_participants);
}
// Compute the contribution of the (estimated number of) bits with true ID (|bucket_index|,
// |bucket_count|) to the total count for bucket |bucket_index|.
total_count_for_bucket += static_cast<double>(bucket_count) * estimated_bit_count;
}
sum[bucket_index] = total_count_for_bucket;
}
return sum;
}
TwoDimBinomialHistogramEncoder::TwoDimBinomialHistogramEncoder(BitGeneratorInterface<uint32_t>* gen,
uint32_t num_buckets,
uint64_t max_count, double p)
: gen_(gen), num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<std::pair<uint32_t, uint64_t>> TwoDimBinomialHistogramEncoder::Encode(
const std::vector<std::pair<uint32_t, uint64_t>>& histogram) {
std::vector<std::pair<uint32_t, uint64_t>> encoded;
encoded.reserve(histogram.size());
// Add all input pairs to the output.
for (std::pair<uint32_t, uint64_t> bucket_index_and_count : histogram) {
encoded.emplace_back(bucket_index_and_count.first,
std::min(bucket_index_and_count.second, max_count_));
}
if (p_ == 0.0) {
return encoded;
}
// Add noise to the output; each pair is added with probability p_.
auto geom_dist = GeometricDistribution(gen_, p_);
uint64_t flipped_index = geom_dist.Sample();
while (flipped_index < num_buckets_ * max_count_) {
encoded.emplace_back(flipped_index / max_count_, 1 + flipped_index % max_count_);
flipped_index += (geom_dist.Sample() + 1);
}
return encoded;
}
TwoDimBinomialHistogramSumEstimator::TwoDimBinomialHistogramSumEstimator(uint32_t num_buckets,
uint64_t max_count,
double p)
: num_buckets_(num_buckets), max_count_(max_count), p_(p) {}
std::vector<double> TwoDimBinomialHistogramSumEstimator::ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants) {
// Compute the number of occurrences of each bit ID in |encoded_histograms|.
std::map<std::pair<uint32_t, uint64_t>, uint64_t> raw_bit_counts;
for (const auto& hist : encoded_histograms) {
for (auto bit_id : hist) {
raw_bit_counts[bit_id]++;
}
}
std::vector<double> sum(num_buckets_);
for (uint32_t bucket_index = 0; bucket_index < num_buckets_; bucket_index++) {
double total_count_for_bucket = 0.0;
for (uint64_t bucket_count = 0; bucket_count <= max_count_; bucket_count++) {
// Estimate the true number of occurrences of the bit with ID (|bucket_count|, |bucket_index|)
double estimated_bit_count =
(static_cast<double>(raw_bit_counts[{bucket_index, bucket_count}]) -
p_ * static_cast<double>(num_participants));
// Clip the estimate into the range of possible true values.
if (estimated_bit_count < 0.0) {
estimated_bit_count = 0.0;
}
if (estimated_bit_count > static_cast<double>(num_participants)) {
estimated_bit_count = static_cast<double>(num_participants);
}
// Compute the contribution of the (estimated number of) bits with true ID (|bucket_index|,
// |bucket_count|) to the total count for bucket |bucket_index|.
total_count_for_bucket += static_cast<double>(bucket_count) * estimated_bit_count;
}
sum[bucket_index] = total_count_for_bucket;
}
return sum;
}
} // namespace cobalt