blob: 0daf1497b1deee002787baadef59b90cd9211d02 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HISTOGRAM_ENCODER_H_
#define COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HISTOGRAM_ENCODER_H_
#include "src/algorithms/experimental/integer_encoder.h"
#include "src/algorithms/experimental/randomized_response.h"
#include "src/algorithms/random/random.h"
namespace cobalt {
// An encoder that operates on histograms. Each bucket count is independently encoded using a
// provided IntegerEncoder.
class BucketWiseHistogramEncoder {
public:
// |num_buckets| is the number of histogram buckets, and |integer_encoder| is the IntegerEncoder
// that should be used to encode each bucket count.
BucketWiseHistogramEncoder(uint32_t num_buckets, IntegerEncoder* integer_encoder);
// |histogram| is a vector of length |num_buckets|, where the n-th element is the value for that
// bucket. Values should be within the bounds that the IntegerEncoder was constructed with. The
// IntegerEncoder's encoding of the n-th bucket count is the n-th element of the return value.
std::vector<uint32_t> Encode(const std::vector<int64_t>& histogram);
private:
uint32_t num_buckets_;
IntegerEncoder* integer_encoder_;
};
// Estimates the true sum of a collection of histograms, each of which was encoded with a
// BucketWiseHistogramSumEstimator.
class BucketWiseHistogramSumEstimator {
public:
// |num_buckets| is the number of histogram buckets, and |integer_encoder| is the IntegerEncoder
// that should be used to compute the approximate sum of the counts for each bucket.
BucketWiseHistogramSumEstimator(uint32_t num_buckets, IntegerSumEstimator* integer_sum_estimator);
// |encoded_histograms| is a vector of encoded histograms, where each encoded histogram is a
// vector of indices into the bucketing scheme of the IntegerSumEstimator used to construct this
// HistogramSumEstimator. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
// returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
// histograms.
std::vector<double> ComputeSum(const std::vector<std::vector<uint32_t>>& encoded_histograms);
private:
uint32_t num_buckets_;
IntegerSumEstimator* integer_sum_estimator_;
};
// An encoder that operates on histograms. Each occurrence of a bucket index is encoded using
// randomized response on the set of buckets, and the resulting 1-hot histograms are summed to form
// the encoded histogram.
class OccurrenceWiseHistogramEncoder {
public:
// |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum
// count that should be reported for any bucket, and |p| is the noise parameter for each instance
// of randomized response.
OccurrenceWiseHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets,
uint64_t max_count, double p);
// |histogram| is a vector of length |num_buckets|, where the n-th element is the count for that
// bucket. Counts should be in the range [0, |max_count|] inclusive; larger bucket counts are
// clipped to |max_count| before encoding.
std::vector<uint64_t> Encode(const std::vector<uint64_t>& histogram);
private:
uint32_t num_buckets_;
uint64_t max_count_;
std::unique_ptr<ResponseRandomizer> randomizer_;
};
// Estimates the true sum of a collection of histograms, each of which was encoded with an
// OccurrenceWiseHistogramSumEstimator.
class OccurrenceWiseHistogramSumEstimator {
public:
// |num_buckets| is the number of histogram buckets, and |p| is a noise parameter. These should be
// equal to the arguments that were used to construct the OccurrenceWiseHistogramEncoder which
// produced the encoded histograms.
explicit OccurrenceWiseHistogramSumEstimator(uint32_t num_buckets, double p);
// |encoded_histograms| is a vector of encoded histograms produced by an
// OccurrenceWiseHistogramEncoder. For each index n in the range [0, |num_buckets| - 1],
// ComputeSum() returns an unbiased estimate of the sum of the counts in the n-th buckets of the
// encoded histograms.
std::vector<double> ComputeSum(const std::vector<std::vector<uint64_t>>& encoded_histograms);
private:
std::unique_ptr<FrequencyEstimator> frequency_estimator_;
};
// An encoder that operates on histograms. Each pair (bucket_index, bucket_count) in the input
// histogram is represented as a 1-hot bitvector of length
// L = |num_buckets| * (|max_count| + 1)
// using a fixed mapping of the set {0,...,|num_buckets| - 1} x {0,...,|max_count|} to {1,...,L}.
// The |num_buckets| 1-hot vectors are each encoded using Basic RAPPOR; i.e., each bit is flipped
// with probability |p|. The encoder then returns the multiset of pairs (bucket_index, bucket_count)
// corresponding to the "1" bits among the |num_buckets| encoded vectors. Pairs of the form
// (bucket_index, 0) are not included in the result.
class TwoDimRapporHistogramEncoder {
public:
// |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum
// count that should be reported for any bucket, and |p| is the probability of a bit flip during
// Basic RAPPOR encoding.
TwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets,
uint64_t max_count, double p);
// |histogram| is a vector of length |num_buckets|, where the n-th element is the count for that
// bucket. Counts should be in the range [0, |max_count|] inclusive; larger bucket counts are
// clipped to |max_count| before encoding.
std::vector<std::pair<uint32_t, uint64_t>> Encode(const std::vector<uint64_t>& histogram);
private:
BitGeneratorInterface<uint32_t>* gen_;
uint32_t num_buckets_;
uint64_t max_count_;
double p_;
};
// Estimates the true sum of a collection of histograms, each of which was encoded with a
// TwoDimRapporHistogramEncoder.
class TwoDimRapporHistogramSumEstimator {
public:
// |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
// individual user to each bucket count, and |p| is a noise parameter. These should be equal to
// the arguments that were used to construct the TwoDimRapporHistogramSumEstimator which produced
// the encoded histograms.
TwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
// |encoded_histograms| is a vector of encoded histograms produced by an
// TwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users contributing
// to the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum() returns an
// unbiased estimate of the sum of the counts in the n-th buckets of the encoded histograms.
std::vector<double> ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants);
private:
uint32_t num_buckets_;
uint64_t max_count_;
double p_;
};
// An encoder that operates on histograms. The histogram is represented as a
// bit vector of length L = |num_buckets| * (|max_count| + 1).
// For each bucket, the kth bit is set to 1 where
// k = |bucket_index| * |max_count| + |bucket_count|.
// All other bits are set to 0. Then, the bit vector is encoded using Basic RAPPOR;
// i.e., each bit is flipped with probability |p|. The encoder then returns
// the set of pairs (bucket_index, bucket_count) corresponding to the "1" bits
// in the encoded vector. Pairs of the form (bucket_index, 0) are not included in the result.
class SinglePassTwoDimRapporHistogramEncoder {
public:
// |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum
// count that should be reported for any bucket, and |p| is the probability of a bit flip during
// Basic RAPPOR encoding.
SinglePassTwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets,
uint64_t max_count, double p);
// |histogram| is a vector of tuples (i, j), which denotes that the count of the i-th bucket is
// equal to j > 0. The buckets that do not appear in the vector have count zero.
// bucket. Counts should be in the range [1, |max_count|] inclusive; larger bucket counts are
// clipped to |max_count| before encoding.
std::vector<std::pair<uint32_t, uint64_t>> Encode(
const std::vector<std::pair<uint32_t, uint64_t>>& histogram);
private:
BitGeneratorInterface<uint32_t>* gen_;
uint32_t num_buckets_;
uint64_t max_count_;
double p_;
};
// Estimates the true sum of a collection of histograms, each of which was encoded with a
// SinglePassTwoDimRapporHistogramEncoder.
class SinglePassTwoDimRapporHistogramSumEstimator {
public:
// |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
// individual user to each bucket count, and |p| is a noise parameter. These should be equal to
// the arguments that were used to construct the SinglePassTwoDimRapporHistogramSumEstimator which
// produced the encoded histograms.
SinglePassTwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
// |encoded_histograms| is a vector of encoded histograms produced by a
// SinglePassTwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users
// contributing to the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
// returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
// histograms.
std::vector<double> ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants);
private:
uint32_t num_buckets_;
uint64_t max_count_;
double p_;
};
// An encoder that operates on histograms.
// For each bucket_index such that the count for that bucket is greater than
// zero, the encoder include a pair (bucket_index, bucket_count) in the output.
// Then, for every pair (i, j) where i is an integer between 0 and
// |num_buckets| - 1 and j is an integer between 1 and |max_count|, the encoder
// add a pair (i, j) to the output with probability p.
class TwoDimBinomialHistogramEncoder {
public:
// |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum
// count that should be reported for any bucket, and |p| is the noise parameter, i.e.,
// the probability of additionally including each pair (i, j) where i is from
// [0, ||num_buckets| - 1|] and j is from [1, |max_count|].
TwoDimBinomialHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets,
uint64_t max_count, double p);
// |histogram| is a vector of tuples (i, j), which denotes that the count of the i-th bucket is
// equal to j. The buckets that do not appear in the vector have count zero.
// bucket. Counts should be in the range [1, |max_count|] inclusive; larger bucket counts are
// clipped to |max_count| before encoding.
std::vector<std::pair<uint32_t, uint64_t>> Encode(
const std::vector<std::pair<uint32_t, uint64_t>>& histogram);
private:
BitGeneratorInterface<uint32_t>* gen_;
uint32_t num_buckets_;
uint64_t max_count_;
double p_;
};
// Estimates the true sum of a collection of histograms, each of which was encoded with a
// TwoDimBinomialHistogramEncoder.
class TwoDimBinomialHistogramSumEstimator {
public:
// |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an
// individual user to each bucket count, and |p| is a noise parameter. These should be equal to
// the arguments that were used to construct TwoDimBinomialHistogramEncoder which produced the
// encoded histograms.
TwoDimBinomialHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p);
// |encoded_histograms| is a vector of encoded histograms produced by an
// TwoDimBinomialHistogramEncoder, and |num_participants| is the number of users contributing to
// the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum()
// returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded
// histograms.
std::vector<double> ComputeSum(
const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms,
uint64_t num_participants);
private:
uint32_t num_buckets_;
uint64_t max_count_;
double p_;
};
} // namespace cobalt
#endif // COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HISTOGRAM_ENCODER_H_