| // Copyright 2019 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HISTOGRAM_ENCODER_H_ |
| #define COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HISTOGRAM_ENCODER_H_ |
| |
| #include "src/algorithms/experimental/integer_encoder.h" |
| #include "src/algorithms/experimental/randomized_response.h" |
| #include "src/algorithms/random/random.h" |
| |
| namespace cobalt { |
| |
| // An encoder that operates on histograms. Each bucket count is independently encoded using a |
| // provided IntegerEncoder. |
| class BucketWiseHistogramEncoder { |
| public: |
| // |num_buckets| is the number of histogram buckets, and |integer_encoder| is the IntegerEncoder |
| // that should be used to encode each bucket count. |
| BucketWiseHistogramEncoder(uint32_t num_buckets, IntegerEncoder* integer_encoder); |
| |
| // |histogram| is a vector of length |num_buckets|, where the n-th element is the value for that |
| // bucket. Values should be within the bounds that the IntegerEncoder was constructed with. The |
| // IntegerEncoder's encoding of the n-th bucket count is the n-th element of the return value. |
| std::vector<uint32_t> Encode(const std::vector<int64_t>& histogram); |
| |
| private: |
| uint32_t num_buckets_; |
| IntegerEncoder* integer_encoder_; |
| }; |
| |
| // Estimates the true sum of a collection of histograms, each of which was encoded with a |
| // BucketWiseHistogramSumEstimator. |
| class BucketWiseHistogramSumEstimator { |
| public: |
| // |num_buckets| is the number of histogram buckets, and |integer_encoder| is the IntegerEncoder |
| // that should be used to compute the approximate sum of the counts for each bucket. |
| BucketWiseHistogramSumEstimator(uint32_t num_buckets, IntegerSumEstimator* integer_sum_estimator); |
| |
| // |encoded_histograms| is a vector of encoded histograms, where each encoded histogram is a |
| // vector of indices into the bucketing scheme of the IntegerSumEstimator used to construct this |
| // HistogramSumEstimator. For each index n in the range [0, |num_buckets| - 1], ComputeSum() |
| // returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded |
| // histograms. |
| std::vector<double> ComputeSum(const std::vector<std::vector<uint32_t>>& encoded_histograms); |
| |
| private: |
| uint32_t num_buckets_; |
| IntegerSumEstimator* integer_sum_estimator_; |
| }; |
| |
| // An encoder that operates on histograms. Each occurrence of a bucket index is encoded using |
| // randomized response on the set of buckets, and the resulting 1-hot histograms are summed to form |
| // the encoded histogram. |
| class OccurrenceWiseHistogramEncoder { |
| public: |
| // |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum |
| // count that should be reported for any bucket, and |p| is the noise parameter for each instance |
| // of randomized response. |
| OccurrenceWiseHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, |
| uint64_t max_count, double p); |
| |
| // |histogram| is a vector of length |num_buckets|, where the n-th element is the count for that |
| // bucket. Counts should be in the range [0, |max_count|] inclusive; larger bucket counts are |
| // clipped to |max_count| before encoding. |
| std::vector<uint64_t> Encode(const std::vector<uint64_t>& histogram); |
| |
| private: |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| std::unique_ptr<ResponseRandomizer> randomizer_; |
| }; |
| |
| // Estimates the true sum of a collection of histograms, each of which was encoded with an |
| // OccurrenceWiseHistogramSumEstimator. |
| class OccurrenceWiseHistogramSumEstimator { |
| public: |
| // |num_buckets| is the number of histogram buckets, and |p| is a noise parameter. These should be |
| // equal to the arguments that were used to construct the OccurrenceWiseHistogramEncoder which |
| // produced the encoded histograms. |
| explicit OccurrenceWiseHistogramSumEstimator(uint32_t num_buckets, double p); |
| |
| // |encoded_histograms| is a vector of encoded histograms produced by an |
| // OccurrenceWiseHistogramEncoder. For each index n in the range [0, |num_buckets| - 1], |
| // ComputeSum() returns an unbiased estimate of the sum of the counts in the n-th buckets of the |
| // encoded histograms. |
| std::vector<double> ComputeSum(const std::vector<std::vector<uint64_t>>& encoded_histograms); |
| |
| private: |
| std::unique_ptr<FrequencyEstimator> frequency_estimator_; |
| }; |
| |
| // An encoder that operates on histograms. Each pair (bucket_index, bucket_count) in the input |
| // histogram is represented as a 1-hot bitvector of length |
| // L = |num_buckets| * (|max_count| + 1) |
| // using a fixed mapping of the set {0,...,|num_buckets| - 1} x {0,...,|max_count|} to {1,...,L}. |
| // The |num_buckets| 1-hot vectors are each encoded using Basic RAPPOR; i.e., each bit is flipped |
| // with probability |p|. The encoder then returns the multiset of pairs (bucket_index, bucket_count) |
| // corresponding to the "1" bits among the |num_buckets| encoded vectors. Pairs of the form |
| // (bucket_index, 0) are not included in the result. |
| class TwoDimRapporHistogramEncoder { |
| public: |
| // |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum |
| // count that should be reported for any bucket, and |p| is the probability of a bit flip during |
| // Basic RAPPOR encoding. |
| TwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, |
| uint64_t max_count, double p); |
| |
| // |histogram| is a vector of length |num_buckets|, where the n-th element is the count for that |
| // bucket. Counts should be in the range [0, |max_count|] inclusive; larger bucket counts are |
| // clipped to |max_count| before encoding. |
| std::vector<std::pair<uint32_t, uint64_t>> Encode(const std::vector<uint64_t>& histogram); |
| |
| private: |
| BitGeneratorInterface<uint32_t>* gen_; |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| double p_; |
| }; |
| |
| // Estimates the true sum of a collection of histograms, each of which was encoded with a |
| // TwoDimRapporHistogramEncoder. |
| class TwoDimRapporHistogramSumEstimator { |
| public: |
| // |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an |
| // individual user to each bucket count, and |p| is a noise parameter. These should be equal to |
| // the arguments that were used to construct the TwoDimRapporHistogramSumEstimator which produced |
| // the encoded histograms. |
| TwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p); |
| |
| // |encoded_histograms| is a vector of encoded histograms produced by an |
| // TwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users contributing |
| // to the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum() returns an |
| // unbiased estimate of the sum of the counts in the n-th buckets of the encoded histograms. |
| std::vector<double> ComputeSum( |
| const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms, |
| uint64_t num_participants); |
| |
| private: |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| double p_; |
| }; |
| |
| // An encoder that operates on histograms. The histogram is represented as a |
| // bit vector of length L = |num_buckets| * (|max_count| + 1). |
| // For each bucket, the kth bit is set to 1 where |
| // k = |bucket_index| * |max_count| + |bucket_count|. |
| // All other bits are set to 0. Then, the bit vector is encoded using Basic RAPPOR; |
| // i.e., each bit is flipped with probability |p|. The encoder then returns |
| // the set of pairs (bucket_index, bucket_count) corresponding to the "1" bits |
| // in the encoded vector. Pairs of the form (bucket_index, 0) are not included in the result. |
| class SinglePassTwoDimRapporHistogramEncoder { |
| public: |
| // |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum |
| // count that should be reported for any bucket, and |p| is the probability of a bit flip during |
| // Basic RAPPOR encoding. |
| SinglePassTwoDimRapporHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, |
| uint64_t max_count, double p); |
| |
| // |histogram| is a vector of tuples (i, j), which denotes that the count of the i-th bucket is |
| // equal to j > 0. The buckets that do not appear in the vector have count zero. |
| // bucket. Counts should be in the range [1, |max_count|] inclusive; larger bucket counts are |
| // clipped to |max_count| before encoding. |
| std::vector<std::pair<uint32_t, uint64_t>> Encode( |
| const std::vector<std::pair<uint32_t, uint64_t>>& histogram); |
| |
| private: |
| BitGeneratorInterface<uint32_t>* gen_; |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| double p_; |
| }; |
| |
| // Estimates the true sum of a collection of histograms, each of which was encoded with a |
| // SinglePassTwoDimRapporHistogramEncoder. |
| class SinglePassTwoDimRapporHistogramSumEstimator { |
| public: |
| // |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an |
| // individual user to each bucket count, and |p| is a noise parameter. These should be equal to |
| // the arguments that were used to construct the SinglePassTwoDimRapporHistogramSumEstimator which |
| // produced the encoded histograms. |
| SinglePassTwoDimRapporHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p); |
| |
| // |encoded_histograms| is a vector of encoded histograms produced by a |
| // SinglePassTwoDimRapporHistogramSumEstimator, and |num_participants| is the number of users |
| // contributing to the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum() |
| // returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded |
| // histograms. |
| std::vector<double> ComputeSum( |
| const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms, |
| uint64_t num_participants); |
| |
| private: |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| double p_; |
| }; |
| |
| // An encoder that operates on histograms. |
| // For each bucket_index such that the count for that bucket is greater than |
| // zero, the encoder include a pair (bucket_index, bucket_count) in the output. |
| // Then, for every pair (i, j) where i is an integer between 0 and |
| // |num_buckets| - 1 and j is an integer between 1 and |max_count|, the encoder |
| // add a pair (i, j) to the output with probability p. |
| class TwoDimBinomialHistogramEncoder { |
| public: |
| // |num_buckets| is the number of buckets in each input histogram, |max_count| is the maximum |
| // count that should be reported for any bucket, and |p| is the noise parameter, i.e., |
| // the probability of additionally including each pair (i, j) where i is from |
| // [0, ||num_buckets| - 1|] and j is from [1, |max_count|]. |
| TwoDimBinomialHistogramEncoder(BitGeneratorInterface<uint32_t>* gen, uint32_t num_buckets, |
| uint64_t max_count, double p); |
| |
| // |histogram| is a vector of tuples (i, j), which denotes that the count of the i-th bucket is |
| // equal to j. The buckets that do not appear in the vector have count zero. |
| // bucket. Counts should be in the range [1, |max_count|] inclusive; larger bucket counts are |
| // clipped to |max_count| before encoding. |
| std::vector<std::pair<uint32_t, uint64_t>> Encode( |
| const std::vector<std::pair<uint32_t, uint64_t>>& histogram); |
| |
| private: |
| BitGeneratorInterface<uint32_t>* gen_; |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| double p_; |
| }; |
| |
| // Estimates the true sum of a collection of histograms, each of which was encoded with a |
| // TwoDimBinomialHistogramEncoder. |
| class TwoDimBinomialHistogramSumEstimator { |
| public: |
| // |num_buckets| is the number of histogram buckets, |max_count| is the maximum contribution of an |
| // individual user to each bucket count, and |p| is a noise parameter. These should be equal to |
| // the arguments that were used to construct TwoDimBinomialHistogramEncoder which produced the |
| // encoded histograms. |
| TwoDimBinomialHistogramSumEstimator(uint32_t num_buckets, uint64_t max_count, double p); |
| |
| // |encoded_histograms| is a vector of encoded histograms produced by an |
| // TwoDimBinomialHistogramEncoder, and |num_participants| is the number of users contributing to |
| // the report. For each index n in the range [0, |num_buckets| - 1], ComputeSum() |
| // returns an unbiased estimate of the sum of the counts in the n-th buckets of the encoded |
| // histograms. |
| std::vector<double> ComputeSum( |
| const std::vector<std::vector<std::pair<uint32_t, uint64_t>>>& encoded_histograms, |
| uint64_t num_participants); |
| |
| private: |
| uint32_t num_buckets_; |
| uint64_t max_count_; |
| double p_; |
| }; |
| |
| } // namespace cobalt |
| |
| #endif // COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HISTOGRAM_ENCODER_H_ |