| #include "src/algorithms/experimental/histogram_encoder.h" |
| |
| #include <gmock/gmock.h> |
| #include <gtest/gtest.h> |
| |
| #include "src/algorithms/experimental/integer_encoder.h" |
| #include "src/algorithms/experimental/randomized_response.h" |
| #include "src/algorithms/random/random.h" |
| |
| using testing::DoubleNear; |
| using testing::ElementsAre; |
| using testing::Pair; |
| using testing::Pointwise; |
| using testing::UnorderedElementsAre; |
| |
| namespace cobalt { |
| |
| class HistogramEncoderTest : public ::testing::Test { |
| protected: |
| void SetUp() override { gen_ = std::make_unique<RandomNumberGenerator>(0); } |
| |
| RandomNumberGenerator* GetGenerator() { return gen_.get(); } |
| |
| // Create an IntegerEncoder for the range [0, |max_count|] where the number of partitions is equal |
| // to |max_count|, and where the probability of a data-independent response is 0. Both the initial |
| // fixed-point encoding and the final output of the encoder are deterministic in this case. |
| std::unique_ptr<IntegerEncoder> MakeDeterministicIntegerEncoder(uint64_t max_count) { |
| return std::make_unique<IntegerEncoder>(GetGenerator(), /*min_int=*/0, /*max_int=*/max_count, |
| /*partitions=*/max_count, |
| /*p=*/0.0); |
| } |
| |
| // Create an IntegerSumEstimator for an IntegerEncoder with range [0, |max_count|] and where the |
| // number of partitions is equal to |max_count|. |
| static std::unique_ptr<IntegerSumEstimator> MakeIntegerSumEstimator(uint64_t max_count, |
| double p) { |
| return std::make_unique<IntegerSumEstimator>(/*min_int=*/0, /*max_int=*/max_count, |
| /*partitions=*/max_count, p); |
| } |
| |
| private: |
| std::unique_ptr<RandomNumberGenerator> gen_; |
| }; |
| |
| // Bucket-wise encoding with p = 0. The encoder should snap negative counts to 0 and overflow |
| // counts to |max_count|, but otherwise leave counts unchanged. |
| TEST_F(HistogramEncoderTest, BucketWiseEncode) { |
| // Parameters for encoding bucket counts |
| uint64_t max_count = 5; |
| auto integer_encoder = MakeDeterministicIntegerEncoder(max_count); |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_encoder = BucketWiseHistogramEncoder(num_buckets, integer_encoder.get()); |
| |
| std::vector<int64_t> histogram = {-1, 0, 1, 2, 3, 4, 5, 6, 0, 1}; |
| std::vector<uint32_t> encoded = histogram_encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(0u, 0u, 1u, 2u, 3u, 4u, 5u, 5u, 0u, 1u)); |
| } |
| |
| // Bucket-wise encoding with p > 0. The expected result depends on the seed passed to |gen_|. |
| TEST_F(HistogramEncoderTest, BucketWiseEncodeNonzeroP) { |
| // Parameters for encoding occurrences |
| uint64_t max_count = 5; |
| double p = 0.1; |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_encoder = |
| OccurrenceWiseHistogramEncoder(GetGenerator(), num_buckets, max_count, p); |
| |
| std::vector<uint64_t> histogram = {0, 0, 1, 2, 3, 4, 5, 6, 0, 1}; |
| std::vector<uint64_t> encoded = histogram_encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(0ul, 0ul, 2ul, 2ul, 3ul, 4ul, 5ul, 4ul, 0ul, 1ul)); |
| } |
| |
| TEST_F(HistogramEncoderTest, BucketWiseEstimateSum) { |
| // Parameters for encoding bucket counts |
| uint64_t max_count = 5; |
| double p = 0.0; |
| auto integer_sum_estimator = MakeIntegerSumEstimator(max_count, p); |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_sum_estimator = |
| BucketWiseHistogramSumEstimator(num_buckets, integer_sum_estimator.get()); |
| |
| // Since |partitions| is 5, the elements of the encoded histograms have values between 0 and 5 |
| // inclusive. |
| std::vector<uint32_t> encoded_histogram_1 = {0, 1, 2, 3, 4, 5, 4, 3, 2, 1}; |
| std::vector<uint32_t> encoded_histogram_2 = {1, 2, 3, 4, 5, 5, 4, 3, 2, 1}; |
| std::vector<uint32_t> encoded_histogram_3 = {4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; |
| |
| std::vector<std::vector<uint32_t>> encoded_histograms = {encoded_histogram_1, encoded_histogram_2, |
| encoded_histogram_3}; |
| |
| auto estimate = histogram_sum_estimator.ComputeSum(encoded_histograms); |
| std::vector<double> expected_sums = {5.0, 7.0, 9.0, 11.0, 13.0, 14.0, 12.0, 10.0, 8.0, 6.0}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| TEST_F(HistogramEncoderTest, BucketWiseEstimateSumNonzeroP) { |
| // Parameters for encoding bucket counts |
| uint64_t max_count = 5; |
| double p = 0.1; |
| auto integer_sum_estimator = MakeIntegerSumEstimator(max_count, p); |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_sum_estimator = |
| BucketWiseHistogramSumEstimator(num_buckets, integer_sum_estimator.get()); |
| |
| // Since |max_count| is 5 and this is equal to the number of partitions, the elements of the |
| // encoded histograms have values between 0 and 5 inclusive. |
| std::vector<uint32_t> encoded_histogram_1 = {0, 1, 2, 3, 4, 5, 4, 3, 2, 1}; |
| std::vector<uint32_t> encoded_histogram_2 = {1, 2, 3, 4, 5, 5, 4, 3, 2, 1}; |
| std::vector<uint32_t> encoded_histogram_3 = {4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; |
| |
| std::vector<std::vector<uint32_t>> encoded_histograms = {encoded_histogram_1, encoded_histogram_2, |
| encoded_histogram_3}; |
| |
| auto estimate = histogram_sum_estimator.ComputeSum(encoded_histograms); |
| std::vector<double> expected_sums = {4.722, 6.944, 9.166, 11.388, 13.611, |
| 14.722, 12.500, 10.277, 8.0555, 5.833}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| // Occurrence-wise encoding with p = 0. The encoder should snap overflow counts to the max count, |
| // but otherwise leave counts unchanged. |
| TEST_F(HistogramEncoderTest, OccurrenceWiseEncode) { |
| // Parameters for encoding occurrences |
| uint64_t max_count = 5; |
| double p = 0.0; |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_encoder = |
| OccurrenceWiseHistogramEncoder(GetGenerator(), num_buckets, max_count, p); |
| |
| std::vector<uint64_t> histogram = {0, 0, 1, 2, 3, 4, 5, 6, 0, 1}; |
| std::vector<uint64_t> encoded = histogram_encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(0ul, 0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 5ul, 0ul, 1ul)); |
| } |
| |
| // Occurrence-wise encoding with p > 0. The expected result depends on the seed passed to |gen_|. |
| TEST_F(HistogramEncoderTest, OccurrenceWiseEncodeNonzeroP) { |
| // Parameters for encoding occurrences |
| uint64_t max_count = 5; |
| double p = 0.1; |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_encoder = |
| OccurrenceWiseHistogramEncoder(GetGenerator(), num_buckets, max_count, p); |
| |
| std::vector<uint64_t> histogram = {0, 0, 1, 2, 3, 4, 5, 6, 0, 1}; |
| std::vector<uint64_t> encoded = histogram_encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(0ul, 0ul, 2ul, 2ul, 3ul, 4ul, 5ul, 4ul, 0ul, 1ul)); |
| } |
| |
| TEST_F(HistogramEncoderTest, OccurrenceWiseEstimateSum) { |
| // Noise parameter for encoded occurrences |
| double p = 0.0; |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_sum_estimator = OccurrenceWiseHistogramSumEstimator(num_buckets, p); |
| |
| std::vector<uint64_t> encoded_histogram_1 = {0, 1, 2, 3, 4, 5, 4, 3, 2, 1}; |
| std::vector<uint64_t> encoded_histogram_2 = {1, 2, 3, 4, 5, 5, 4, 3, 2, 1}; |
| std::vector<uint64_t> encoded_histogram_3 = {4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; |
| std::vector<std::vector<uint64_t>> encoded_histograms = {encoded_histogram_1, encoded_histogram_2, |
| encoded_histogram_3}; |
| auto estimate = histogram_sum_estimator.ComputeSum(encoded_histograms); |
| std::vector<double> expected_sums = {5.0, 7.0, 9.0, 11.0, 13.0, 14.0, 12.0, 10.0, 8.0, 6.0}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| // Occurrence-wise estimation with p > 0. |
| TEST_F(HistogramEncoderTest, OccurrenceWiseEstimateSumNonzeroP) { |
| // Noise parameter for encoded occurrences |
| double p = 0.1; |
| // Number of buckets in the top-level histogram |
| uint32_t num_buckets = 10; |
| auto histogram_sum_estimator = OccurrenceWiseHistogramSumEstimator(num_buckets, p); |
| |
| std::vector<uint64_t> encoded_histogram_1 = {0, 1, 2, 3, 4, 5, 4, 3, 2, 1}; |
| std::vector<uint64_t> encoded_histogram_2 = {1, 2, 3, 4, 5, 5, 4, 3, 2, 1}; |
| std::vector<uint64_t> encoded_histogram_3 = {4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; |
| std::vector<std::vector<uint64_t>> encoded_histograms = {encoded_histogram_1, encoded_histogram_2, |
| encoded_histogram_3}; |
| auto estimate = histogram_sum_estimator.ComputeSum(encoded_histograms); |
| std::vector<double> expected_sums = {4.500, 6.722, 8.944, 11.166, 13.388, |
| 14.500, 12.277, 10.055, 7.833, 5.611}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimRapporHistogramEncoderAllZero) { |
| auto encoder = TwoDimRapporHistogramEncoder(GetGenerator(), 20, 50, 0.0002); |
| std::vector<uint64_t> histogram(20, 0); |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(17u, 39ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimRapporHistogramEncoder) { |
| auto encoder = TwoDimRapporHistogramEncoder(GetGenerator(), 20, 50, 0.0002); |
| std::vector<uint64_t> histogram = {0, 0, 1, 0, 0, 10, 60, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0}; |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(2u, 1ul), Pair(5u, 10ul), Pair(6u, 50ul), Pair(7u, 1ul), |
| Pair(15u, 3ul), Pair(17u, 34ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimRapporHistogramSumEstimator) { |
| auto estimator = TwoDimRapporHistogramSumEstimator(20, 50, 0.0002); |
| auto estimate = estimator.ComputeSum({{{2, 1}, {5, 10}, {6, 50}}, |
| {{2, 1}, {7, 1}, {15, 3}, {17, 34}}, |
| {{0, 1}, {2, 3}, {15, 6}, {19, 25}}}, |
| 3); |
| std::vector<double> expected_sums = {0.988, 0.000, 4.953, 0.000, 0.000, 9.883, 49.419, |
| 0.988, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, |
| 0.000, 8.895, 0.000, 33.605, 0.000, 24.709}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramEncoderAllZero) { |
| // Input is a histogram with all counts equal to zero. |
| std::vector<std::pair<uint32_t, uint64_t>> histogram; |
| |
| // With the constant seed used by all tests in this file and a small noise parameter, the output |
| // histogram happens to be equal to the input histogram. |
| auto encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 20, 50, 0.0002); |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_EQ(encoded.size(), 0u); |
| |
| // When the noise parameter is equal to 1 (i.e., all bits get flipped) then the output histogram |
| // is the inverse of the input histogram: a pair (index, count) with count > 0 appears in the |
| // output if and only if it did not appear in the input. |
| encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 3, 2, 1.0); |
| encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(0u, 1ul), Pair(0u, 2ul), Pair(1u, 1ul), Pair(1u, 2ul), |
| Pair(2u, 1ul), Pair(2u, 2ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramEncoderZeroP) { |
| // When the noise parameter is equal to 0 then the output histogram is equal to the input |
| // histogram. |
| std::vector<std::pair<uint32_t, uint64_t>> histogram = {{2, 1}}; |
| auto encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 3, 2, 0.0); |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(2u, 1ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramEncoder) { |
| // With the constant seed used by all tests in this file and a small noise parameter, the output |
| // histogram has its counts clipped to the max count, but otherwise happens to be equal to the |
| // input histogram. |
| auto encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 20, 50, 0.0002); |
| std::vector<std::pair<uint32_t, uint64_t>> histogram = { |
| {2, 1}, {5, 10}, {6, 60}, {7, 1}, {15, 3}}; |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(2u, 1ul), Pair(5u, 10ul), Pair(6u, 50ul), Pair(7u, 1ul), |
| Pair(15u, 3ul))); |
| |
| // When the noise parameter is equal to 1 (i.e., all bits get flipped) then the output histogram |
| // is the inverse of the input histogram: a pair (index, count) with count > 0 appears in the |
| // output if and only if it did not appear in the input. |
| encoder = SinglePassTwoDimRapporHistogramEncoder(GetGenerator(), 3, 2, 1.0); |
| histogram = {{2, 1}}; |
| encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(0u, 1ul), Pair(0u, 2ul), Pair(1u, 1ul), Pair(1u, 2ul), |
| Pair(2u, 2ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, SinglePassTwoDimRapporHistogramSumEstimator) { |
| auto estimator = SinglePassTwoDimRapporHistogramSumEstimator(20, 50, 0.0002); |
| auto estimate = estimator.ComputeSum({{{2, 1}, {5, 10}, {6, 50}}, |
| {{2, 1}, {7, 1}, {15, 3}, {17, 34}}, |
| {{0, 1}, {2, 3}, {15, 6}, {19, 25}}}, |
| 3); |
| std::vector<double> expected_sums = {0.999, 0.000, 4.999, 0.000, 0.000, 9.998, 49.990, |
| 0.999, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, |
| 0.000, 8.998, 0.000, 33.993, 0.000, 24.995}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimBinomialHistogramEncoderAllZero) { |
| // Input is a histogram with all counts equal to zero. |
| std::vector<std::pair<uint32_t, uint64_t>> histogram; |
| |
| // With the constant seed used by all tests in this file and a small noise parameter, the output |
| // histogram happens to be equal to the input histogram. |
| auto encoder = TwoDimBinomialHistogramEncoder(GetGenerator(), 20, 50, 0.0002); |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_EQ(encoded.size(), 0u); |
| |
| // When the noise parameter is equal to 1 then the output histogram contains all pairs |
| // (bucket_index, count) where count > 0. |
| encoder = TwoDimBinomialHistogramEncoder(GetGenerator(), 3, 2, 1.0); |
| encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(0u, 1ul), Pair(0u, 2ul), Pair(1u, 1ul), Pair(1u, 2ul), |
| Pair(2u, 1ul), Pair(2u, 2ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimBinomialHistogramEncoderZeroP) { |
| // When the noise parameter is equal to 0 then the output histogram is equal to the input |
| // histogram. |
| std::vector<std::pair<uint32_t, uint64_t>> histogram = {{2, 1}}; |
| auto encoder = TwoDimBinomialHistogramEncoder(GetGenerator(), 3, 2, 0.0); |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(2u, 1ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimBinomialHistogramEncoder) { |
| // With the constant seed used by all tests in this file and a small noise parameter, the output |
| // histogram has its counts clipped to the max count, but otherwise happens to be equal to the |
| // input histogram. |
| auto encoder = TwoDimBinomialHistogramEncoder(GetGenerator(), 20, 50, 0.0002); |
| std::vector<std::pair<uint32_t, uint64_t>> histogram = { |
| {2, 1}, {5, 10}, {6, 60}, {7, 1}, {15, 3}}; |
| std::vector<std::pair<uint32_t, uint64_t>> encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, ElementsAre(Pair(2u, 1ul), Pair(5u, 10ul), Pair(6u, 50ul), Pair(7u, 1ul), |
| Pair(15u, 3ul))); |
| |
| // When the noise parameter is equal to 1, then the output histogram contains all pairs |
| // (bucket_index, count) where count > 0 in addition to the contents of the input histogram. (In |
| // particular, each pair in the input histogram appears twice in the output.) |
| encoder = TwoDimBinomialHistogramEncoder(GetGenerator(), 3, 2, 1.0); |
| histogram = {{2, 1}}; |
| encoded = encoder.Encode(histogram); |
| EXPECT_THAT(encoded, |
| UnorderedElementsAre(Pair(0u, 1ul), Pair(0u, 2ul), Pair(1u, 1ul), Pair(1u, 2ul), |
| Pair(2u, 1ul), Pair(2u, 1ul), Pair(2u, 2ul))); |
| } |
| |
| TEST_F(HistogramEncoderTest, TwoDimBinomialHistogramSumEstimator) { |
| auto estimator = TwoDimBinomialHistogramSumEstimator(20, 50, 0.0002); |
| auto estimate = estimator.ComputeSum({{{2, 1}, {5, 10}, {6, 50}}, |
| {{2, 1}, {7, 1}, {15, 3}, {17, 34}}, |
| {{0, 1}, {2, 3}, {15, 6}, {19, 25}}}, |
| 3); |
| std::vector<double> expected_sums = {0.999, 0.000, 4.997, 0.000, 0.000, 9.994, 49.970, |
| 0.999, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, |
| 0.000, 8.994, 0.000, 33.979, 0.000, 24.985}; |
| EXPECT_THAT(estimate, Pointwise(DoubleNear(0.001), expected_sums)); |
| } |
| |
| } // namespace cobalt |