[Cobalt 1.1 privacy] Clip CountMin sketch cell counts to max_count

Previously, when encoding observations for StringCounts reports,
we clipped the count for each individual string to the report's
max_count before incrementing the CountMin sketch. However, when
two different strings hashed to the same sketch cell, this could
result in a cell count greater than max_count. This broke an
assumption of the method used to encode the CountMin sketch,
and could in some cases allow a device to contribute > max_count
for a single string.

After this change, the PrivacyEncoder clips each sketch cell
count to max_count after the sketch is fully formed rather than
incrementally.

Change-Id: I54c1a8a482de39d4f8b994ce8ffe87837ae1ff18
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/507059
Fuchsia-Auto-Submit: Laura Peskin <pesk@google.com>
Reviewed-by: Alexandre Zani <azani@google.com>
Commit-Queue: Laura Peskin <pesk@google.com>
diff --git a/src/logger/privacy_encoder.cc b/src/logger/privacy_encoder.cc
index c480d1f..88e18c0 100644
--- a/src/logger/privacy_encoder.cc
+++ b/src/logger/privacy_encoder.cc
@@ -371,17 +371,17 @@
 
     auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
     for (int i = 0; i < string_histogram.bucket_indices_size(); ++i) {
-      uint64_t clipped_count = ClipCount(string_histogram.bucket_counts(i), report_def);
       std::string string_hash =
           observation.string_histogram().string_hashes(string_histogram.bucket_indices(i));
-      count_min.Increment(string_hash, clipped_count);
+      count_min.Increment(string_hash, string_histogram.bucket_counts(i));
     }
 
     for (size_t cell_index = 0; cell_index < count_min.size(); ++cell_index) {
       CB_ASSIGN_OR_RETURN(uint64_t cell_value, count_min.GetCellValue(cell_index));
       if (cell_value > 0) {
+        uint64_t clipped_count = ClipCount(cell_value, report_def);
         uint64_t count_min_index =
-            HistogramBucketAndCountToIndex(cell_index, cell_value, report_def.max_count(),
+            HistogramBucketAndCountToIndex(cell_index, clipped_count, report_def.max_count(),
                                            report_def.num_index_points(), gen_.get());
         occurred_indices.push_back(ValueAndEventVectorIndicesToIndex(
             count_min_index, event_vector_index,
diff --git a/src/logger/privacy_encoder_test.cc b/src/logger/privacy_encoder_test.cc
index 5b901f6..e02eb20 100644
--- a/src/logger/privacy_encoder_test.cc
+++ b/src/logger/privacy_encoder_test.cc
@@ -512,6 +512,61 @@
   EXPECT_EQ(indices, expected_private_indices);
 }
 
+// Checks that the `max_count` bound of a StringCounts report is enforced on each
+// cell of the CountMin sketch before encoding. In particular, if two strings hash
+// to the same cell of the sketch, then the total count for that sketch cell must
+// be clipped to `max_count`.
+TEST_F(PrivacyEncoderTest, StringCountsEnforceMaxCount) {
+  // Use a 1x1 CountMin sketch so that all strings hash to the same cell.
+  size_t num_cells_per_hash = 1;
+  size_t num_hashes = 1;
+  uint32_t max_count = 2;
+  uint32_t num_index_points = max_count + 1;
+  uint32_t max_event_code = 1;
+
+  // |metric_def| has 2 valid event vectors.
+  MetricDefinition metric_def;
+  metric_def.set_metric_type(MetricDefinition::STRING);
+  metric_def.set_string_buffer_max(10);
+  auto metric_dim = metric_def.add_metric_dimensions();
+  metric_dim->set_dimension("dimension 0");
+  metric_dim->set_max_event_code(max_event_code);
+
+  // |report_def| has 3 valid count values: {0, 1, 2}.
+  ReportDefinition report_def;
+  report_def.set_max_count(max_count);
+  report_def.set_num_index_points(num_index_points);
+
+  // Prepare a StringHistogramObservation with 1 IndexHistogram containing two string counts
+  // such that the sum of the counts is greater than `max_count`.
+  //
+  // The sum of the counts should be clipped to `max_count` when the CountMin sketch is prepared.
+  Observation observation;
+  StringHistogramObservation *string_histogram_obs = observation.mutable_string_histogram();
+
+  string_histogram_obs->add_string_hashes(util::FarmhashFingerprint("blobfs"));
+  string_histogram_obs->add_string_hashes(util::FarmhashFingerprint("minfs"));
+
+  IndexHistogram *histogram = string_histogram_obs->add_string_histograms();
+  histogram->add_event_codes(0u);
+  histogram->add_bucket_indices(0u);
+  histogram->add_bucket_counts(max_count);
+  histogram->add_bucket_indices(1u);
+  histogram->add_bucket_counts(1u);
+
+  // The general formula for an expected index is:
+  // (count + num_index_points * sketch cell index) * (num_event_vectors) + event_vector_index.
+  //
+  // The expected index is of this form with count = `max_count`, sketch cell index = 0, and
+  // event_vector_index = 0.
+  uint64_t expected_index = max_count * (max_event_code + 1);
+
+  CB_ASSERT_OK_AND_ASSIGN(std::vector<uint64_t> indices,
+                          PrepareIndexVectorForStringCountsReport(
+                              observation, metric_def, report_def, num_cells_per_hash, num_hashes));
+  EXPECT_EQ(indices, std::vector<uint64_t>({expected_index}));
+}
+
 TEST_F(PrivacyEncoderTest, ObservationsFromIndicesNoIndices) {
   std::vector<uint64_t> indices;
   auto observations = ObservationsFromIndices(indices);