[privacy] Support Farmhash Fingerprint 64 string hashes

Support Farmhash Fingerprint 64 string hashes in string counts report
privacy encoding.

Note that this change does not supports Farmhash Fingerprint 64 string hashes in unique device string counts report privacy encoding. It will be supported in a separate change.

Bug: b/321745113

TESTED=./cobaltb.py test

Change-Id: Ia55288e01ab2a6ea12f5af739fc4aa676394e623
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/1006918
Reviewed-by: Alexandre Zani <azani@google.com>
Commit-Queue: Anivia Li <aniviali@google.com>
Reviewed-by: Alex Pankhurst <pankhurst@google.com>
diff --git a/src/logger/BUILD.gn b/src/logger/BUILD.gn
index b5b52d8..1fd1368 100644
--- a/src/logger/BUILD.gn
+++ b/src/logger/BUILD.gn
@@ -456,6 +456,7 @@
     "$cobalt_root/src/lib/util:hash",
     "$cobalt_root/src/public/lib/statusor",
     "$cobalt_root/src/registry:cobalt_registry_proto",
+    "//third_party/googletest:gmock",
     "//third_party/googletest:gtest",
   ]
 }
diff --git a/src/logger/privacy_encoder.cc b/src/logger/privacy_encoder.cc
index de113b8..7426388 100644
--- a/src/logger/privacy_encoder.cc
+++ b/src/logger/privacy_encoder.cc
@@ -9,6 +9,8 @@
 #include "src/pb/observation.pb.h"
 #include "src/public/lib/statusor/status_macros.h"
 
+using google::protobuf::RepeatedPtrField;
+
 namespace cobalt::logger {
 namespace {
 
@@ -393,10 +395,15 @@
                                       string_histogram.event_codes().end());
     CB_ASSIGN_OR_RETURN(auto event_vector_index, EventVectorToIndex(event_codes, metric_def));
 
+    // TODO(b/322409910): Delete usage of legacy hash after clients no longer store them. Use legacy
+    // hashes if they're present. Use Farmhash Fingerprint 64 hashes otherwise.
+    const RepeatedPtrField<std::string> &string_hashes =
+        observation.string_histogram().string_hashes().empty()
+            ? observation.string_histogram().string_hashes_ff64()
+            : observation.string_histogram().string_hashes();
     CB_ASSIGN_OR_RETURN(
         CountMin<uint64_t> count_min,
-        MakeCountMinSketch(string_histogram, observation.string_histogram().string_hashes(),
-                           num_cells_per_hash, num_hashes));
+        MakeCountMinSketch(string_histogram, string_hashes, num_cells_per_hash, num_hashes));
 
     for (size_t cell_index = 0; cell_index < count_min.size(); ++cell_index) {
       CB_ASSIGN_OR_RETURN(uint64_t cell_value, count_min.GetCellValue(cell_index));
@@ -414,6 +421,8 @@
   return occurred_indices;
 }
 
+// TODO(b/321745113): Support Farmhash Fingerprint 64 string hashes in unique device string counts
+// report privacy encoding.
 lib::statusor::StatusOr<std::vector<uint64_t>>
 PrivacyEncoder::PrepareIndexVectorForUniqueDeviceStringCountsReport(
     const Observation &observation, const MetricDefinition &metric_def,
@@ -464,9 +473,8 @@
 }
 
 lib::statusor::StatusOr<CountMin<uint64_t>> PrivacyEncoder::MakeCountMinSketch(
-    const IndexHistogram &string_histogram,
-    const google::protobuf::RepeatedPtrField<std::string> &string_hashes, size_t num_cells_per_hash,
-    size_t num_hashes) {
+    const IndexHistogram &string_histogram, const RepeatedPtrField<std::string> &string_hashes,
+    size_t num_cells_per_hash, size_t num_hashes) {
   auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
   for (int i = 0; i < string_histogram.bucket_indices_size(); ++i) {
     const std::string &string_hash =
diff --git a/src/logger/privacy_encoder_test.cc b/src/logger/privacy_encoder_test.cc
index 992a280..7e104cb 100644
--- a/src/logger/privacy_encoder_test.cc
+++ b/src/logger/privacy_encoder_test.cc
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
 #include "src/algorithms/privacy/count_min.h"
@@ -104,6 +105,10 @@
   std::unique_ptr<PrivacyEncoder> privacy_encoder_;
 };
 
+namespace {
+
+using testing::UnorderedElementsAreArray;
+
 TEST_F(PrivacyEncoderTest, MaybeMakePrivateObservationsNoAddedPrivacyReport) {
   MetricDefinition metric_def;
   ReportDefinition report_def;
@@ -509,11 +514,26 @@
   EXPECT_EQ(indices.value(), expected_indices);
 }
 
+namespace {
+
+// Increments cell at index |index| in |sketch| by |count|.
+void IncrementCellBy(std::unordered_map<size_t, uint64_t> &sketch, size_t index, uint64_t count) {
+  auto iter = sketch.find(index);
+  if (iter == sketch.end()) {
+    sketch[index] = count;
+  } else {
+    iter->second += count;
+  }
+}
+
+}  // namespace
+
+// TODO(b/322409910): Delete this test after clients no longer store legacy string hash.
 TEST_F(PrivacyEncoderTest, StringCounts) {
   size_t num_cells_per_hash = 2;
   size_t num_hashes = 2;
   uint32_t max_event_code = 1;
-  uint32_t max_count = 2;
+  uint64_t max_count = 2;
   uint32_t num_index_points = max_count + 1;
 
   // |metric_def| has 2 valid event vectors.
@@ -552,34 +572,135 @@
 
   // The general formula for an expected index is:
   // (count + num_index_points * sketch cell index) * (num_event_vectors) + event_vector_index.
-  // Cells with count 0 are omitted.
   auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
-  std::vector<size_t> blobfs_indices =
-      count_min.GetCellIndices(string_histogram_obs->string_hashes(0));
-  std::vector<size_t> thinfs_indices =
-      count_min.GetCellIndices(string_histogram_obs->string_hashes(1));
+  std::unordered_map<size_t, uint64_t> sketch_0;
+  std::unordered_map<size_t, uint64_t> sketch_1;
 
-  std::vector<uint64_t> expected_private_indices;
-  expected_private_indices.reserve(6);
-
-  for (size_t index : blobfs_indices) {
-    // Expected private indices for a count of 1 for "blobfs" with event vector {0}
-    expected_private_indices.push_back((1 + num_index_points * index) * (max_event_code + 1) + 0);
+  // Increment cells for "blobfs" with event vector {0} by 1
+  for (const size_t index : count_min.GetCellIndices(string_histogram_obs->string_hashes(0))) {
+    IncrementCellBy(sketch_0, index, /*count=*/1);
+  }
+  for (const size_t index : count_min.GetCellIndices(string_histogram_obs->string_hashes(1))) {
+    // Increment cells for "thinfs" with event vector {0} by 2
+    IncrementCellBy(sketch_0, index, /*count=*/2);
+    // Increment cells for "thinfs" with event vector {1} by 2
+    IncrementCellBy(sketch_1, index, /*count=*/2);
   }
 
-  for (size_t index : thinfs_indices) {
-    // Expected private indices for a count of 2 for "thinfs" with event vector {0}
-    expected_private_indices.push_back((2 + num_index_points * index) * (max_event_code + 1) + 0);
-    // Expected private indices for a count of 2 for "thinfs" with event vector {1}
-    expected_private_indices.push_back((2 + num_index_points * index) * (max_event_code + 1) + 1);
+  std::vector<uint64_t> expected_private_indices;
+  for (const auto &[cell_index, cell_value] : sketch_0) {
+    if (cell_value == 0) {
+      continue;
+    }
+    const uint64_t clipped_count = std::clamp(cell_value, 0ul, max_count);
+
+    // Expected private indices for event vector {0}
+    expected_private_indices.push_back(
+        (clipped_count + num_index_points * cell_index) * (max_event_code + 1) + 0 /*event_code*/);
+  }
+
+  for (const auto &[cell_index, cell_value] : sketch_1) {
+    if (cell_value == 0) {
+      continue;
+    }
+    const uint64_t clipped_count = std::clamp(cell_value, 0ul, max_count);
+
+    // Expected private indices for event vector {1}
+    expected_private_indices.push_back(
+        (clipped_count + num_index_points * cell_index) * (max_event_code + 1) + 1 /*event_code*/);
   }
 
   CB_ASSERT_OK_AND_ASSIGN(std::vector<uint64_t> indices,
                           PrepareIndexVectorForStringCountsReport(
                               observation, metric_def, report_def, num_cells_per_hash, num_hashes));
-  std::sort(indices.begin(), indices.end());
-  std::sort(expected_private_indices.begin(), expected_private_indices.end());
-  EXPECT_EQ(indices, expected_private_indices);
+  EXPECT_THAT(indices, UnorderedElementsAreArray(expected_private_indices));
+}
+
+TEST_F(PrivacyEncoderTest, StringCountsFF64) {
+  size_t num_cells_per_hash = 2;
+  size_t num_hashes = 2;
+  uint32_t max_event_code = 1;
+  uint64_t max_count = 2;
+  uint32_t num_index_points = max_count + 1;
+
+  // |metric_def| has 2 valid event vectors.
+  MetricDefinition metric_def;
+  metric_def.set_metric_type(MetricDefinition::STRING);
+  MetricDefinition::MetricDimension *metric_dim = metric_def.add_metric_dimensions();
+  metric_dim->set_dimension("dimension 0");
+  metric_dim->set_max_event_code(max_event_code);
+
+  // |report_def| has 3 valid count values: {0, 1, 2}.
+  ReportDefinition report_def;
+  report_def.set_max_count(max_count);
+  report_def.set_num_index_points(num_index_points);
+  report_def.set_string_buffer_max(10);
+
+  // Prepare a StringHistogramObservation with 2 IndexHistograms:
+  // - with event vector {0}: ("blobfs", count = 1), ("thinfs", count = 2)
+  // - with event vector {1}: ("thinfs", count = 2)
+  Observation observation;
+  StringHistogramObservation *string_histogram_obs = observation.mutable_string_histogram();
+
+  string_histogram_obs->add_string_hashes_ff64(util::FarmhashFingerprint64("blobfs"));
+  string_histogram_obs->add_string_hashes_ff64(util::FarmhashFingerprint64("thinfs"));
+
+  IndexHistogram *histogram_1 = string_histogram_obs->add_string_histograms();
+  histogram_1->add_event_codes(0u);
+  histogram_1->add_bucket_indices(0u);
+  histogram_1->add_bucket_counts(1u);
+  histogram_1->add_bucket_indices(1u);
+  histogram_1->add_bucket_counts(2u);
+
+  IndexHistogram *histogram_2 = string_histogram_obs->add_string_histograms();
+  histogram_2->add_event_codes(1u);
+  histogram_2->add_bucket_indices(1u);
+  histogram_2->add_bucket_counts(2u);
+
+  // The general formula for an expected index is:
+  // (count + num_index_points * sketch cell index) * (num_event_vectors) + event_vector_index
+  auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+  std::unordered_map<size_t, uint64_t> sketch_0;
+  std::unordered_map<size_t, uint64_t> sketch_1;
+
+  // Increment cells for "blobfs" with event vector {0} by 1
+  for (const size_t index : count_min.GetCellIndices(string_histogram_obs->string_hashes_ff64(0))) {
+    IncrementCellBy(sketch_0, index, /*count=*/1);
+  }
+  for (const size_t index : count_min.GetCellIndices(string_histogram_obs->string_hashes_ff64(1))) {
+    // Increment cells for "thinfs" with event vector {0} by 2
+    IncrementCellBy(sketch_0, index, /*count=*/2);
+    // Increment cells for "thinfs" with event vector {1} by 2
+    IncrementCellBy(sketch_1, index, /*count=*/2);
+  }
+
+  std::vector<uint64_t> expected_private_indices;
+  for (const auto &[cell_index, cell_value] : sketch_0) {
+    if (cell_value == 0) {
+      continue;
+    }
+    const uint64_t clipped_count = std::clamp(cell_value, 0ul, max_count);
+
+    // Expected private indices for event vector {0}
+    expected_private_indices.push_back(
+        (clipped_count + num_index_points * cell_index) * (max_event_code + 1) + 0 /*event_code*/);
+  }
+
+  for (const auto &[cell_index, cell_value] : sketch_1) {
+    if (cell_value == 0) {
+      continue;
+    }
+    uint64_t clipped_count = std::clamp(cell_value, 0ul, max_count);
+
+    // Expected private indices for event vector {1}
+    expected_private_indices.push_back(
+        (clipped_count + num_index_points * cell_index) * (max_event_code + 1) + 1 /*event_code*/);
+  }
+
+  CB_ASSERT_OK_AND_ASSIGN(std::vector<uint64_t> indices,
+                          PrepareIndexVectorForStringCountsReport(
+                              observation, metric_def, report_def, num_cells_per_hash, num_hashes));
+  EXPECT_THAT(indices, UnorderedElementsAreArray(expected_private_indices));
 }
 
 // Checks that the `max_count` bound of a StringCounts report is enforced on each
@@ -1071,4 +1192,5 @@
   EXPECT_EQ(ClipCount(count, report_def), max_count);
 }
 
+}  // namespace
 }  // namespace cobalt::logger