[encoder] Support FF64 in String Histogram Observation encoding

Added a boolean parameter to String Histogram Observation encoding to
support the soft migration to Farmhash Fingerprint 64.

Note that this change does not encode any observations with FF64 string hashes; it only adds support for doing so.

Bug: 321745113
Tested: ./cobaltb.py test
Change-Id: I0176a7f85df73f64202a8cf3e00e435c53d258a2
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/1011999
Reviewed-by: Cameron Dale <camrdale@google.com>
Commit-Queue: Anivia Li <aniviali@google.com>
Reviewed-by: Alex Pankhurst <pankhurst@google.com>
diff --git a/src/local_aggregation/aggregation_procedures/at_least_once_string_aggregation_procedure.cc b/src/local_aggregation/aggregation_procedures/at_least_once_string_aggregation_procedure.cc
index 28fc6f4..c7a5aca 100644
--- a/src/local_aggregation/aggregation_procedures/at_least_once_string_aggregation_procedure.cc
+++ b/src/local_aggregation/aggregation_procedures/at_least_once_string_aggregation_procedure.cc
@@ -92,6 +92,10 @@
   // seen hashes is a mapping from a string hash to it's hash index, which correlates to the index
   // of string hashes in the hashes vector above.
   std::map<std::string, uint32_t> seen_hashes;
+
+  // TODO(https://fxbug.dev/321745113): Support Farmhash Fingerprint 64 string hashes once they are
+  // supported in local aggregation. If a multi-day report has a mix of legacy and ff64 hashes
+  // across multiple days, drop the legacy hashes.
   for (const AggregateDataToGenerate &bucket : buckets) {
     for (const EventCodesAggregateData &aggregate_data : bucket.aggregate_data) {
       std::vector<uint32_t> event_vector(aggregate_data.event_codes().begin(),
@@ -139,7 +143,7 @@
     return {nullptr};
   }
 
-  return logger::encoder::EncodeStringHistogramObservation(hashes, data);
+  return logger::encoder::EncodeStringHistogramObservation(hashes, data, /*use_legacy_hash=*/true);
 }
 
 void AtLeastOnceStringAggregationProcedure::ObservationsCommitted(
diff --git a/src/local_aggregation/aggregation_procedures/string_histogram_aggregation_procedure.cc b/src/local_aggregation/aggregation_procedures/string_histogram_aggregation_procedure.cc
index 27176b6..4a53cb8 100644
--- a/src/local_aggregation/aggregation_procedures/string_histogram_aggregation_procedure.cc
+++ b/src/local_aggregation/aggregation_procedures/string_histogram_aggregation_procedure.cc
@@ -75,7 +75,9 @@
     hashes.push_back(hash);
   }
 
-  return logger::encoder::EncodeStringHistogramObservation(hashes, data);
+  // TODO(https://fxbug.dev/321745113): Support Farmhash Fingerprint 64 string hashes once they are
+  // supported in local aggregation.
+  return logger::encoder::EncodeStringHistogramObservation(hashes, data, /*use_legacy_hash=*/true);
 }
 
 std::string StringHistogramAggregationProcedure::DebugString() const { return "STRING_HISTOGRAM"; }
diff --git a/src/logger/encoder.cc b/src/logger/encoder.cc
index 68d28dd..410f233 100644
--- a/src/logger/encoder.cc
+++ b/src/logger/encoder.cc
@@ -73,14 +73,18 @@
 
 lib::statusor::StatusOr<std::unique_ptr<Observation>> EncodeStringHistogramObservation(
     const std::vector<std::string>& hashes,
-    const std::vector<std::tuple<EventCodes, Histogram>>& data) {
+    const std::vector<std::tuple<EventCodes, Histogram>>& data, bool use_legacy_hash) {
   auto observation = std::make_unique<Observation>();
 
   StringHistogramObservation* string_histogram_observation =
       observation->mutable_string_histogram();
 
   for (const std::string& hash : hashes) {
-    string_histogram_observation->add_string_hashes(hash);
+    if (use_legacy_hash) {
+      string_histogram_observation->add_string_hashes(hash);
+    } else {
+      string_histogram_observation->add_string_hashes_ff64(hash);
+    }
   }
 
   for (const auto& [event_codes, histogram] : data) {
diff --git a/src/logger/encoder.h b/src/logger/encoder.h
index 1f99f1c..530bdfa 100644
--- a/src/logger/encoder.h
+++ b/src/logger/encoder.h
@@ -49,9 +49,15 @@
 // data: A vector of (event_codes, histogram vector of (index, count)) pairs that will be used to
 // encode the Observation. The bucket with index i in the histogram contains the count for the
 // string whose hash is in position i in the hashes vector.
+// use_legacy_hash: a boolean indicating if legacy hash function (Farmhash Fingerprint 128) is used
+// to produce the string hashes.
+//
+// TODO(https://fxbug.dev/322409910): Delete usage of legacy hash after clients stop sending the
+// field.
 [[nodiscard]] lib::statusor::StatusOr<std::unique_ptr<Observation>>
 EncodeStringHistogramObservation(const std::vector<std::string>& hashes,
-                                 const std::vector<std::tuple<EventCodes, Histogram>>& data);
+                                 const std::vector<std::tuple<EventCodes, Histogram>>& data,
+                                 bool use_legacy_hash);
 
 }  // namespace cobalt::logger::encoder
 
diff --git a/src/logger/encoder_test.cc b/src/logger/encoder_test.cc
index bbf3228..7ed882f 100644
--- a/src/logger/encoder_test.cc
+++ b/src/logger/encoder_test.cc
@@ -105,13 +105,15 @@
   }
 }
 
-TEST(Encoder, EncodeStringHistogramObservationSingleValue) {
+// TODO(https://fxbug.dev/322409910): Delete this test after clients stop using legacy hash.
+TEST(Encoder, EncodeStringHistogramObservationSingleValueLegacy) {
   StatusOr<std::unique_ptr<Observation>> obs_or =
-      EncodeStringHistogramObservation({"4"}, {{{1, 2, 3}, {{4, 5}}}});
+      EncodeStringHistogramObservation({"4"}, {{{1, 2, 3}, {{4, 5}}}}, /*use_legacy_hash=*/true);
   ASSERT_EQ(obs_or.status(), Status::OkStatus());
   std::unique_ptr<Observation> obs = std::move(obs_or.value());
   ASSERT_TRUE(obs->has_string_histogram());
   ASSERT_EQ(obs->string_histogram().string_hashes_size(), 1);
+  ASSERT_EQ(obs->string_histogram().string_hashes_ff64_size(), 0);
   ASSERT_EQ(obs->string_histogram().string_hashes(0), "4");
   ASSERT_EQ(obs->string_histogram().string_histograms_size(), 1);
   EXPECT_EQ(obs->string_histogram().string_histograms(0).event_codes_size(), 3);
@@ -121,17 +123,37 @@
   EXPECT_EQ(obs->string_histogram().string_histograms(0).bucket_counts(0), 5);
 }
 
-TEST(Encoder, EncodeStringHistogramObservationMultipleValues) {
+TEST(Encoder, EncodeStringHistogramObservationSingleValueFF64) {
+  StatusOr<std::unique_ptr<Observation>> obs_or =
+      EncodeStringHistogramObservation({"4"}, {{{1, 2, 3}, {{4, 5}}}}, /*use_legacy_hash=*/false);
+  ASSERT_EQ(obs_or.status(), Status::OkStatus());
+  std::unique_ptr<Observation> obs = std::move(obs_or.value());
+  ASSERT_TRUE(obs->has_string_histogram());
+  ASSERT_EQ(obs->string_histogram().string_hashes_size(), 0);
+  ASSERT_EQ(obs->string_histogram().string_hashes_ff64_size(), 1);
+  ASSERT_EQ(obs->string_histogram().string_hashes_ff64(0), "4");
+  ASSERT_EQ(obs->string_histogram().string_histograms_size(), 1);
+  EXPECT_EQ(obs->string_histogram().string_histograms(0).event_codes_size(), 3);
+  ASSERT_EQ(obs->string_histogram().string_histograms(0).bucket_indices_size(), 1);
+  EXPECT_EQ(obs->string_histogram().string_histograms(0).bucket_indices(0), 4u);
+  ASSERT_EQ(obs->string_histogram().string_histograms(0).bucket_counts_size(), 1);
+  EXPECT_EQ(obs->string_histogram().string_histograms(0).bucket_counts(0), 5);
+}
+
+// TODO(https://fxbug.dev/322409910): Delete this test after clients stop using legacy hash.
+TEST(Encoder, EncodeStringHistogramObservationMultipleValuesLegacy) {
   StatusOr<std::unique_ptr<Observation>> obs_or =
       EncodeStringHistogramObservation({"1", "2", "3", "4", "5", "6", "7", "8", "10"},
                                        {{{}, {{1, 2}}},
                                         {{1}, {{2, 3}, {4, 5}}},
                                         {{1, 2}, {{3, 4}, {5, 6}, {7, 8}}},
-                                        {{1, 2, 3}, {{4, 5}, {6, 7}, {8, 9}, {10, 11}}}});
+                                        {{1, 2, 3}, {{4, 5}, {6, 7}, {8, 9}, {10, 11}}}},
+                                       /*use_legacy_hash=*/true);
   ASSERT_EQ(obs_or.status(), Status::OkStatus());
   std::unique_ptr<Observation> obs = std::move(obs_or.value());
   ASSERT_TRUE(obs->has_string_histogram());
   ASSERT_EQ(obs->string_histogram().string_hashes_size(), 9);
+  ASSERT_EQ(obs->string_histogram().string_hashes_ff64_size(), 0);
   ASSERT_EQ(obs->string_histogram().string_hashes(0), "1");
   ASSERT_EQ(obs->string_histogram().string_histograms_size(), 4);
   for (int i = 0; i < 4; i++) {
@@ -146,4 +168,31 @@
   }
 }
 
+TEST(Encoder, EncodeStringHistogramObservationMultipleValuesFF64) {
+  StatusOr<std::unique_ptr<Observation>> obs_or =
+      EncodeStringHistogramObservation({"1", "2", "3", "4", "5", "6", "7", "8", "10"},
+                                       {{{}, {{1, 2}}},
+                                        {{1}, {{2, 3}, {4, 5}}},
+                                        {{1, 2}, {{3, 4}, {5, 6}, {7, 8}}},
+                                        {{1, 2, 3}, {{4, 5}, {6, 7}, {8, 9}, {10, 11}}}},
+                                       /*use_legacy_hash=*/false);
+  ASSERT_EQ(obs_or.status(), Status::OkStatus());
+  std::unique_ptr<Observation> obs = std::move(obs_or.value());
+  ASSERT_TRUE(obs->has_string_histogram());
+  ASSERT_EQ(obs->string_histogram().string_hashes_size(), 0);
+  ASSERT_EQ(obs->string_histogram().string_hashes_ff64_size(), 9);
+  ASSERT_EQ(obs->string_histogram().string_hashes_ff64(0), "1");
+  ASSERT_EQ(obs->string_histogram().string_histograms_size(), 4);
+  for (int i = 0; i < 4; i++) {
+    EXPECT_EQ(obs->string_histogram().string_histograms(i).event_codes_size(), i);
+    ASSERT_EQ(obs->string_histogram().string_histograms(i).bucket_indices_size(), i + 1);
+    ASSERT_EQ(obs->string_histogram().string_histograms(i).bucket_counts_size(), i + 1);
+    for (int j = 0; j < i + 1; j++) {
+      EXPECT_EQ(obs->string_histogram().string_histograms(i).bucket_indices(j),
+                static_cast<uint32_t>(i + j * 2 + 1));
+      EXPECT_EQ(obs->string_histogram().string_histograms(i).bucket_counts(j), i + j * 2 + 2);
+    }
+  }
+}
+
 }  // namespace cobalt::logger::encoder