[Cobalt 1.1 privacy] Implement MakePrivateObservations for StringCounts

Uses the CountMin library to encode StringHistogramObservations for
StringCounts reports with added privacy.

Co-authored-by: azani@google.com
Co-authored-by: pesk@google.com

Change-Id: I6ae5eee56d129340d2c018bfa327dfe5338732df
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/464474
Reviewed-by: Alexandre Zani <azani@google.com>
Fuchsia-Auto-Submit: Alexandre Zani <azani@google.com>
Commit-Queue: Auto-Submit <auto-submit@fuchsia-infra.iam.gserviceaccount.com>
diff --git a/src/logger/BUILD.gn b/src/logger/BUILD.gn
index 487dbdd..cd44307 100644
--- a/src/logger/BUILD.gn
+++ b/src/logger/BUILD.gn
@@ -456,6 +456,7 @@
 
   public_deps = [
     ":event_vector_index",
+    "$cobalt_root/src/algorithms/privacy:count_min",
     "$cobalt_root/src/algorithms/privacy:numeric_encoding",
     "$cobalt_root/src/algorithms/privacy:rappor",
     "$cobalt_root/src/algorithms/random:random",
@@ -473,7 +474,10 @@
 
   deps = [
     ":privacy_encoder",
+    "$cobalt_root/src/algorithms/privacy:count_min",
     "$cobalt_root/src/algorithms/random:test_secure_random",
+    "$cobalt_root/src/lib/statusor",
+    "$cobalt_root/src/lib/util:hash",
     "$cobalt_root/src/registry:cobalt_registry_proto",
     "//third_party/googletest:gtest",
   ]
diff --git a/src/logger/privacy_encoder.cc b/src/logger/privacy_encoder.cc
index df061f5..c480d1f 100644
--- a/src/logger/privacy_encoder.cc
+++ b/src/logger/privacy_encoder.cc
@@ -1,5 +1,6 @@
 #include "src/logger/privacy_encoder.h"
 
+#include "src/algorithms/privacy/count_min.h"
 #include "src/algorithms/privacy/numeric_encoding.h"
 #include "src/algorithms/privacy/rappor.h"
 #include "src/algorithms/random/random.h"
@@ -11,6 +12,10 @@
 namespace cobalt::logger {
 namespace {
 
+// The dimensions of a CountMin sketch for a report of type StringCounts.
+const size_t kNumCountMinCellsPerHash = 10;
+const size_t kNumCountMinHashes = 5;
+
 // Returns the number of histogram buckets associated to an IntegerBuckets, including the underflow
 // and overflow buckets.
 lib::statusor::StatusOr<uint32_t> GetNumIntegerBuckets(const IntegerBuckets &int_buckets) {
@@ -117,12 +122,34 @@
               report_def.num_index_points()) -
              1;
     }
-
+    case ReportDefinition::STRING_COUNTS: {
+      CB_ASSIGN_OR_RETURN(size_t num_cells_per_hash, GetNumCountMinCellsPerHash(report_def));
+      CB_ASSIGN_OR_RETURN(size_t num_hashes, GetNumCountMinHashes(report_def));
+      return (num_cells_per_hash * num_hashes * GetNumEventVectors(metric_def.metric_dimensions()) *
+              report_def.num_index_points()) -
+             1;
+    }
     default:
       return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
   }
 }
 
+lib::statusor::StatusOr<size_t> PrivacyEncoder::GetNumCountMinCellsPerHash(
+    const ReportDefinition &report_def) {
+  if (report_def.report_type() != ReportDefinition::STRING_COUNTS) {
+    return util::Status(util::INVALID_ARGUMENT, "report must have type StringCounts");
+  }
+  return kNumCountMinCellsPerHash;
+}
+
+lib::statusor::StatusOr<size_t> PrivacyEncoder::GetNumCountMinHashes(
+    const ReportDefinition &report_def) {
+  if (report_def.report_type() != ReportDefinition::STRING_COUNTS) {
+    return util::Status(util::INVALID_ARGUMENT, "report must have type StringCounts");
+  }
+  return kNumCountMinHashes;
+}
+
 lib::statusor::StatusOr<std::vector<uint64_t>> PrivacyEncoder::PrepareIndexVector(
     const Observation &observation, const MetricDefinition &metric_def,
     const ReportDefinition &report_def) {
@@ -155,6 +182,14 @@
                                        observation, metric_def, report_def));
       break;
     }
+    case ReportDefinition::STRING_COUNTS: {
+      CB_ASSIGN_OR_RETURN(size_t num_cells_per_hash, GetNumCountMinCellsPerHash(report_def));
+      CB_ASSIGN_OR_RETURN(size_t num_hashes, GetNumCountMinHashes(report_def));
+      CB_ASSIGN_OR_RETURN(
+          indices, PrepareIndexVectorForStringCountsReport(observation, metric_def, report_def,
+                                                           num_cells_per_hash, num_hashes));
+      break;
+    }
 
     default:
       return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
@@ -316,6 +351,47 @@
   return occurred_indices;
 }
 
+lib::statusor::StatusOr<std::vector<uint64_t>>
+PrivacyEncoder::PrepareIndexVectorForStringCountsReport(const Observation &observation,
+                                                        const MetricDefinition &metric_def,
+                                                        const ReportDefinition &report_def,
+                                                        size_t num_cells_per_hash,
+                                                        size_t num_hashes) {
+  if (!observation.has_string_histogram()) {
+    return util::Status(util::INVALID_ARGUMENT,
+                        "observation type is not StringHistogramObservation.");
+  }
+
+  std::vector<uint64_t> occurred_indices;
+
+  for (const auto &string_histogram : observation.string_histogram().string_histograms()) {
+    std::vector<uint32_t> event_codes(string_histogram.event_codes().begin(),
+                                      string_histogram.event_codes().end());
+    CB_ASSIGN_OR_RETURN(auto event_vector_index, EventVectorToIndex(event_codes, metric_def));
+
+    auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+    for (int i = 0; i < string_histogram.bucket_indices_size(); ++i) {
+      uint64_t clipped_count = ClipCount(string_histogram.bucket_counts(i), report_def);
+      std::string string_hash =
+          observation.string_histogram().string_hashes(string_histogram.bucket_indices(i));
+      count_min.Increment(string_hash, clipped_count);
+    }
+
+    for (size_t cell_index = 0; cell_index < count_min.size(); ++cell_index) {
+      CB_ASSIGN_OR_RETURN(uint64_t cell_value, count_min.GetCellValue(cell_index));
+      if (cell_value > 0) {
+        uint64_t count_min_index =
+            HistogramBucketAndCountToIndex(cell_index, cell_value, report_def.max_count(),
+                                           report_def.num_index_points(), gen_.get());
+        occurred_indices.push_back(ValueAndEventVectorIndicesToIndex(
+            count_min_index, event_vector_index,
+            GetNumEventVectors(metric_def.metric_dimensions()) - 1));
+      }
+    }
+  }
+  return occurred_indices;
+}
+
 int64_t PrivacyEncoder::ClipValue(int64_t value, const ReportDefinition &report_def) {
   if (value > report_def.max_value()) {
     return report_def.max_value();
diff --git a/src/logger/privacy_encoder.h b/src/logger/privacy_encoder.h
index b165bd0..ec95a92 100644
--- a/src/logger/privacy_encoder.h
+++ b/src/logger/privacy_encoder.h
@@ -5,6 +5,7 @@
 #ifndef COBALT_SRC_LOGGER_PRIVACY_ENCODER_H_
 #define COBALT_SRC_LOGGER_PRIVACY_ENCODER_H_
 
+#include "src/algorithms/privacy/count_min.h"
 #include "src/algorithms/random/random.h"
 #include "src/lib/statusor/statusor.h"
 #include "src/pb/observation.pb.h"
@@ -49,6 +50,16 @@
   static lib::statusor::StatusOr<uint64_t> MaxIndexForReport(const MetricDefinition &metric_def,
                                                              const ReportDefinition &report_def);
 
+  // The number of sketch cells per hash function in CountMin sketch for a StringCounts report
+  // |report_def|. Currently hard-coded to 10. Returns an error status if |report_def| is not of
+  // type StringCounts.
+  static lib::statusor::StatusOr<size_t> GetNumCountMinCellsPerHash(
+      const ReportDefinition &report_def);
+
+  // The number of hash functions in a CountMin sketch for a StringCounts report |report_def|.
+  // Currently hard-coded to 5. Returns an error status if |report_def| is not of type StringCounts.
+  static lib::statusor::StatusOr<size_t> GetNumCountMinHashes(const ReportDefinition &report_def);
+
  private:
   friend class PrivacyEncoderTest;
 
@@ -80,6 +91,10 @@
       const Observation &observation, const MetricDefinition &metric_def,
       const ReportDefinition &report_def);
 
+  lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForStringCountsReport(
+      const Observation &observation, const MetricDefinition &metric_def,
+      const ReportDefinition &report_def, size_t num_cells_per_hash, size_t num_hashes);
+
   static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
       const std::vector<uint64_t> &indices);
 
diff --git a/src/logger/privacy_encoder_test.cc b/src/logger/privacy_encoder_test.cc
index 7ccd449..5b901f6 100644
--- a/src/logger/privacy_encoder_test.cc
+++ b/src/logger/privacy_encoder_test.cc
@@ -1,9 +1,14 @@
 #include "src/logger/privacy_encoder.h"
 
+#include <algorithm>
+
 #include <gtest/gtest.h>
 
+#include "src/algorithms/privacy/count_min.h"
 #include "src/algorithms/random/test_secure_random.h"
+#include "src/lib/statusor/status_macros.h"
 #include "src/lib/statusor/statusor.h"
+#include "src/lib/util/hash.h"
 #include "src/pb/observation.pb.h"
 #include "src/registry/metric_definition.pb.h"
 #include "src/registry/report_definition.pb.h"
@@ -53,6 +58,13 @@
                                                                             report_def);
   }
 
+  lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForStringCountsReport(
+      const Observation &observation, const MetricDefinition &metric_def,
+      const ReportDefinition &report_def, size_t num_cells_per_hash, size_t num_hashes) {
+    return privacy_encoder_->PrepareIndexVectorForStringCountsReport(
+        observation, metric_def, report_def, num_cells_per_hash, num_hashes);
+  }
+
   static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
       const std::vector<uint64_t> &indices) {
     return PrivacyEncoder::ObservationsFromIndices(indices);
@@ -127,7 +139,8 @@
       ReportDefinition::FLEETWIDE_MEANS,
       ReportDefinition::HOURLY_VALUE_HISTOGRAMS,
       ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS,
-      ReportDefinition::FLEETWIDE_HISTOGRAMS};
+      ReportDefinition::FLEETWIDE_HISTOGRAMS,
+      ReportDefinition::STRING_COUNTS};
 
   MetricDefinition metric_def;
   ReportDefinition report_def;
@@ -141,24 +154,6 @@
   }
 }
 
-// MakePrivateObservations() is not implemented yet for these report types. Move report types to the
-// MakePrivateObservationsImplemented test as they are implemented.
-TEST_F(PrivacyEncoderTest, MakePrivateObservationsUnimplemented) {
-  std::vector<ReportDefinition::ReportType> unimplemented_report_types = {
-      ReportDefinition::STRING_COUNTS};
-
-  MetricDefinition metric_def;
-  ReportDefinition report_def;
-  report_def.set_privacy_level(ReportDefinition::LOW_PRIVACY);
-
-  Observation observation;
-  for (const ReportDefinition::ReportType report_type : unimplemented_report_types) {
-    report_def.set_report_type(report_type);
-    EXPECT_EQ(MakePrivateObservations(&observation, metric_def, report_def).status().error_code(),
-              util::UNIMPLEMENTED);
-  }
-}
-
 TEST_F(PrivacyEncoderTest, MakePrivateObservationsNullObservation) {
   MetricDefinition metric_def;
   ReportDefinition report_def;
@@ -444,6 +439,79 @@
   EXPECT_EQ(indices.ValueOrDie(), expected_indices);
 }
 
+TEST_F(PrivacyEncoderTest, StringCounts) {
+  size_t num_cells_per_hash = 2;
+  size_t num_hashes = 2;
+  uint32_t max_event_code = 1;
+  uint32_t max_count = 2;
+  uint32_t num_index_points = max_count + 1;
+
+  // |metric_def| has 2 valid event vectors.
+  MetricDefinition metric_def;
+  metric_def.set_metric_type(MetricDefinition::STRING);
+  metric_def.set_string_buffer_max(10);
+  auto metric_dim = metric_def.add_metric_dimensions();
+  metric_dim->set_dimension("dimension 0");
+  metric_dim->set_max_event_code(max_event_code);
+
+  // |report_def| has 3 valid count values: {0, 1, 2}.
+  ReportDefinition report_def;
+  report_def.set_max_count(max_count);
+  report_def.set_num_index_points(num_index_points);
+
+  // Prepare a StringHistogramObservation with 2 IndexHistograms:
+  // - with event vector {0}: ("blobfs", count = 1), ("thinfs", count = 2)
+  // - with event vector {1}: ("thinfs", count = 2)
+  Observation observation;
+  StringHistogramObservation *string_histogram_obs = observation.mutable_string_histogram();
+
+  string_histogram_obs->add_string_hashes(util::FarmhashFingerprint("blobfs"));
+  string_histogram_obs->add_string_hashes(util::FarmhashFingerprint("thinfs"));
+
+  IndexHistogram *histogram_1 = string_histogram_obs->add_string_histograms();
+  histogram_1->add_event_codes(0u);
+  histogram_1->add_bucket_indices(0u);
+  histogram_1->add_bucket_counts(1u);
+  histogram_1->add_bucket_indices(1u);
+  histogram_1->add_bucket_counts(2u);
+
+  IndexHistogram *histogram_2 = string_histogram_obs->add_string_histograms();
+  histogram_2->add_event_codes(1u);
+  histogram_2->add_bucket_indices(1u);
+  histogram_2->add_bucket_counts(2u);
+
+  // The general formula for an expected index is:
+  // (count + num_index_points * sketch cell index) * (num_event_vectors) + event_vector_index.
+  // Cells with count 0 are omitted.
+  auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+  std::vector<size_t> blobfs_indices =
+      count_min.GetCellIndices(string_histogram_obs->string_hashes(0));
+  std::vector<size_t> thinfs_indices =
+      count_min.GetCellIndices(string_histogram_obs->string_hashes(1));
+
+  std::vector<uint64_t> expected_private_indices;
+  expected_private_indices.reserve(6);
+
+  for (size_t index : blobfs_indices) {
+    // Expected private indices for a count of 1 for "blobfs" with event vector {0}
+    expected_private_indices.push_back((1 + num_index_points * index) * (max_event_code + 1) + 0);
+  }
+
+  for (size_t index : thinfs_indices) {
+    // Expected private indices for a count of 2 for "thinfs" with event vector {0}
+    expected_private_indices.push_back((2 + num_index_points * index) * (max_event_code + 1) + 0);
+    // Expected private indices for a count of 2 for "thinfs" with event vector {1}
+    expected_private_indices.push_back((2 + num_index_points * index) * (max_event_code + 1) + 1);
+  }
+
+  CB_ASSERT_OK_AND_ASSIGN(std::vector<uint64_t> indices,
+                          PrepareIndexVectorForStringCountsReport(
+                              observation, metric_def, report_def, num_cells_per_hash, num_hashes));
+  std::sort(indices.begin(), indices.end());
+  std::sort(expected_private_indices.begin(), expected_private_indices.end());
+  EXPECT_EQ(indices, expected_private_indices);
+}
+
 TEST_F(PrivacyEncoderTest, ObservationsFromIndicesNoIndices) {
   std::vector<uint64_t> indices;
   auto observations = ObservationsFromIndices(indices);
@@ -647,6 +715,40 @@
   }
 }
 
+TEST_F(PrivacyEncoderTest, MaxIndexForReportStringCounts) {
+  uint32_t max_event_code = 9;
+  uint32_t num_index_points = 6;
+
+  // |metric_def| has 10 valid event vectors.
+  MetricDefinition metric_def;
+  metric_def.set_metric_type(MetricDefinition::STRING);
+  MetricDefinition::MetricDimension *dim = metric_def.add_metric_dimensions();
+  dim->set_dimension("dimension1");
+  dim->set_max_event_code(max_event_code);
+
+  // |report_def| has 6 valid count values.
+  ReportDefinition report_def;
+  report_def.set_report_type(ReportDefinition::STRING_COUNTS);
+  report_def.set_num_index_points(num_index_points);
+
+  CB_ASSERT_OK_AND_ASSIGN(size_t num_cells_per_hash,
+                          PrivacyEncoder::GetNumCountMinCellsPerHash(report_def));
+  CB_ASSERT_OK_AND_ASSIGN(size_t num_hashes, PrivacyEncoder::GetNumCountMinHashes(report_def));
+
+  // The expected max index is:
+  // (# of valid event vectors) * (# valid count values) * (size of count min sketch) - 1
+  // = 10 * 6 * 50 - 1 = 2999.
+  uint64_t expected_max_index =
+      (max_event_code + 1) * num_index_points * num_cells_per_hash * num_hashes - 1;
+
+  auto max_index = PrivacyEncoder::MaxIndexForReport(metric_def, report_def);
+  ASSERT_TRUE(max_index.ok()) << "Failed to get max index with status "
+                              << max_index.status().error_code() << ", "
+                              << max_index.status().error_message();
+
+  EXPECT_EQ(expected_max_index, max_index.ValueOrDie());
+}
+
 TEST_F(PrivacyEncoderTest, MaxIndexForReportUnimplemented) {
   MetricDefinition metric_def;
   ReportDefinition report_def;