[Cobalt 1.1 privacy] Bucket integers on client for per-device histogram reports with privacy

For reports without added privacy, contributions for the
Cobalt 1.1 per-device histogram report types consist of
raw integers which are bucketed on the server side.

However, for per-device histogram reports with privacy,
the encoding is much more straightforward and will likely
give us a better privacy/error tradeoff if we bucket
contributions on the client side and send an encoding of
the histogram bucket index. This CL implements client-side
bucketing and privacy encoding for per-device histogram
reports with privacy and updates the ReportDefinition
proto to note the change.

Change-Id: I817b33125d6960b20c341ccfc684cc7b2c62315d
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/442720
Commit-Queue: Laura Peskin <pesk@google.com>
Reviewed-by: Alexandre Zani <azani@google.com>
diff --git a/src/algorithms/privacy/numeric_encoding.cc b/src/algorithms/privacy/numeric_encoding.cc
index 9c496af..88c7000 100644
--- a/src/algorithms/privacy/numeric_encoding.cc
+++ b/src/algorithms/privacy/numeric_encoding.cc
@@ -34,8 +34,9 @@
          num_index_points;
 }
 
-uint64_t HistogramBucketToIndex(uint64_t bucket_count, uint32_t bucket_index, uint64_t max_count,
-                                uint64_t num_index_points, BitGeneratorInterface<uint32_t>* gen) {
+uint64_t HistogramBucketAndCountToIndex(uint64_t bucket_count, uint32_t bucket_index,
+                                        uint64_t max_count, uint64_t num_index_points,
+                                        BitGeneratorInterface<uint32_t>* gen) {
   return IntegerToIndex(static_cast<int64_t>(bucket_count), 0, static_cast<int64_t>(max_count),
                         num_index_points, gen) +
          num_index_points * static_cast<uint64_t>(bucket_index);
@@ -58,8 +59,8 @@
                                                 static_cast<int64_t>(max_count), num_index_points));
 }
 
-void HistogramBucketFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
-                              uint64_t* bucket_count, uint32_t* bucket_index) {
+void HistogramBucketAndCountFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
+                                      uint64_t* bucket_count, uint32_t* bucket_index) {
   *bucket_index = static_cast<uint32_t>(index / num_index_points);
   uint64_t numeric_index = index - (*bucket_index * num_index_points);
   *bucket_count = static_cast<uint64_t>(
diff --git a/src/algorithms/privacy/numeric_encoding.h b/src/algorithms/privacy/numeric_encoding.h
index 83e16d7..560cfc3 100644
--- a/src/algorithms/privacy/numeric_encoding.h
+++ b/src/algorithms/privacy/numeric_encoding.h
@@ -32,8 +32,9 @@
 
 // Encodes a |bucket_count|, |bucket_index| pair by encoding the |bucket_count| as per the
 // DoubleToIndex encoding scheme plus an offset equal to |bucket_index| * |num_index_points|.
-uint64_t HistogramBucketToIndex(uint64_t bucket_count, uint32_t bucket_index, uint64_t max_count,
-                                uint64_t num_index_points, BitGeneratorInterface<uint32_t>* gen);
+uint64_t HistogramBucketAndCountToIndex(uint64_t bucket_count, uint32_t bucket_index,
+                                        uint64_t max_count, uint64_t num_index_points,
+                                        BitGeneratorInterface<uint32_t>* gen);
 
 // DoubleFromIndex maps |index| to the numeric value corresponding to the boundary point with index
 // |index|.
@@ -47,18 +48,33 @@
 // Decodes an |index| that was encoded with CountToIndex.
 uint64_t CountFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points);
 
-// Decodes an |index| that was encoded with HistogramBucketFromIndex.
-void HistogramBucketFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
-                              uint64_t* bucket_count, uint32_t* bucket_index);
+// Decodes an |index| that was encoded with HistogramBucketAndCountToIndex.
+void HistogramBucketAndCountFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
+                                      uint64_t* bucket_count, uint32_t* bucket_index);
 
 // ValueAndEventVectorIndicesToIndex uniquely maps a |value_index|, |event_vector_index| pair to a
-// single index in the range [0, |max_event_vector_index + 1| * |num_index_points| - 1].
+// single index.
 //
-// See DoubleToIndex for |num_index_points|.
+// If |value_index| is the result of a call to DoubleToIndex() with num_index_points =
+// |num_index_points|, then the returned index is in the range
+// [0, ((|max_event_vector_index| + 1) * |num_index_points|) - 1].
+//
+// If |value_index| is an index into a vector of |num_buckets| histogram buckets, then the returned
+// index is in the range
+// [0, ((|max_event_vector_index| + 1) * |num_buckets|) - 1].
+//
+// If |value_index| is the result of a call to HistogramBucketAndCountToIndex() with
+// num_index_points = |num_index_points|, and if the histogram has |num_buckets| buckets, then the
+// returned index is in the range
+// [0, (|max_event_vector_index| + 1) * |num_buckets| * |num_index_points| - 1].
 uint64_t ValueAndEventVectorIndicesToIndex(uint64_t value_index, uint64_t event_vector_index,
                                            uint64_t max_event_vector_index);
 
 // ValueAndEventVectorIndicesFromIndex is the inverse of ValueAndEventVectorIndicesToIndex.
+//
+// After a call to this function, depending on the context, |value_index| is an index corresponding
+// to a signed numeric value, a count, a histogram bucket index, or a histogram bucket index
+// together with a count.
 void ValueAndEventVectorIndicesFromIndex(uint64_t index, uint64_t max_event_vector_index,
                                          uint64_t* value_index, uint64_t* event_vector_index);
 
diff --git a/src/algorithms/privacy/numeric_encoding_test.cc b/src/algorithms/privacy/numeric_encoding_test.cc
index c86c1b8..70c9d82 100644
--- a/src/algorithms/privacy/numeric_encoding_test.cc
+++ b/src/algorithms/privacy/numeric_encoding_test.cc
@@ -85,7 +85,7 @@
   EXPECT_EQ(CountToIndex(count, max_count, num_index_points, &gen), expected);
 }
 
-TEST(NumericEncodingTest, HistogramBucketToIndex) {
+TEST(NumericEncodingTest, HistogramBucketAndCountToIndex) {
   uint64_t max_count = 10u;
   uint64_t num_index_points = 6u;
   uint64_t bucket_count = 4u;
@@ -94,8 +94,9 @@
 
   uint64_t expected = 44u;
 
-  EXPECT_EQ(HistogramBucketToIndex(bucket_count, bucket_index, max_count, num_index_points, &gen),
-            expected);
+  EXPECT_EQ(
+      HistogramBucketAndCountToIndex(bucket_count, bucket_index, max_count, num_index_points, &gen),
+      expected);
 }
 
 TEST(NumericEncodingTest, DoubleFromIndex) {
@@ -134,7 +135,7 @@
   }
 }
 
-TEST(NumericEncodingTest, HistogramBucketFromIndex) {
+TEST(NumericEncodingTest, HistogramBucketAndCountFromIndex) {
   uint64_t max_count = 10u;
   uint64_t num_index_points = 6u;
   uint64_t num_buckets = 10u;
@@ -146,7 +147,8 @@
     uint64_t bucket_count;
     uint32_t bucket_index;
 
-    HistogramBucketFromIndex(index, max_count, num_index_points, &bucket_count, &bucket_index);
+    HistogramBucketAndCountFromIndex(index, max_count, num_index_points, &bucket_count,
+                                     &bucket_index);
     EXPECT_EQ(bucket_count, expected_bucket_count);
     EXPECT_EQ(bucket_index, expected_bucket_index);
 
diff --git a/src/logger/BUILD.gn b/src/logger/BUILD.gn
index 5daefce..3d1bb87 100644
--- a/src/logger/BUILD.gn
+++ b/src/logger/BUILD.gn
@@ -459,6 +459,7 @@
     "$cobalt_root/src/algorithms/random:random",
     "$cobalt_root/src/lib/statusor",
     "$cobalt_root/src/pb",
+    "$cobalt_root/src/registry:buckets_config",
     "$cobalt_root/src/registry:cobalt_registry_proto",
   ]
 }
diff --git a/src/logger/privacy_encoder.cc b/src/logger/privacy_encoder.cc
index 3374e7f..3e98aaa 100644
--- a/src/logger/privacy_encoder.cc
+++ b/src/logger/privacy_encoder.cc
@@ -6,8 +6,29 @@
 #include "src/lib/statusor/status_macros.h"
 #include "src/logger/event_vector_index.h"
 #include "src/pb/observation.pb.h"
+#include "src/registry/buckets_config.h"
 
 namespace cobalt::logger {
+namespace {
+
+// Returns the number of histogram buckets associated to an IntegerBuckets, including the underflow
+// and overflow buckets.
+lib::statusor::StatusOr<uint32_t> GetNumIntegerBuckets(const IntegerBuckets &int_buckets) {
+  uint32_t num_buckets = 2;
+  switch (int_buckets.buckets_case()) {
+    case IntegerBuckets::kExponential:
+      num_buckets += int_buckets.exponential().num_buckets();
+      break;
+    case IntegerBuckets::kLinear:
+      num_buckets += int_buckets.linear().num_buckets();
+      break;
+    default:
+      return util::Status(util::INVALID_ARGUMENT, "invalid IntegerBuckets type.");
+  }
+  return num_buckets;
+}
+
+}  // namespace
 
 PrivacyEncoder::PrivacyEncoder(std::unique_ptr<SecureBitGeneratorInterface<uint32_t>> secure_gen,
                                std::unique_ptr<BitGeneratorInterface<uint32_t>> gen)
@@ -58,9 +79,7 @@
     }
     case ReportDefinition::FLEETWIDE_OCCURRENCE_COUNTS:
     case ReportDefinition::HOURLY_VALUE_NUMERIC_STATS:
-    case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
-    case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS:
-    case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+    case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS: {
       return (GetNumEventVectors(metric_def.metric_dimensions()) * report_def.num_index_points()) -
              1;
     }
@@ -69,6 +88,15 @@
                   report_def.num_index_points()) -
              1;
     }
+    case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
+    case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+      lib::statusor::StatusOr<uint32_t> num_buckets =
+          GetNumIntegerBuckets(report_def.int_buckets());
+      if (!num_buckets.ok()) {
+        return num_buckets;
+      }
+      return (GetNumEventVectors(metric_def.metric_dimensions()) * num_buckets.ValueOrDie()) - 1;
+    }
 
     default:
       return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
@@ -86,9 +114,7 @@
     }
     case ReportDefinition::FLEETWIDE_OCCURRENCE_COUNTS:
     case ReportDefinition::HOURLY_VALUE_NUMERIC_STATS:
-    case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
-    case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS:
-    case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+    case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS: {
       CB_ASSIGN_OR_RETURN(indices, PrepareIndexVectorForPerDeviceIntegerReport(
                                        observation, metric_def, report_def));
       break;
@@ -98,6 +124,12 @@
           indices, PrepareIndexVectorForFleetwideMeansReport(observation, metric_def, report_def));
       break;
     }
+    case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
+    case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+      CB_ASSIGN_OR_RETURN(indices, PrepareIndexVectorForPerDeviceHistogramsReport(
+                                       observation, metric_def, report_def));
+      break;
+    }
 
     default:
       return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
@@ -204,4 +236,28 @@
   return occurred_indices;
 }
 
+lib::statusor::StatusOr<std::vector<uint64_t>>
+PrivacyEncoder::PrepareIndexVectorForPerDeviceHistogramsReport(const Observation &observation,
+                                                               const MetricDefinition &metric_def,
+                                                               const ReportDefinition &report_def) {
+  std::vector<uint64_t> occurred_indices;
+  if (!observation.has_integer()) {
+    return util::Status(util::INVALID_ARGUMENT, "observation type is not IntegerObservation.");
+  }
+
+  std::unique_ptr<config::IntegerBucketConfig> integer_buckets =
+      config::IntegerBucketConfig::CreateFromProto(report_def.int_buckets());
+
+  for (const auto &value : observation.integer().values()) {
+    std::vector<uint32_t> event_codes(value.event_codes().begin(), value.event_codes().end());
+    CB_ASSIGN_OR_RETURN(auto event_vector_index, EventVectorToIndex(event_codes, metric_def));
+
+    uint32_t bucket_index = integer_buckets->BucketIndex(value.value());
+    occurred_indices.push_back(ValueAndEventVectorIndicesToIndex(
+        bucket_index, event_vector_index, GetNumEventVectors(metric_def.metric_dimensions()) - 1));
+  }
+
+  return occurred_indices;
+}
+
 }  // namespace cobalt::logger
diff --git a/src/logger/privacy_encoder.h b/src/logger/privacy_encoder.h
index a8adf7e..e38e3b8 100644
--- a/src/logger/privacy_encoder.h
+++ b/src/logger/privacy_encoder.h
@@ -63,8 +63,6 @@
   static lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForUniqueDeviceCount(
       const Observation &observation, const MetricDefinition &metric_def);
 
-  // Prepare the indices for reports where the contribution per device is an integer per event
-  // vector.
   lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForPerDeviceIntegerReport(
       const Observation &observation, const MetricDefinition &metric_def,
       const ReportDefinition &report_def);
@@ -73,6 +71,11 @@
       const Observation &observation, const MetricDefinition &metric_def,
       const ReportDefinition &report_def);
 
+  static lib::statusor::StatusOr<std::vector<uint64_t>>
+  PrepareIndexVectorForPerDeviceHistogramsReport(const Observation &observation,
+                                                 const MetricDefinition &metric_def,
+                                                 const ReportDefinition &report_def);
+
   static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
       const std::vector<uint64_t> &indices);
 
diff --git a/src/logger/privacy_encoder_test.cc b/src/logger/privacy_encoder_test.cc
index 129fdf8..153e633 100644
--- a/src/logger/privacy_encoder_test.cc
+++ b/src/logger/privacy_encoder_test.cc
@@ -39,6 +39,13 @@
                                                                        report_def);
   }
 
+  lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForPerDeviceHistogramsReport(
+      const Observation &observation, const MetricDefinition &metric_def,
+      const ReportDefinition &report_def) {
+    return privacy_encoder_->PrepareIndexVectorForPerDeviceHistogramsReport(observation, metric_def,
+                                                                            report_def);
+  }
+
   static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
       const std::vector<uint64_t> &indices) {
     return PrivacyEncoder::ObservationsFromIndices(indices);
@@ -260,6 +267,58 @@
   EXPECT_EQ(status_or_indices.status().error_code(), util::INVALID_ARGUMENT);
 }
 
+TEST_F(PrivacyEncoderTest, PerDeviceHistograms) {
+  // |metric_def| has 11 valid event vectors.
+  MetricDefinition metric_def;
+  metric_def.set_metric_type(MetricDefinition::OCCURRENCE);
+  auto metric_dim = metric_def.add_metric_dimensions();
+  metric_dim->set_dimension("dimension 0");
+  metric_dim->set_max_event_code(10);
+
+  // Counting the underflow and overflow buckets, |int_buckets| has 8 + 2 = 10 valid bucket indices.
+  LinearIntegerBuckets int_buckets;
+  int_buckets.set_floor(0);
+  int_buckets.set_num_buckets(8);
+  int_buckets.set_step_size(2);
+
+  ReportDefinition report_def;
+  *report_def.mutable_int_buckets()->mutable_linear() = int_buckets;
+
+  std::vector<uint64_t> expected_indices = {14, 40};
+  Observation observation;
+  auto integer_obs = observation.mutable_integer();
+
+  // Add a value for event code 3 and bucket index 1.
+  // The expected index is:
+  // num_event_vectors * bucket_index + event_code = 11 * 1 + 3 = 14.
+  auto val = integer_obs->add_values();
+  val->add_event_codes(3u);
+  val->set_value(0);
+
+  // Add a value for event code 7 and bucket index 3.
+  // The expected index is:
+  // num_event_vectors * bucket_index + event_code = 11 * 3 + 7 = 40.
+  val = integer_obs->add_values();
+  val->add_event_codes(7u);
+  val->set_value(5);
+
+  auto status_or_indices =
+      PrepareIndexVectorForPerDeviceHistogramsReport(observation, metric_def, report_def);
+  ASSERT_TRUE(status_or_indices.ok());
+  EXPECT_EQ(status_or_indices.ValueOrDie(), expected_indices);
+}
+
+TEST_F(PrivacyEncoderTest, PerDeviceHistogramsInvalidObservationType) {
+  MetricDefinition metric_def;
+  ReportDefinition report_def;
+  Observation observation;
+
+  auto status_or_indices =
+      PrepareIndexVectorForPerDeviceHistogramsReport(observation, metric_def, report_def);
+  ASSERT_FALSE(status_or_indices.ok());
+  EXPECT_EQ(status_or_indices.status().error_code(), util::INVALID_ARGUMENT);
+}
+
 TEST_F(PrivacyEncoderTest, ObservationsFromIndicesNoIndices) {
   std::vector<uint64_t> indices;
   auto observations = ObservationsFromIndices(indices);
@@ -307,9 +366,7 @@
 
   std::vector<ReportDefinition::ReportType> per_device_integer_report_types = {
       ReportDefinition::FLEETWIDE_OCCURRENCE_COUNTS, ReportDefinition::HOURLY_VALUE_NUMERIC_STATS,
-      ReportDefinition::HOURLY_VALUE_HISTOGRAMS,     ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS,
-      ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS,
-  };
+      ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS};
 
   ReportDefinition report_def;
   uint32_t num_index_points = 5;
@@ -346,6 +403,47 @@
   EXPECT_EQ(status_or.ValueOrDie(), expected_max_index);
 }
 
+TEST_F(PrivacyEncoderTest, MaxIndexForReportPerDeviceHistogram) {
+  MetricDefinition metric_def;
+  uint32_t max_event_code = 10;
+  auto dim = metric_def.add_metric_dimensions();
+  dim->set_dimension("dimension 1");
+  dim->set_max_event_code(max_event_code);
+
+  std::vector<ReportDefinition::ReportType> per_device_histogram_report_types = {
+      ReportDefinition::HOURLY_VALUE_HISTOGRAMS, ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS};
+
+  IntegerBuckets linear_buckets;
+  linear_buckets.mutable_linear()->set_floor(0);
+  linear_buckets.mutable_linear()->set_num_buckets(3);
+
+  IntegerBuckets exp_buckets;
+  exp_buckets.mutable_exponential()->set_floor(0);
+  exp_buckets.mutable_exponential()->set_num_buckets(3);
+
+  std::vector<IntegerBuckets> bucket_variants = {linear_buckets, exp_buckets};
+
+  ReportDefinition report_def;
+
+  // There are 11 event codes and 5 histogram buckets (3 registered + 1 underflow + 1 overflow) for
+  // a total of 55 possible indices.
+  uint64_t expected_max_index = 54;
+
+  for (const auto report_type : per_device_histogram_report_types) {
+    for (const auto &int_buckets : bucket_variants) {
+      report_def.set_report_type(report_type);
+      *report_def.mutable_int_buckets() = int_buckets;
+
+      auto max_index = PrivacyEncoder::MaxIndexForReport(metric_def, report_def);
+      ASSERT_TRUE(max_index.ok()) << "Failed to get max index with status "
+                                  << max_index.status().error_code() << ", "
+                                  << max_index.status().error_message();
+
+      EXPECT_EQ(expected_max_index, max_index.ValueOrDie());
+    }
+  }
+}
+
 TEST_F(PrivacyEncoderTest, MaxIndexForReportUnimplemented) {
   MetricDefinition metric_def;
   ReportDefinition report_def;
diff --git a/src/registry/report_definition.proto b/src/registry/report_definition.proto
index b84a1ef..62fd95e 100644
--- a/src/registry/report_definition.proto
+++ b/src/registry/report_definition.proto
@@ -322,7 +322,9 @@
     // ReportDefinition fields particular to this type:
     //   - local_aggregation_procedure (only when the metric type is INTEGER)
     //   - local_aggregation_period
-    //   - int_buckets (This is used only on the server.)
+    //   - int_buckets (this is used only on the server for reports without
+    //     added privacy, but is used on the client for reports with added
+    //     privacy)
     //
     // This report type is part of Cobalt 1.1. Do not use it yet.
     UNIQUE_DEVICE_HISTOGRAMS = 13;
@@ -365,7 +367,9 @@
     //
     // ReportDefinition fields particular to this type:
     //   - local_aggregation_procedure (only when the metric type is INTEGER)
-    //   - int_buckets (This is used only on the server.)
+    //   - int_buckets (this is used only on the server for reports without
+    //     added privacy, but is used on the client for reports with added
+    //     privacy)
     //
     // This report type is part of Cobalt 1.1. Do not use it yet.
     HOURLY_VALUE_HISTOGRAMS = 14;