[Cobalt 1.1 privacy] Bucket integers on client for per-device histogram reports with privacy
For reports without added privacy, contributions for the
Cobalt 1.1 per-device histogram report types consist of
raw integers which are bucketed on the server side.
However, for per-device histogram reports with privacy,
the encoding is much more straightforward and will likely
give us a better privacy/error tradeoff if we bucket
contributions on the client side and send an encoding of
the histogram bucket index. This CL implements client-side
bucketing and privacy encoding for per-device histogram
reports with privacy and updates the ReportDefinition
proto to note the change.
Change-Id: I817b33125d6960b20c341ccfc684cc7b2c62315d
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/442720
Commit-Queue: Laura Peskin <pesk@google.com>
Reviewed-by: Alexandre Zani <azani@google.com>
diff --git a/src/algorithms/privacy/numeric_encoding.cc b/src/algorithms/privacy/numeric_encoding.cc
index 9c496af..88c7000 100644
--- a/src/algorithms/privacy/numeric_encoding.cc
+++ b/src/algorithms/privacy/numeric_encoding.cc
@@ -34,8 +34,9 @@
num_index_points;
}
-uint64_t HistogramBucketToIndex(uint64_t bucket_count, uint32_t bucket_index, uint64_t max_count,
- uint64_t num_index_points, BitGeneratorInterface<uint32_t>* gen) {
+uint64_t HistogramBucketAndCountToIndex(uint64_t bucket_count, uint32_t bucket_index,
+ uint64_t max_count, uint64_t num_index_points,
+ BitGeneratorInterface<uint32_t>* gen) {
return IntegerToIndex(static_cast<int64_t>(bucket_count), 0, static_cast<int64_t>(max_count),
num_index_points, gen) +
num_index_points * static_cast<uint64_t>(bucket_index);
@@ -58,8 +59,8 @@
static_cast<int64_t>(max_count), num_index_points));
}
-void HistogramBucketFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
- uint64_t* bucket_count, uint32_t* bucket_index) {
+void HistogramBucketAndCountFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
+ uint64_t* bucket_count, uint32_t* bucket_index) {
*bucket_index = static_cast<uint32_t>(index / num_index_points);
uint64_t numeric_index = index - (*bucket_index * num_index_points);
*bucket_count = static_cast<uint64_t>(
diff --git a/src/algorithms/privacy/numeric_encoding.h b/src/algorithms/privacy/numeric_encoding.h
index 83e16d7..560cfc3 100644
--- a/src/algorithms/privacy/numeric_encoding.h
+++ b/src/algorithms/privacy/numeric_encoding.h
@@ -32,8 +32,9 @@
// Encodes a |bucket_count|, |bucket_index| pair by encoding the |bucket_count| as per the
// DoubleToIndex encoding scheme plus an offset equal to |bucket_index| * |num_index_points|.
-uint64_t HistogramBucketToIndex(uint64_t bucket_count, uint32_t bucket_index, uint64_t max_count,
- uint64_t num_index_points, BitGeneratorInterface<uint32_t>* gen);
+uint64_t HistogramBucketAndCountToIndex(uint64_t bucket_count, uint32_t bucket_index,
+ uint64_t max_count, uint64_t num_index_points,
+ BitGeneratorInterface<uint32_t>* gen);
// DoubleFromIndex maps |index| to the numeric value corresponding to the boundary point with index
// |index|.
@@ -47,18 +48,33 @@
// Decodes an |index| that was encoded with CountToIndex.
uint64_t CountFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points);
-// Decodes an |index| that was encoded with HistogramBucketFromIndex.
-void HistogramBucketFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
- uint64_t* bucket_count, uint32_t* bucket_index);
+// Decodes an |index| that was encoded with HistogramBucketAndCountToIndex.
+void HistogramBucketAndCountFromIndex(uint64_t index, uint64_t max_count, uint64_t num_index_points,
+ uint64_t* bucket_count, uint32_t* bucket_index);
// ValueAndEventVectorIndicesToIndex uniquely maps a |value_index|, |event_vector_index| pair to a
-// single index in the range [0, |max_event_vector_index + 1| * |num_index_points| - 1].
+// single index.
//
-// See DoubleToIndex for |num_index_points|.
+// If |value_index| is the result of a call to DoubleToIndex() with num_index_points =
+// |num_index_points|, then the returned index is in the range
+// [0, ((|max_event_vector_index| + 1) * |num_index_points|) - 1].
+//
+// If |value_index| is an index into a vector of |num_buckets| histogram buckets, then the returned
+// index is in the range
+// [0, ((|max_event_vector_index| + 1) * |num_buckets|) - 1].
+//
+// If |value_index| is the result of a call to HistogramBucketAndCountToIndex() with
+// num_index_points = |num_index_points|, and if the histogram has |num_buckets| buckets, then the
+// returned index is in the range
+// [0, (|max_event_vector_index| + 1) * |num_buckets| * |num_index_points| - 1].
uint64_t ValueAndEventVectorIndicesToIndex(uint64_t value_index, uint64_t event_vector_index,
uint64_t max_event_vector_index);
// ValueAndEventVectorIndicesFromIndex is the inverse of ValueAndEventVectorIndicesToIndex.
+//
+// After a call to this function, depending on the context, |value_index| is an index corresponding
+// to a signed numeric value, a count, a histogram bucket index, or a histogram bucket index
+// together with a count.
void ValueAndEventVectorIndicesFromIndex(uint64_t index, uint64_t max_event_vector_index,
uint64_t* value_index, uint64_t* event_vector_index);
diff --git a/src/algorithms/privacy/numeric_encoding_test.cc b/src/algorithms/privacy/numeric_encoding_test.cc
index c86c1b8..70c9d82 100644
--- a/src/algorithms/privacy/numeric_encoding_test.cc
+++ b/src/algorithms/privacy/numeric_encoding_test.cc
@@ -85,7 +85,7 @@
EXPECT_EQ(CountToIndex(count, max_count, num_index_points, &gen), expected);
}
-TEST(NumericEncodingTest, HistogramBucketToIndex) {
+TEST(NumericEncodingTest, HistogramBucketAndCountToIndex) {
uint64_t max_count = 10u;
uint64_t num_index_points = 6u;
uint64_t bucket_count = 4u;
@@ -94,8 +94,9 @@
uint64_t expected = 44u;
- EXPECT_EQ(HistogramBucketToIndex(bucket_count, bucket_index, max_count, num_index_points, &gen),
- expected);
+ EXPECT_EQ(
+ HistogramBucketAndCountToIndex(bucket_count, bucket_index, max_count, num_index_points, &gen),
+ expected);
}
TEST(NumericEncodingTest, DoubleFromIndex) {
@@ -134,7 +135,7 @@
}
}
-TEST(NumericEncodingTest, HistogramBucketFromIndex) {
+TEST(NumericEncodingTest, HistogramBucketAndCountFromIndex) {
uint64_t max_count = 10u;
uint64_t num_index_points = 6u;
uint64_t num_buckets = 10u;
@@ -146,7 +147,8 @@
uint64_t bucket_count;
uint32_t bucket_index;
- HistogramBucketFromIndex(index, max_count, num_index_points, &bucket_count, &bucket_index);
+ HistogramBucketAndCountFromIndex(index, max_count, num_index_points, &bucket_count,
+ &bucket_index);
EXPECT_EQ(bucket_count, expected_bucket_count);
EXPECT_EQ(bucket_index, expected_bucket_index);
diff --git a/src/logger/BUILD.gn b/src/logger/BUILD.gn
index 5daefce..3d1bb87 100644
--- a/src/logger/BUILD.gn
+++ b/src/logger/BUILD.gn
@@ -459,6 +459,7 @@
"$cobalt_root/src/algorithms/random:random",
"$cobalt_root/src/lib/statusor",
"$cobalt_root/src/pb",
+ "$cobalt_root/src/registry:buckets_config",
"$cobalt_root/src/registry:cobalt_registry_proto",
]
}
diff --git a/src/logger/privacy_encoder.cc b/src/logger/privacy_encoder.cc
index 3374e7f..3e98aaa 100644
--- a/src/logger/privacy_encoder.cc
+++ b/src/logger/privacy_encoder.cc
@@ -6,8 +6,29 @@
#include "src/lib/statusor/status_macros.h"
#include "src/logger/event_vector_index.h"
#include "src/pb/observation.pb.h"
+#include "src/registry/buckets_config.h"
namespace cobalt::logger {
+namespace {
+
+// Returns the number of histogram buckets associated to an IntegerBuckets, including the underflow
+// and overflow buckets.
+lib::statusor::StatusOr<uint32_t> GetNumIntegerBuckets(const IntegerBuckets &int_buckets) {
+ uint32_t num_buckets = 2;
+ switch (int_buckets.buckets_case()) {
+ case IntegerBuckets::kExponential:
+ num_buckets += int_buckets.exponential().num_buckets();
+ break;
+ case IntegerBuckets::kLinear:
+ num_buckets += int_buckets.linear().num_buckets();
+ break;
+ default:
+ return util::Status(util::INVALID_ARGUMENT, "invalid IntegerBuckets type.");
+ }
+ return num_buckets;
+}
+
+} // namespace
PrivacyEncoder::PrivacyEncoder(std::unique_ptr<SecureBitGeneratorInterface<uint32_t>> secure_gen,
std::unique_ptr<BitGeneratorInterface<uint32_t>> gen)
@@ -58,9 +79,7 @@
}
case ReportDefinition::FLEETWIDE_OCCURRENCE_COUNTS:
case ReportDefinition::HOURLY_VALUE_NUMERIC_STATS:
- case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
- case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS:
- case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+ case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS: {
return (GetNumEventVectors(metric_def.metric_dimensions()) * report_def.num_index_points()) -
1;
}
@@ -69,6 +88,15 @@
report_def.num_index_points()) -
1;
}
+ case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
+ case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+ lib::statusor::StatusOr<uint32_t> num_buckets =
+ GetNumIntegerBuckets(report_def.int_buckets());
+ if (!num_buckets.ok()) {
+ return num_buckets;
+ }
+ return (GetNumEventVectors(metric_def.metric_dimensions()) * num_buckets.ValueOrDie()) - 1;
+ }
default:
return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
@@ -86,9 +114,7 @@
}
case ReportDefinition::FLEETWIDE_OCCURRENCE_COUNTS:
case ReportDefinition::HOURLY_VALUE_NUMERIC_STATS:
- case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
- case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS:
- case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+ case ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS: {
CB_ASSIGN_OR_RETURN(indices, PrepareIndexVectorForPerDeviceIntegerReport(
observation, metric_def, report_def));
break;
@@ -98,6 +124,12 @@
indices, PrepareIndexVectorForFleetwideMeansReport(observation, metric_def, report_def));
break;
}
+ case ReportDefinition::HOURLY_VALUE_HISTOGRAMS:
+ case ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS: {
+ CB_ASSIGN_OR_RETURN(indices, PrepareIndexVectorForPerDeviceHistogramsReport(
+ observation, metric_def, report_def));
+ break;
+ }
default:
return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
@@ -204,4 +236,28 @@
return occurred_indices;
}
+lib::statusor::StatusOr<std::vector<uint64_t>>
+PrivacyEncoder::PrepareIndexVectorForPerDeviceHistogramsReport(const Observation &observation,
+ const MetricDefinition &metric_def,
+ const ReportDefinition &report_def) {
+ std::vector<uint64_t> occurred_indices;
+ if (!observation.has_integer()) {
+ return util::Status(util::INVALID_ARGUMENT, "observation type is not IntegerObservation.");
+ }
+
+ std::unique_ptr<config::IntegerBucketConfig> integer_buckets =
+ config::IntegerBucketConfig::CreateFromProto(report_def.int_buckets());
+
+ for (const auto &value : observation.integer().values()) {
+ std::vector<uint32_t> event_codes(value.event_codes().begin(), value.event_codes().end());
+ CB_ASSIGN_OR_RETURN(auto event_vector_index, EventVectorToIndex(event_codes, metric_def));
+
+ uint32_t bucket_index = integer_buckets->BucketIndex(value.value());
+ occurred_indices.push_back(ValueAndEventVectorIndicesToIndex(
+ bucket_index, event_vector_index, GetNumEventVectors(metric_def.metric_dimensions()) - 1));
+ }
+
+ return occurred_indices;
+}
+
} // namespace cobalt::logger
diff --git a/src/logger/privacy_encoder.h b/src/logger/privacy_encoder.h
index a8adf7e..e38e3b8 100644
--- a/src/logger/privacy_encoder.h
+++ b/src/logger/privacy_encoder.h
@@ -63,8 +63,6 @@
static lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForUniqueDeviceCount(
const Observation &observation, const MetricDefinition &metric_def);
- // Prepare the indices for reports where the contribution per device is an integer per event
- // vector.
lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForPerDeviceIntegerReport(
const Observation &observation, const MetricDefinition &metric_def,
const ReportDefinition &report_def);
@@ -73,6 +71,11 @@
const Observation &observation, const MetricDefinition &metric_def,
const ReportDefinition &report_def);
+ static lib::statusor::StatusOr<std::vector<uint64_t>>
+ PrepareIndexVectorForPerDeviceHistogramsReport(const Observation &observation,
+ const MetricDefinition &metric_def,
+ const ReportDefinition &report_def);
+
static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
const std::vector<uint64_t> &indices);
diff --git a/src/logger/privacy_encoder_test.cc b/src/logger/privacy_encoder_test.cc
index 129fdf8..153e633 100644
--- a/src/logger/privacy_encoder_test.cc
+++ b/src/logger/privacy_encoder_test.cc
@@ -39,6 +39,13 @@
report_def);
}
+ lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForPerDeviceHistogramsReport(
+ const Observation &observation, const MetricDefinition &metric_def,
+ const ReportDefinition &report_def) {
+ return privacy_encoder_->PrepareIndexVectorForPerDeviceHistogramsReport(observation, metric_def,
+ report_def);
+ }
+
static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
const std::vector<uint64_t> &indices) {
return PrivacyEncoder::ObservationsFromIndices(indices);
@@ -260,6 +267,58 @@
EXPECT_EQ(status_or_indices.status().error_code(), util::INVALID_ARGUMENT);
}
+TEST_F(PrivacyEncoderTest, PerDeviceHistograms) {
+ // |metric_def| has 11 valid event vectors.
+ MetricDefinition metric_def;
+ metric_def.set_metric_type(MetricDefinition::OCCURRENCE);
+ auto metric_dim = metric_def.add_metric_dimensions();
+ metric_dim->set_dimension("dimension 0");
+ metric_dim->set_max_event_code(10);
+
+ // Counting the underflow and overflow buckets, |int_buckets| has 8 + 2 = 10 valid bucket indices.
+ LinearIntegerBuckets int_buckets;
+ int_buckets.set_floor(0);
+ int_buckets.set_num_buckets(8);
+ int_buckets.set_step_size(2);
+
+ ReportDefinition report_def;
+ *report_def.mutable_int_buckets()->mutable_linear() = int_buckets;
+
+ std::vector<uint64_t> expected_indices = {14, 40};
+ Observation observation;
+ auto integer_obs = observation.mutable_integer();
+
+ // Add a value for event code 3 and bucket index 1.
+ // The expected index is:
+ // num_event_vectors * bucket_index + event_code = 11 * 1 + 3 = 14.
+ auto val = integer_obs->add_values();
+ val->add_event_codes(3u);
+ val->set_value(0);
+
+ // Add a value for event code 7 and bucket index 3.
+ // The expected index is:
+ // num_event_vectors * bucket_index + event_code = 11 * 3 + 7 = 40.
+ val = integer_obs->add_values();
+ val->add_event_codes(7u);
+ val->set_value(5);
+
+ auto status_or_indices =
+ PrepareIndexVectorForPerDeviceHistogramsReport(observation, metric_def, report_def);
+ ASSERT_TRUE(status_or_indices.ok());
+ EXPECT_EQ(status_or_indices.ValueOrDie(), expected_indices);
+}
+
+TEST_F(PrivacyEncoderTest, PerDeviceHistogramsInvalidObservationType) {
+ MetricDefinition metric_def;
+ ReportDefinition report_def;
+ Observation observation;
+
+ auto status_or_indices =
+ PrepareIndexVectorForPerDeviceHistogramsReport(observation, metric_def, report_def);
+ ASSERT_FALSE(status_or_indices.ok());
+ EXPECT_EQ(status_or_indices.status().error_code(), util::INVALID_ARGUMENT);
+}
+
TEST_F(PrivacyEncoderTest, ObservationsFromIndicesNoIndices) {
std::vector<uint64_t> indices;
auto observations = ObservationsFromIndices(indices);
@@ -307,9 +366,7 @@
std::vector<ReportDefinition::ReportType> per_device_integer_report_types = {
ReportDefinition::FLEETWIDE_OCCURRENCE_COUNTS, ReportDefinition::HOURLY_VALUE_NUMERIC_STATS,
- ReportDefinition::HOURLY_VALUE_HISTOGRAMS, ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS,
- ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS,
- };
+ ReportDefinition::UNIQUE_DEVICE_NUMERIC_STATS};
ReportDefinition report_def;
uint32_t num_index_points = 5;
@@ -346,6 +403,47 @@
EXPECT_EQ(status_or.ValueOrDie(), expected_max_index);
}
+TEST_F(PrivacyEncoderTest, MaxIndexForReportPerDeviceHistogram) {
+ MetricDefinition metric_def;
+ uint32_t max_event_code = 10;
+ auto dim = metric_def.add_metric_dimensions();
+ dim->set_dimension("dimension 1");
+ dim->set_max_event_code(max_event_code);
+
+ std::vector<ReportDefinition::ReportType> per_device_histogram_report_types = {
+ ReportDefinition::HOURLY_VALUE_HISTOGRAMS, ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS};
+
+ IntegerBuckets linear_buckets;
+ linear_buckets.mutable_linear()->set_floor(0);
+ linear_buckets.mutable_linear()->set_num_buckets(3);
+
+ IntegerBuckets exp_buckets;
+ exp_buckets.mutable_exponential()->set_floor(0);
+ exp_buckets.mutable_exponential()->set_num_buckets(3);
+
+ std::vector<IntegerBuckets> bucket_variants = {linear_buckets, exp_buckets};
+
+ ReportDefinition report_def;
+
+ // There are 11 event codes and 5 histogram buckets (3 registered + 1 underflow + 1 overflow) for
+ // a total of 55 possible indices.
+ uint64_t expected_max_index = 54;
+
+ for (const auto report_type : per_device_histogram_report_types) {
+ for (const auto &int_buckets : bucket_variants) {
+ report_def.set_report_type(report_type);
+ *report_def.mutable_int_buckets() = int_buckets;
+
+ auto max_index = PrivacyEncoder::MaxIndexForReport(metric_def, report_def);
+ ASSERT_TRUE(max_index.ok()) << "Failed to get max index with status "
+ << max_index.status().error_code() << ", "
+ << max_index.status().error_message();
+
+ EXPECT_EQ(expected_max_index, max_index.ValueOrDie());
+ }
+ }
+}
+
TEST_F(PrivacyEncoderTest, MaxIndexForReportUnimplemented) {
MetricDefinition metric_def;
ReportDefinition report_def;
diff --git a/src/registry/report_definition.proto b/src/registry/report_definition.proto
index b84a1ef..62fd95e 100644
--- a/src/registry/report_definition.proto
+++ b/src/registry/report_definition.proto
@@ -322,7 +322,9 @@
// ReportDefinition fields particular to this type:
// - local_aggregation_procedure (only when the metric type is INTEGER)
// - local_aggregation_period
- // - int_buckets (This is used only on the server.)
+ // - int_buckets (this is used only on the server for reports without
+ // added privacy, but is used on the client for reports with added
+ // privacy)
//
// This report type is part of Cobalt 1.1. Do not use it yet.
UNIQUE_DEVICE_HISTOGRAMS = 13;
@@ -365,7 +367,9 @@
//
// ReportDefinition fields particular to this type:
// - local_aggregation_procedure (only when the metric type is INTEGER)
- // - int_buckets (This is used only on the server.)
+ // - int_buckets (this is used only on the server for reports without
+ // added privacy, but is used on the client for reports with added
+ // privacy)
//
// This report type is part of Cobalt 1.1. Do not use it yet.
HOURLY_VALUE_HISTOGRAMS = 14;