[Cobalt 1.1 privacy] Implement MakePrivateObservations for StringCounts
Uses the CountMin library to encode StringHistogramObservations for
StringCounts reports with added privacy.
Co-authored-by: azani@google.com
Co-authored-by: pesk@google.com
Change-Id: I6ae5eee56d129340d2c018bfa327dfe5338732df
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/464474
Reviewed-by: Alexandre Zani <azani@google.com>
Fuchsia-Auto-Submit: Alexandre Zani <azani@google.com>
Commit-Queue: Auto-Submit <auto-submit@fuchsia-infra.iam.gserviceaccount.com>
diff --git a/src/logger/BUILD.gn b/src/logger/BUILD.gn
index 487dbdd..cd44307 100644
--- a/src/logger/BUILD.gn
+++ b/src/logger/BUILD.gn
@@ -456,6 +456,7 @@
public_deps = [
":event_vector_index",
+ "$cobalt_root/src/algorithms/privacy:count_min",
"$cobalt_root/src/algorithms/privacy:numeric_encoding",
"$cobalt_root/src/algorithms/privacy:rappor",
"$cobalt_root/src/algorithms/random:random",
@@ -473,7 +474,10 @@
deps = [
":privacy_encoder",
+ "$cobalt_root/src/algorithms/privacy:count_min",
"$cobalt_root/src/algorithms/random:test_secure_random",
+ "$cobalt_root/src/lib/statusor",
+ "$cobalt_root/src/lib/util:hash",
"$cobalt_root/src/registry:cobalt_registry_proto",
"//third_party/googletest:gtest",
]
diff --git a/src/logger/privacy_encoder.cc b/src/logger/privacy_encoder.cc
index df061f5..c480d1f 100644
--- a/src/logger/privacy_encoder.cc
+++ b/src/logger/privacy_encoder.cc
@@ -1,5 +1,6 @@
#include "src/logger/privacy_encoder.h"
+#include "src/algorithms/privacy/count_min.h"
#include "src/algorithms/privacy/numeric_encoding.h"
#include "src/algorithms/privacy/rappor.h"
#include "src/algorithms/random/random.h"
@@ -11,6 +12,10 @@
namespace cobalt::logger {
namespace {
+// The dimensions of a CountMin sketch for a report of type StringCounts.
+const size_t kNumCountMinCellsPerHash = 10;
+const size_t kNumCountMinHashes = 5;
+
// Returns the number of histogram buckets associated to an IntegerBuckets, including the underflow
// and overflow buckets.
lib::statusor::StatusOr<uint32_t> GetNumIntegerBuckets(const IntegerBuckets &int_buckets) {
@@ -117,12 +122,34 @@
report_def.num_index_points()) -
1;
}
-
+ case ReportDefinition::STRING_COUNTS: {
+ CB_ASSIGN_OR_RETURN(size_t num_cells_per_hash, GetNumCountMinCellsPerHash(report_def));
+ CB_ASSIGN_OR_RETURN(size_t num_hashes, GetNumCountMinHashes(report_def));
+ return (num_cells_per_hash * num_hashes * GetNumEventVectors(metric_def.metric_dimensions()) *
+ report_def.num_index_points()) -
+ 1;
+ }
default:
return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
}
}
+lib::statusor::StatusOr<size_t> PrivacyEncoder::GetNumCountMinCellsPerHash(
+ const ReportDefinition &report_def) {
+ if (report_def.report_type() != ReportDefinition::STRING_COUNTS) {
+ return util::Status(util::INVALID_ARGUMENT, "report must have type StringCounts");
+ }
+ return kNumCountMinCellsPerHash;
+}
+
+lib::statusor::StatusOr<size_t> PrivacyEncoder::GetNumCountMinHashes(
+ const ReportDefinition &report_def) {
+ if (report_def.report_type() != ReportDefinition::STRING_COUNTS) {
+ return util::Status(util::INVALID_ARGUMENT, "report must have type StringCounts");
+ }
+ return kNumCountMinHashes;
+}
+
lib::statusor::StatusOr<std::vector<uint64_t>> PrivacyEncoder::PrepareIndexVector(
const Observation &observation, const MetricDefinition &metric_def,
const ReportDefinition &report_def) {
@@ -155,6 +182,14 @@
observation, metric_def, report_def));
break;
}
+ case ReportDefinition::STRING_COUNTS: {
+ CB_ASSIGN_OR_RETURN(size_t num_cells_per_hash, GetNumCountMinCellsPerHash(report_def));
+ CB_ASSIGN_OR_RETURN(size_t num_hashes, GetNumCountMinHashes(report_def));
+ CB_ASSIGN_OR_RETURN(
+ indices, PrepareIndexVectorForStringCountsReport(observation, metric_def, report_def,
+ num_cells_per_hash, num_hashes));
+ break;
+ }
default:
return util::Status(util::UNIMPLEMENTED, "this is not yet implemented");
@@ -316,6 +351,47 @@
return occurred_indices;
}
+lib::statusor::StatusOr<std::vector<uint64_t>>
+PrivacyEncoder::PrepareIndexVectorForStringCountsReport(const Observation &observation,
+ const MetricDefinition &metric_def,
+ const ReportDefinition &report_def,
+ size_t num_cells_per_hash,
+ size_t num_hashes) {
+ if (!observation.has_string_histogram()) {
+ return util::Status(util::INVALID_ARGUMENT,
+ "observation type is not StringHistogramObservation.");
+ }
+
+ std::vector<uint64_t> occurred_indices;
+
+ for (const auto &string_histogram : observation.string_histogram().string_histograms()) {
+ std::vector<uint32_t> event_codes(string_histogram.event_codes().begin(),
+ string_histogram.event_codes().end());
+ CB_ASSIGN_OR_RETURN(auto event_vector_index, EventVectorToIndex(event_codes, metric_def));
+
+ auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+ for (int i = 0; i < string_histogram.bucket_indices_size(); ++i) {
+ uint64_t clipped_count = ClipCount(string_histogram.bucket_counts(i), report_def);
+ std::string string_hash =
+ observation.string_histogram().string_hashes(string_histogram.bucket_indices(i));
+ count_min.Increment(string_hash, clipped_count);
+ }
+
+ for (size_t cell_index = 0; cell_index < count_min.size(); ++cell_index) {
+ CB_ASSIGN_OR_RETURN(uint64_t cell_value, count_min.GetCellValue(cell_index));
+ if (cell_value > 0) {
+ uint64_t count_min_index =
+ HistogramBucketAndCountToIndex(cell_index, cell_value, report_def.max_count(),
+ report_def.num_index_points(), gen_.get());
+ occurred_indices.push_back(ValueAndEventVectorIndicesToIndex(
+ count_min_index, event_vector_index,
+ GetNumEventVectors(metric_def.metric_dimensions()) - 1));
+ }
+ }
+ }
+ return occurred_indices;
+}
+
int64_t PrivacyEncoder::ClipValue(int64_t value, const ReportDefinition &report_def) {
if (value > report_def.max_value()) {
return report_def.max_value();
diff --git a/src/logger/privacy_encoder.h b/src/logger/privacy_encoder.h
index b165bd0..ec95a92 100644
--- a/src/logger/privacy_encoder.h
+++ b/src/logger/privacy_encoder.h
@@ -5,6 +5,7 @@
#ifndef COBALT_SRC_LOGGER_PRIVACY_ENCODER_H_
#define COBALT_SRC_LOGGER_PRIVACY_ENCODER_H_
+#include "src/algorithms/privacy/count_min.h"
#include "src/algorithms/random/random.h"
#include "src/lib/statusor/statusor.h"
#include "src/pb/observation.pb.h"
@@ -49,6 +50,16 @@
static lib::statusor::StatusOr<uint64_t> MaxIndexForReport(const MetricDefinition &metric_def,
const ReportDefinition &report_def);
+ // The number of sketch cells per hash function in CountMin sketch for a StringCounts report
+ // |report_def|. Currently hard-coded to 10. Returns an error status if |report_def| is not of
+ // type StringCounts.
+ static lib::statusor::StatusOr<size_t> GetNumCountMinCellsPerHash(
+ const ReportDefinition &report_def);
+
+ // The number of hash functions in a CountMin sketch for a StringCounts report |report_def|.
+ // Currently hard-coded to 5. Returns an error status if |report_def| is not of type StringCounts.
+ static lib::statusor::StatusOr<size_t> GetNumCountMinHashes(const ReportDefinition &report_def);
+
private:
friend class PrivacyEncoderTest;
@@ -80,6 +91,10 @@
const Observation &observation, const MetricDefinition &metric_def,
const ReportDefinition &report_def);
+ lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForStringCountsReport(
+ const Observation &observation, const MetricDefinition &metric_def,
+ const ReportDefinition &report_def, size_t num_cells_per_hash, size_t num_hashes);
+
static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
const std::vector<uint64_t> &indices);
diff --git a/src/logger/privacy_encoder_test.cc b/src/logger/privacy_encoder_test.cc
index 7ccd449..5b901f6 100644
--- a/src/logger/privacy_encoder_test.cc
+++ b/src/logger/privacy_encoder_test.cc
@@ -1,9 +1,14 @@
#include "src/logger/privacy_encoder.h"
+#include <algorithm>
+
#include <gtest/gtest.h>
+#include "src/algorithms/privacy/count_min.h"
#include "src/algorithms/random/test_secure_random.h"
+#include "src/lib/statusor/status_macros.h"
#include "src/lib/statusor/statusor.h"
+#include "src/lib/util/hash.h"
#include "src/pb/observation.pb.h"
#include "src/registry/metric_definition.pb.h"
#include "src/registry/report_definition.pb.h"
@@ -53,6 +58,13 @@
report_def);
}
+ lib::statusor::StatusOr<std::vector<uint64_t>> PrepareIndexVectorForStringCountsReport(
+ const Observation &observation, const MetricDefinition &metric_def,
+ const ReportDefinition &report_def, size_t num_cells_per_hash, size_t num_hashes) {
+ return privacy_encoder_->PrepareIndexVectorForStringCountsReport(
+ observation, metric_def, report_def, num_cells_per_hash, num_hashes);
+ }
+
static std::vector<std::unique_ptr<Observation>> ObservationsFromIndices(
const std::vector<uint64_t> &indices) {
return PrivacyEncoder::ObservationsFromIndices(indices);
@@ -127,7 +139,8 @@
ReportDefinition::FLEETWIDE_MEANS,
ReportDefinition::HOURLY_VALUE_HISTOGRAMS,
ReportDefinition::UNIQUE_DEVICE_HISTOGRAMS,
- ReportDefinition::FLEETWIDE_HISTOGRAMS};
+ ReportDefinition::FLEETWIDE_HISTOGRAMS,
+ ReportDefinition::STRING_COUNTS};
MetricDefinition metric_def;
ReportDefinition report_def;
@@ -141,24 +154,6 @@
}
}
-// MakePrivateObservations() is not implemented yet for these report types. Move report types to the
-// MakePrivateObservationsImplemented test as they are implemented.
-TEST_F(PrivacyEncoderTest, MakePrivateObservationsUnimplemented) {
- std::vector<ReportDefinition::ReportType> unimplemented_report_types = {
- ReportDefinition::STRING_COUNTS};
-
- MetricDefinition metric_def;
- ReportDefinition report_def;
- report_def.set_privacy_level(ReportDefinition::LOW_PRIVACY);
-
- Observation observation;
- for (const ReportDefinition::ReportType report_type : unimplemented_report_types) {
- report_def.set_report_type(report_type);
- EXPECT_EQ(MakePrivateObservations(&observation, metric_def, report_def).status().error_code(),
- util::UNIMPLEMENTED);
- }
-}
-
TEST_F(PrivacyEncoderTest, MakePrivateObservationsNullObservation) {
MetricDefinition metric_def;
ReportDefinition report_def;
@@ -444,6 +439,79 @@
EXPECT_EQ(indices.ValueOrDie(), expected_indices);
}
+TEST_F(PrivacyEncoderTest, StringCounts) {
+ size_t num_cells_per_hash = 2;
+ size_t num_hashes = 2;
+ uint32_t max_event_code = 1;
+ uint32_t max_count = 2;
+ uint32_t num_index_points = max_count + 1;
+
+ // |metric_def| has 2 valid event vectors.
+ MetricDefinition metric_def;
+ metric_def.set_metric_type(MetricDefinition::STRING);
+ metric_def.set_string_buffer_max(10);
+ auto metric_dim = metric_def.add_metric_dimensions();
+ metric_dim->set_dimension("dimension 0");
+ metric_dim->set_max_event_code(max_event_code);
+
+ // |report_def| has 3 valid count values: {0, 1, 2}.
+ ReportDefinition report_def;
+ report_def.set_max_count(max_count);
+ report_def.set_num_index_points(num_index_points);
+
+ // Prepare a StringHistogramObservation with 2 IndexHistograms:
+ // - with event vector {0}: ("blobfs", count = 1), ("thinfs", count = 2)
+ // - with event vector {1}: ("thinfs", count = 2)
+ Observation observation;
+ StringHistogramObservation *string_histogram_obs = observation.mutable_string_histogram();
+
+ string_histogram_obs->add_string_hashes(util::FarmhashFingerprint("blobfs"));
+ string_histogram_obs->add_string_hashes(util::FarmhashFingerprint("thinfs"));
+
+ IndexHistogram *histogram_1 = string_histogram_obs->add_string_histograms();
+ histogram_1->add_event_codes(0u);
+ histogram_1->add_bucket_indices(0u);
+ histogram_1->add_bucket_counts(1u);
+ histogram_1->add_bucket_indices(1u);
+ histogram_1->add_bucket_counts(2u);
+
+ IndexHistogram *histogram_2 = string_histogram_obs->add_string_histograms();
+ histogram_2->add_event_codes(1u);
+ histogram_2->add_bucket_indices(1u);
+ histogram_2->add_bucket_counts(2u);
+
+ // The general formula for an expected index is:
+ // (count + num_index_points * sketch cell index) * (num_event_vectors) + event_vector_index.
+ // Cells with count 0 are omitted.
+ auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+ std::vector<size_t> blobfs_indices =
+ count_min.GetCellIndices(string_histogram_obs->string_hashes(0));
+ std::vector<size_t> thinfs_indices =
+ count_min.GetCellIndices(string_histogram_obs->string_hashes(1));
+
+ std::vector<uint64_t> expected_private_indices;
+ expected_private_indices.reserve(6);
+
+ for (size_t index : blobfs_indices) {
+ // Expected private indices for a count of 1 for "blobfs" with event vector {0}
+ expected_private_indices.push_back((1 + num_index_points * index) * (max_event_code + 1) + 0);
+ }
+
+ for (size_t index : thinfs_indices) {
+ // Expected private indices for a count of 2 for "thinfs" with event vector {0}
+ expected_private_indices.push_back((2 + num_index_points * index) * (max_event_code + 1) + 0);
+ // Expected private indices for a count of 2 for "thinfs" with event vector {1}
+ expected_private_indices.push_back((2 + num_index_points * index) * (max_event_code + 1) + 1);
+ }
+
+ CB_ASSERT_OK_AND_ASSIGN(std::vector<uint64_t> indices,
+ PrepareIndexVectorForStringCountsReport(
+ observation, metric_def, report_def, num_cells_per_hash, num_hashes));
+ std::sort(indices.begin(), indices.end());
+ std::sort(expected_private_indices.begin(), expected_private_indices.end());
+ EXPECT_EQ(indices, expected_private_indices);
+}
+
TEST_F(PrivacyEncoderTest, ObservationsFromIndicesNoIndices) {
std::vector<uint64_t> indices;
auto observations = ObservationsFromIndices(indices);
@@ -647,6 +715,40 @@
}
}
+TEST_F(PrivacyEncoderTest, MaxIndexForReportStringCounts) {
+ uint32_t max_event_code = 9;
+ uint32_t num_index_points = 6;
+
+ // |metric_def| has 10 valid event vectors.
+ MetricDefinition metric_def;
+ metric_def.set_metric_type(MetricDefinition::STRING);
+ MetricDefinition::MetricDimension *dim = metric_def.add_metric_dimensions();
+ dim->set_dimension("dimension1");
+ dim->set_max_event_code(max_event_code);
+
+ // |report_def| has 6 valid count values.
+ ReportDefinition report_def;
+ report_def.set_report_type(ReportDefinition::STRING_COUNTS);
+ report_def.set_num_index_points(num_index_points);
+
+ CB_ASSERT_OK_AND_ASSIGN(size_t num_cells_per_hash,
+ PrivacyEncoder::GetNumCountMinCellsPerHash(report_def));
+ CB_ASSERT_OK_AND_ASSIGN(size_t num_hashes, PrivacyEncoder::GetNumCountMinHashes(report_def));
+
+ // The expected max index is:
+ // (# of valid event vectors) * (# valid count values) * (size of count min sketch) - 1
+ // = 10 * 6 * 50 - 1 = 2999.
+ uint64_t expected_max_index =
+ (max_event_code + 1) * num_index_points * num_cells_per_hash * num_hashes - 1;
+
+ auto max_index = PrivacyEncoder::MaxIndexForReport(metric_def, report_def);
+ ASSERT_TRUE(max_index.ok()) << "Failed to get max index with status "
+ << max_index.status().error_code() << ", "
+ << max_index.status().error_message();
+
+ EXPECT_EQ(expected_max_index, max_index.ValueOrDie());
+}
+
TEST_F(PrivacyEncoderTest, MaxIndexForReportUnimplemented) {
MetricDefinition metric_def;
ReportDefinition report_def;