[Cobalt 1.1 privacy] Move and update CountMin sketch for private string
counts
Moves the CountMin class out of src/algorithms/experimental/
into src/algorithms/privacy/ and updates the interface for
encoding+decoding StringHistogramObservations.
Co-authored-by: azani@google.com
Co-authored-by: pesk@google.com
Change-Id: I493353f7b24cfbcc09b8e92c964ec20427d0db7c
Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt/+/501539
Reviewed-by: Alexandre Zani <azani@google.com>
Commit-Queue: Laura Peskin <pesk@google.com>
diff --git a/src/algorithms/experimental/BUILD.gn b/src/algorithms/experimental/BUILD.gn
index 141ab26..3d05237 100644
--- a/src/algorithms/experimental/BUILD.gn
+++ b/src/algorithms/experimental/BUILD.gn
@@ -11,57 +11,6 @@
import("//build/test.gni")
-source_set("hash") {
- sources = [
- "hash.cc",
- "hash.h",
- ]
-
- configs += [ "$cobalt_root:cobalt_config" ]
-
- public_deps = [ "//third_party/boringssl" ]
-}
-
-source_set("hash_test") {
- testonly = true
- sources = [ "hash_test.cc" ]
-
- deps = [
- ":hash",
- "//third_party/gflags",
- "//third_party/googletest:gtest",
- ]
-
- configs += [ "$cobalt_root:cobalt_config" ]
-}
-
-source_set("count_min") {
- sources = [
- "count_min.cc",
- "count_min.h",
- ]
-
- configs += [ "$cobalt_root:cobalt_config" ]
-
- deps = [
- ":hash",
- "$cobalt_root/src:logging",
- ]
-}
-
-source_set("count_min_test") {
- testonly = true
- sources = [ "count_min_test.cc" ]
-
- deps = [
- ":count_min",
- "//third_party/gflags",
- "//third_party/googletest:gtest",
- ]
-
- configs += [ "$cobalt_root:cobalt_config" ]
-}
-
source_set("randomized_response") {
sources = [
"randomized_response.cc",
@@ -156,8 +105,6 @@
group("tests") {
testonly = true
deps = [
- ":count_min_test",
- ":hash_test",
":histogram_encoder_test",
":integer_encoder_test",
":randomized_response_test",
diff --git a/src/algorithms/experimental/count_min.cc b/src/algorithms/experimental/count_min.cc
deleted file mode 100644
index 54eab71..0000000
--- a/src/algorithms/experimental/count_min.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright 2019 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/experimental/count_min.h"
-
-#include <iostream>
-
-#include "src/algorithms/experimental/hash.h"
-#include "src/logging.h"
-
-namespace cobalt {
-
-CountMin::CountMin(size_t num_cells, size_t num_hashes)
- : num_cells_(num_cells), num_hashes_(num_hashes), cells_(num_cells * num_hashes, 0u) {}
-
-CountMin::CountMin(size_t num_cells, size_t num_hashes, std::vector<uint32_t> cells)
- : num_cells_(num_cells), num_hashes_(num_hashes), cells_(std::move(cells)) {
- CHECK_EQ(cells_.size(), num_cells_ * num_hashes_);
-}
-
-void CountMin::Increment(const char* data, size_t len, uint32_t num) {
- for (size_t i = 0; i < num_hashes_; ++i) {
- cells_[GetSketchCell(data, len, i)] += num;
- }
-}
-
-void CountMin::Increment(const char* data, size_t len) { Increment(data, len, 1); }
-
-void CountMin::Increment(const std::string& data, uint32_t num) {
- Increment(data.data(), data.size(), num);
-}
-
-void CountMin::Increment(const std::string& data) { Increment(data.data(), data.size(), 1); }
-
-uint32_t CountMin::GetCount(const std::string& data) const {
- return GetCount(data.data(), data.size());
-}
-
-uint32_t CountMin::GetCount(const char* data, size_t len) const {
- uint32_t min_count = 0;
- for (size_t i = 0; i < num_hashes_; ++i) {
- uint32_t count = cells_[GetSketchCell(data, len, i)];
- if (count < min_count || i == 0) {
- min_count = count;
- }
- }
- return min_count;
-}
-
-const std::vector<uint32_t>* CountMin::GetSketch() const { return &cells_; }
-
-size_t CountMin::GetSketchCell(const char* data, size_t len, uint64_t index) const {
- return index * num_cells_ +
- TruncatedDigest(reinterpret_cast<const uint8_t*>(data), len, index, num_hashes_);
-}
-
-} // namespace cobalt
diff --git a/src/algorithms/experimental/count_min.h b/src/algorithms/experimental/count_min.h
deleted file mode 100644
index e116e44..0000000
--- a/src/algorithms/experimental/count_min.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2019 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef COBALT_SRC_ALGORITHMS_EXPERIMENTAL_COUNT_MIN_H_
-#define COBALT_SRC_ALGORITHMS_EXPERIMENTAL_COUNT_MIN_H_
-
-#include <cstdint>
-#include <string>
-#include <vector>
-
-namespace cobalt {
-
-// Implements Count-Min Sketch.
-//
-// The count-min sketch represents a distribution of hashable observations as |num_hashes| fields
-// of size |num_cells|.
-//
-// This implementation flattens the representation into a single vector of integers.
-// The field corresponding the kth hash function is from k*num_cells to k*(num_cells+1).
-// e.g. If the kth hash of an observation is n, this corresponds to GetSketch()[k *num_cells + n]
-class CountMin {
- public:
- CountMin(size_t num_cells, size_t num_hashes);
-
- CountMin(size_t num_cells, size_t num_hashes, std::vector<uint32_t> cells);
-
- // Increment the number of observations of |data| which has length |len| by |num|.
- void Increment(const char* data, size_t len, uint32_t num);
-
- // Increment the number of observations of |data| which has length |len| by 1.
- void Increment(const char* data, size_t len);
-
- // Increment the number of observations of |data| by |num|.
- void Increment(const std::string& data, uint32_t num);
-
- // Increment the number of observations of |data| by 1.
- void Increment(const std::string& data);
-
- // Get the estimated count for the specified |data| which has length |len|.
- [[nodiscard]] uint32_t GetCount(const char* data, size_t len) const;
-
- // Get the estimated count for the specified |data|.
- [[nodiscard]] uint32_t GetCount(const std::string& data) const;
-
- // Return a pointer to the cells of the sketch.
- [[nodiscard]] const std::vector<uint32_t>* GetSketch() const;
-
- private:
- // Returns the index in the sketch cells of the |data| of length |len| for hash function |index|.
- size_t GetSketchCell(const char* data, size_t len, uint64_t index) const;
-
- size_t num_cells_;
- size_t num_hashes_;
- std::vector<uint32_t> cells_;
-};
-
-} // namespace cobalt
-#endif // COBALT_SRC_ALGORITHMS_EXPERIMENTAL_COUNT_MIN_H_
diff --git a/src/algorithms/experimental/count_min_test.cc b/src/algorithms/experimental/count_min_test.cc
deleted file mode 100644
index fd1ffc2..0000000
--- a/src/algorithms/experimental/count_min_test.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2019 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/experimental/count_min.h"
-
-#include <gtest/gtest.h>
-
-namespace cobalt {
-
-TEST(CountMin, Basic) {
- std::vector<std::pair<std::string, uint32_t>> test_cases = {
- {"Hello World", 100},
- {"Hallo Welt", 20},
- {"Hola Mundo", 73},
- {"Bonjour, Monde", 22},
- };
- CountMin count_min(2000, 10);
- for (auto [data, count] : test_cases) {
- count_min.Increment(data, count);
- }
-
- for (auto [data, count] : test_cases) {
- EXPECT_EQ(count, count_min.GetCount(data));
- }
-}
-
-TEST(CountMin, GetSketch) {
- std::vector<std::pair<std::string, uint32_t>> test_cases = {
- {"Hello World", 100},
- {"Hallo Welt", 20},
- {"Hola Mundo", 73},
- {"Bonjour, Monde", 22},
- };
- CountMin count_min(2000, 10);
- for (auto [data, count] : test_cases) {
- count_min.Increment(data, count);
- }
-
- auto count_min_cells = count_min.GetSketch();
- CountMin count_min_copy(2000, 10, *count_min_cells);
-
- for (auto [data, count] : test_cases) {
- EXPECT_EQ(count, count_min_copy.GetCount(data));
- }
-}
-
-} // namespace cobalt
diff --git a/src/algorithms/experimental/hash_test.cc b/src/algorithms/experimental/hash_test.cc
deleted file mode 100644
index 844f5d5..0000000
--- a/src/algorithms/experimental/hash_test.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2019 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "src/algorithms/experimental/hash.h"
-
-#include <gtest/gtest.h>
-
-namespace cobalt {
-
-TEST(Hash64WithSeed, StableHash) {
- EXPECT_EQ(0xb5ab47d4df4eb16cu, Hash64WithSeed("hello world", 1));
- EXPECT_EQ(0x351e4687378f8240u, Hash64WithSeed("hi there earth", 1));
- EXPECT_EQ(0xe7b848f961c5d168u, Hash64WithSeed("bonjour monde", 1));
- EXPECT_EQ(0x826c0bf8b3d8ccc9u, Hash64WithSeed("hallo Welt", 1));
-}
-
-TEST(Hash64WithSeed, DifferentSeeds) {
- auto hash0 = Hash64WithSeed("hello world", 0);
- auto hash1 = Hash64WithSeed("hello world", 1);
- EXPECT_NE(hash0, hash1);
-}
-
-TEST(Hash64WithSeed, DifferentFirstLetter) {
- auto hash0 = Hash64WithSeed("hello world", 1);
- auto hash1 = Hash64WithSeed("iello world", 1);
- EXPECT_NE(hash0, hash1);
-}
-
-TEST(Hash64WithSeed, DifferentLastLetter) {
- auto hash0 = Hash64WithSeed("hello world", 1);
- auto hash1 = Hash64WithSeed("hello worle", 1);
- EXPECT_NE(hash0, hash1);
-}
-
-TEST(TruncatedDigest, StableDigest) {
- EXPECT_EQ(36u, TruncatedDigest("hello world", 1, 100));
- EXPECT_EQ(40u, TruncatedDigest("hi there earth", 1, 100));
- EXPECT_EQ(48u, TruncatedDigest("bonjour monde", 1, 100));
- EXPECT_EQ(37u, TruncatedDigest("hallo Welt", 1, 100));
-}
-
-TEST(TruncatedDigest, DifferentSeeds) {
- auto digest0 = TruncatedDigest("hello world", 0, 100);
- auto digest1 = TruncatedDigest("hello world", 1, 100);
- EXPECT_NE(digest0, digest1);
-}
-
-TEST(TruncatedDigest, DifferentFirstLetter) {
- auto digest0 = TruncatedDigest("hello world", 1, 100);
- auto digest1 = TruncatedDigest("iello world", 1, 100);
- EXPECT_NE(digest0, digest1);
-}
-
-TEST(TruncatedDigest, DifferentLastLetter) {
- auto digest0 = TruncatedDigest("hello world", 1, 100);
- auto digest1 = TruncatedDigest("hello worle", 1, 100);
- EXPECT_NE(digest0, digest1);
-}
-
-} // namespace cobalt
diff --git a/src/algorithms/privacy/BUILD.gn b/src/algorithms/privacy/BUILD.gn
index 57fcc44..9f79a6f 100644
--- a/src/algorithms/privacy/BUILD.gn
+++ b/src/algorithms/privacy/BUILD.gn
@@ -8,6 +8,8 @@
testonly = true
deps = [
+ ":count_min_test",
+ ":hash_test",
":numeric_encoding_test",
":rappor_test",
]
@@ -69,3 +71,53 @@
"//build/config:Wno-conversion",
]
}
+
+source_set("count_min") {
+ sources = [ "count_min.h" ]
+
+ configs += [ "$cobalt_root:cobalt_config" ]
+
+ deps = [
+ ":hash",
+ "$cobalt_root/src/lib/statusor",
+ "$cobalt_root/src/lib/util:status",
+ ]
+}
+
+source_set("count_min_test") {
+ testonly = true
+ sources = [ "count_min_test.cc" ]
+
+ deps = [
+ ":count_min",
+ "$cobalt_root/src/lib/statusor",
+ "//third_party/gflags",
+ "//third_party/googletest:gtest",
+ ]
+
+ configs += [ "$cobalt_root:cobalt_config" ]
+}
+
+source_set("hash") {
+ sources = [
+ "hash.cc",
+ "hash.h",
+ ]
+
+ configs += [ "$cobalt_root:cobalt_config" ]
+
+ deps = [ "//third_party/github.com/google/farmhash" ]
+}
+
+source_set("hash_test") {
+ testonly = true
+ sources = [ "hash_test.cc" ]
+
+ deps = [
+ ":hash",
+ "//third_party/gflags",
+ "//third_party/googletest:gtest",
+ ]
+
+ configs += [ "$cobalt_root:cobalt_config" ]
+}
diff --git a/src/algorithms/privacy/count_min.h b/src/algorithms/privacy/count_min.h
new file mode 100644
index 0000000..f9e02d4
--- /dev/null
+++ b/src/algorithms/privacy/count_min.h
@@ -0,0 +1,161 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COBALT_SRC_ALGORITHMS_PRIVACY_COUNT_MIN_H_
+#define COBALT_SRC_ALGORITHMS_PRIVACY_COUNT_MIN_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "src/algorithms/privacy/hash.h"
+#include "src/lib/statusor/statusor.h"
+#include "src/lib/util/status.h"
+
+namespace cobalt {
+
+// Implements Count-Min Sketch.
+//
+// The count-min sketch represents a distribution of hashable observations as |num_hashes| arrays
+// of size |num_cells_per_hash|. It relies on a fixed choice of |num_hashes| hash functions mapping
+// the observation space into the range [0,..., |num_cells_per_hash|].
+//
+// This implementation flattens the representation into a single vector of integer-valued cells.
+// The cell index range corresponding the kth hash function begins at (k * |num_cells_per_hash|) and
+// ends (inclusive) at ((k + 1) * |num_cells_per_hash| - 1).
+//
+// Incrementing the count for an observation |data| has the effect of incrementing the |num_hashes|
+// cells in the set
+// cells(|data|) = { (k * num_cells_per_hash + h_k(|data|)) for k = {1, ..., |num_hashes|} }.
+//
+// To estimate the recorded count for |data|, CountMin computes the minimum value of the cells in
+// cells(|data|).
+template <typename CountType>
+class CountMin {
+ public:
+ static_assert(std::is_arithmetic<CountType>::value,
+ "CountMin is only valid for arithmetic types.");
+
+ // Returns a CountMin sketch with dimensions |num_cells_per_hash| and |num_hashes| and with
+ // |num_cells_per_hash| * |num_hashes| zero-valued cells.
+ static CountMin<CountType> MakeSketch(size_t num_cells_per_hash, size_t num_hashes) {
+ CountMin count_min;
+ count_min.initialize_with_zeros(num_cells_per_hash, num_hashes);
+ return count_min;
+ }
+
+ // Returns a CountMin sketch with dimensions |num_cells_per_hash| and |num_hashes| containing
+ // |cells| if the size of |cells| is equal to |num_cells_per_hash| * |num_hashes|, or an error
+ // status if not.
+ static lib::statusor::StatusOr<CountMin<CountType>> MakeSketchFromCells(
+ size_t num_cells_per_hash, size_t num_hashes, std::vector<CountType> cells) {
+ CountMin count_min;
+ if (util::Status init_status =
+ count_min.initialize_with_cells(num_cells_per_hash, num_hashes, cells);
+ !init_status.ok()) {
+ return init_status;
+ }
+ return count_min;
+ }
+
+ // Returns the number of cells in the sketch.
+ [[nodiscard]] size_t size() const { return cells_.size(); }
+
+ // Increments the number of observations of |data| by |count|.
+ void Increment(const std::string& data, CountType count) {
+ Increment(data.data(), data.size(), count);
+ }
+
+ // Returns a vector of the |num_hashes| indices corresponding to |data|, without updating the
+ // sketch.
+ [[nodiscard]] std::vector<size_t> GetCellIndices(const std::string& data) const {
+ std::vector<size_t> indices;
+ for (size_t i = 0; i < num_hashes_; ++i) {
+ indices.push_back(GetSketchCell(data.data(), data.size(), i));
+ }
+ return indices;
+ }
+
+ // Gets the estimated count for the specified |data|.
+ [[nodiscard]] CountType GetCount(const std::string& data) const {
+ return GetCount(data.data(), data.size());
+ }
+
+ // Increments the value at |cell_index| by |count|. Returns an error status if |cell_index| is not
+ // a valid cell index.
+ util::Status IncrementCell(size_t cell_index, CountType count) {
+ if (cell_index >= cells_.size()) {
+ return util::Status(util::OUT_OF_RANGE, "cell index is out of range.");
+ }
+ cells_[cell_index] += count;
+ return util::Status::OK;
+ }
+
+ // Gets the value of the sketch cell with index |cell_index|. Returns an error status if
+ // |cell_index| is not a valid cell index.
+ [[nodiscard]] lib::statusor::StatusOr<CountType> GetCellValue(size_t cell_index) const {
+ if (cell_index >= cells_.size()) {
+ return util::Status(util::OUT_OF_RANGE, "cell index is out of range.");
+ }
+ return cells_[cell_index];
+ }
+
+ private:
+ CountMin() = default;
+
+ // Sets the dimensions of the sketch and sets |cells_| to a zero vector of length
+ // |num_cells_per_hash| * |num_hashes|.
+ void initialize_with_zeros(size_t num_cells_per_hash, size_t num_hashes) {
+ num_cells_per_hash_ = num_cells_per_hash;
+ num_hashes_ = num_hashes;
+ cells_ = std::vector<CountType>(num_cells_per_hash * num_hashes);
+ }
+
+ // Sets the dimensions of the sketch and sets |cells_| to the provided vector |cells|. Returns an
+ // error status if the size of |cells| is not equal to |num_cells_per_hash| * |num_hashes|.
+ util::Status initialize_with_cells(size_t num_cells_per_hash, size_t num_hashes,
+ std::vector<CountType> cells) {
+ if (cells.size() != num_cells_per_hash * num_hashes) {
+ return util::Status(util::INVALID_ARGUMENT,
+ "number of cells is not compatible with sketch dimensions.");
+ }
+ num_cells_per_hash_ = num_cells_per_hash;
+ num_hashes_ = num_hashes;
+ cells_ = std::move(cells);
+ return util::Status::OK;
+ }
+
+ // Increments the number of observations of |data| which has length |len| by |count|.
+ void Increment(const char* data, size_t len, CountType count) {
+ for (size_t i = 0; i < num_hashes_; ++i) {
+ cells_[GetSketchCell(data, len, i)] += count;
+ }
+ }
+
+ // Gets the estimated count for the specified |data| which has length |len|.
+ [[nodiscard]] CountType GetCount(const char* data, size_t len) const {
+ CountType min_count = 0;
+ for (size_t i = 0; i < num_hashes_; ++i) {
+ CountType count = cells_[GetSketchCell(data, len, i)];
+ if (count < min_count || i == 0) {
+ min_count = count;
+ }
+ }
+ return min_count;
+ }
+
+ // Returns the index in the sketch cells of the |data| of length |len| for hash function
+ // |hash_index|.
+ size_t GetSketchCell(const char* data, size_t len, size_t hash_index) const {
+ return hash_index * num_cells_per_hash_ +
+ TruncatedDigest(reinterpret_cast<const uint8_t*>(data), len, hash_index, num_hashes_);
+ }
+
+ size_t num_cells_per_hash_;
+ size_t num_hashes_;
+ std::vector<CountType> cells_;
+};
+
+} // namespace cobalt
+#endif // COBALT_SRC_ALGORITHMS_PRIVACY_COUNT_MIN_H_
diff --git a/src/algorithms/privacy/count_min_test.cc b/src/algorithms/privacy/count_min_test.cc
new file mode 100644
index 0000000..7e74b02
--- /dev/null
+++ b/src/algorithms/privacy/count_min_test.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/algorithms/privacy/count_min.h"
+
+#include <gtest/gtest.h>
+
+#include "src/lib/statusor/status_macros.h"
+
+namespace cobalt {
+
+TEST(CountMin, IncrementStringAndGetCount) {
+ std::vector<std::pair<std::string, uint64_t>> test_cases = {
+ {"Hello World", 100},
+ {"Hallo Welt", 20},
+ {"Hola Mundo", 73},
+ {"Bonjour, Monde", 22},
+ };
+ auto count_min = CountMin<uint64_t>::MakeSketch(2000, 10);
+ for (auto [data, count] : test_cases) {
+ count_min.Increment(data, count);
+ }
+
+ for (auto [data, count] : test_cases) {
+ EXPECT_EQ(count, count_min.GetCount(data));
+ }
+}
+
+TEST(CountMin, IncrementCellAndGetValue) {
+ size_t num_cells_per_hash = 3;
+ size_t num_hashes = 2;
+ auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+ ASSERT_EQ(count_min.size(), num_cells_per_hash * num_hashes);
+
+ std::vector<uint64_t> expected_sketch;
+ for (size_t i = 0; i < count_min.size(); ++i) {
+ auto count = static_cast<uint64_t>(i);
+ count_min.IncrementCell(i, count);
+ count_min.IncrementCell(i, count);
+ expected_sketch.push_back(2 * count);
+ }
+
+ for (size_t i = 0; i < count_min.size(); ++i) {
+ CB_ASSERT_OK_AND_ASSIGN(uint64_t cell_value, count_min.GetCellValue(i));
+ EXPECT_EQ(expected_sketch[i], cell_value);
+ }
+}
+
+TEST(CountMin, IncrementCellOutOfRange) {
+ auto count_min = CountMin<uint64_t>::MakeSketch(3, 2);
+ EXPECT_FALSE(count_min.IncrementCell(count_min.size(), 1).ok());
+}
+
+TEST(CountMin, GetCellValueOutOfRange) {
+ auto count_min = CountMin<uint64_t>::MakeSketch(3, 2);
+ EXPECT_FALSE(count_min.GetCellValue(count_min.size()).ok());
+}
+
+TEST(CountMin, MakeSketchFromCells) {
+ size_t num_cells_per_hash = 3;
+ size_t num_hashes = 2;
+ uint64_t expected_cell_value = 10;
+ std::vector<uint64_t> cells(num_cells_per_hash * num_hashes, expected_cell_value);
+ CB_ASSERT_OK_AND_ASSIGN(auto count_min, CountMin<uint64_t>::MakeSketchFromCells(
+ num_cells_per_hash, num_hashes, cells));
+
+ for (size_t i = 0; i < count_min.size(); ++i) {
+ CB_ASSERT_OK_AND_ASSIGN(uint64_t cell_value, count_min.GetCellValue(i));
+ EXPECT_EQ(expected_cell_value, cell_value);
+ }
+}
+
+TEST(CountMin, MakeSketchFromCellsWrongDimensions) {
+ size_t num_cells_per_hash = 3;
+ size_t num_hashes = 2;
+ uint64_t expected_cell_value = 10;
+ std::vector<uint64_t> cells(num_cells_per_hash * num_hashes + 1, expected_cell_value);
+ EXPECT_FALSE(CountMin<uint64_t>::MakeSketchFromCells(num_cells_per_hash, num_hashes, cells).ok());
+}
+
+TEST(CountMin, GetCellIndices) {
+ size_t num_cells_per_hash = 3;
+ size_t num_hashes = 2;
+ auto count_min = CountMin<uint64_t>::MakeSketch(num_cells_per_hash, num_hashes);
+
+ std::string data = "Hello World";
+ count_min.Increment(data, 1);
+
+ std::vector<size_t> updated_indices;
+ for (size_t i = 0; i < count_min.size(); ++i) {
+ CB_ASSERT_OK_AND_ASSIGN(uint64_t count, count_min.GetCellValue(i));
+ if (count != 0) {
+ updated_indices.push_back(i);
+ }
+ }
+ ASSERT_EQ(updated_indices.size(), num_hashes);
+ EXPECT_EQ(updated_indices, count_min.GetCellIndices(data));
+}
+
+} // namespace cobalt
diff --git a/src/algorithms/experimental/hash.cc b/src/algorithms/privacy/hash.cc
similarity index 78%
rename from src/algorithms/experimental/hash.cc
rename to src/algorithms/privacy/hash.cc
index 71521bc..4bd9892 100644
--- a/src/algorithms/experimental/hash.cc
+++ b/src/algorithms/privacy/hash.cc
@@ -2,11 +2,11 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "src/algorithms/experimental/hash.h"
+#include "src/algorithms/privacy/hash.h"
#include <vector>
-#include <openssl/sha.h>
+#include "third_party/github.com/google/farmhash/src/farmhash.h"
namespace cobalt {
@@ -18,16 +18,11 @@
}
} // namespace
-const size_t SHA256_SIZE = 32;
-
uint64_t Hash64WithSeed(const uint8_t* data, size_t len, uint64_t seed) {
- // TODO(azani): Replace hash function with FarmHash.
std::vector<uint8_t> seeded_data(sizeof(uint64_t) + len);
*(reinterpret_cast<uint64_t*>(seeded_data.data())) = seed;
std::copy(data, data + len, seeded_data.data() + sizeof(uint64_t));
- uint8_t sha256_digest[SHA256_SIZE];
- SHA256(seeded_data.data(), seeded_data.size(), sha256_digest);
- return (*reinterpret_cast<uint64_t*>(sha256_digest));
+ return farmhash::Fingerprint64(reinterpret_cast<char*>(seeded_data.data()), seeded_data.size());
}
uint64_t Hash64WithSeed(const std::string& data, uint64_t seed) {
diff --git a/src/algorithms/experimental/hash.h b/src/algorithms/privacy/hash.h
similarity index 83%
rename from src/algorithms/experimental/hash.h
rename to src/algorithms/privacy/hash.h
index 77a7d3b..20df010 100644
--- a/src/algorithms/experimental/hash.h
+++ b/src/algorithms/privacy/hash.h
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#ifndef COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HASH_H_
-#define COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HASH_H_
+#ifndef COBALT_SRC_ALGORITHMS_PRIVACY_HASH_H_
+#define COBALT_SRC_ALGORITHMS_PRIVACY_HASH_H_
#include <cstdint>
#include <string>
@@ -26,4 +26,4 @@
size_t TruncatedDigest(const std::string& data, uint64_t seed, size_t max);
} // namespace cobalt
-#endif // COBALT_SRC_ALGORITHMS_EXPERIMENTAL_HASH_H_
+#endif // COBALT_SRC_ALGORITHMS_PRIVACY_HASH_H_
diff --git a/src/algorithms/privacy/hash_test.cc b/src/algorithms/privacy/hash_test.cc
new file mode 100644
index 0000000..b2183ef
--- /dev/null
+++ b/src/algorithms/privacy/hash_test.cc
@@ -0,0 +1,61 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/algorithms/privacy/hash.h"
+
+#include <gtest/gtest.h>
+
+namespace cobalt {
+
+TEST(HashTest, Hash64WithSeedStableHash) {
+ EXPECT_EQ(0x6817bff2c1a85fb3u, Hash64WithSeed("hello world", 1));
+ EXPECT_EQ(0x7304bac4b22251f5u, Hash64WithSeed("hi there earth", 1));
+ EXPECT_EQ(0x9b9bbae50eb096d4u, Hash64WithSeed("bonjour monde", 1));
+ EXPECT_EQ(0x61cb7a20b37ba347u, Hash64WithSeed("hallo Welt", 1));
+}
+
+TEST(HashTest, Hash64WithSeedDifferentSeeds) {
+ auto hash0 = Hash64WithSeed("hello world", 0);
+ auto hash1 = Hash64WithSeed("hello world", 1);
+ EXPECT_NE(hash0, hash1);
+}
+
+TEST(HashTest, Hash64WithSeedDifferentFirstLetter) {
+ auto hash0 = Hash64WithSeed("hello world", 1);
+ auto hash1 = Hash64WithSeed("iello world", 1);
+ EXPECT_NE(hash0, hash1);
+}
+
+TEST(HashTest, Hash64WithSeedDifferentLastLetter) {
+ auto hash0 = Hash64WithSeed("hello world", 1);
+ auto hash1 = Hash64WithSeed("hello worle", 1);
+ EXPECT_NE(hash0, hash1);
+}
+
+TEST(HashTest, TruncatedDigestStableDigest) {
+ EXPECT_EQ(15u, TruncatedDigest("hello world", 1, 100));
+ EXPECT_EQ(69u, TruncatedDigest("hi there earth", 1, 100));
+ EXPECT_EQ(52u, TruncatedDigest("bonjour monde", 1, 100));
+ EXPECT_EQ(95u, TruncatedDigest("hallo Welt", 1, 100));
+}
+
+TEST(HashTest, TruncatedDigestDifferentSeeds) {
+ auto digest0 = TruncatedDigest("hello world", 0, 100);
+ auto digest1 = TruncatedDigest("hello world", 1, 100);
+ EXPECT_NE(digest0, digest1);
+}
+
+TEST(HashTest, TruncatedDigestDifferentFirstLetter) {
+ auto digest0 = TruncatedDigest("hello world", 1, 100);
+ auto digest1 = TruncatedDigest("iello world", 1, 100);
+ EXPECT_NE(digest0, digest1);
+}
+
+TEST(HashTest, TruncatedDigestDifferentLastLetter) {
+ auto digest0 = TruncatedDigest("hello world", 1, 100);
+ auto digest1 = TruncatedDigest("hello worle", 1, 100);
+ EXPECT_NE(digest0, digest1);
+}
+
+} // namespace cobalt
diff --git a/src/lib/statusor/status_macros.h b/src/lib/statusor/status_macros.h
index f63ee58..fd4ef4a 100644
--- a/src/lib/statusor/status_macros.h
+++ b/src/lib/statusor/status_macros.h
@@ -181,10 +181,12 @@
CB_ASSERT_OK_AND_ASSIGN_IMPL(CB_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \
rexpr);
-#define CB_ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
- auto statusor = (rexpr); /* NOLINT */ \
- ASSERT_TRUE(statusor.status().ok()) << statusor.status(); /* NOLINT */ \
- lhs = std::move(statusor.ValueOrDie()) /* NOLINT */
+#define CB_ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
+ auto statusor = (rexpr); /* NOLINT */ \
+ ASSERT_TRUE(statusor.status().ok()) /*NOLINT*/ \
+ << "status:" << statusor.status().error_code() << ", " /*NOLINT*/ \
+ << statusor.status().error_message(); /* NOLINT */ \
+ lhs = std::move(statusor.ValueOrDie()) /* NOLINT */
#define CB_STATUS_MACROS_CONCAT_NAME(x, y) CB_STATUS_MACROS_CONCAT_IMPL(x, y)
#define CB_STATUS_MACROS_CONCAT_IMPL(x, y) x##y