blob: b83273cbab16f3d5441fb130538afcbd46c48c11 [file] [log] [blame]
//
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_
#define DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_
#include <fstream>
#include <functional>
#include <iostream>
#include "base/logging.h"
#include "google/protobuf/text_format.h"
#include "absl/container/flat_hash_map.h"
#include "absl/strings/str_cat.h"
#include "algorithms/rand.h"
namespace differential_privacy::testing {
// Sample a value from Laplace distribution, implemented with absl random
// generators (i.e. non-secure).
double SampleReferenceLaplacian(double mean, double variance,
SecureURBG* random);
// Generates number_of_samples samples from both sample_generator_a and
// sample_generator_b generators and decides whether these 2 sets of random
// samples were likely drawn from similar discrete distributions. See
// (broken link) for more information.
bool GenerateClosenessVote(std::function<double()> sample_generator_a,
std::function<double()> sample_generator_b,
int number_of_samples, double l2_tolerance,
double granularity);
template <typename T>
double ComputeApproximateDpTestValue(
absl::flat_hash_map<T, int64_t> histogram_a,
absl::flat_hash_map<T, int64_t> histogram_b, double epsilon,
int num_of_samples) {
double test_value = 0;
for (auto& [result, sample_count] : histogram_a) {
auto it_b = histogram_b.find(result);
if (it_b != histogram_b.end()) {
double sample_count_b = it_b->second;
test_value +=
std::max(0.0, (sample_count - std::exp(epsilon) * sample_count_b) /
num_of_samples);
} else {
test_value += sample_count / num_of_samples;
}
}
return test_value;
}
template <typename T>
absl::flat_hash_map<T, int64_t> BuildHistogram(const std::vector<T>& samples) {
absl::flat_hash_map<T, int64_t> histogram;
for (const T& sample : samples) ++histogram[sample];
return histogram;
}
// Decides whether two sets of random samples were likely drawn from a pair of
// discrete distributions that approximately satisfy (ε,δ) differential privacy.
//
// The two distributions are considered to be (ε,δ) differentially private if
// the likelihood of any event with respect to the first distribution is at most
// δ plus e^ε times the likelihood of the same event in the second distribution
// and vice versa. Moreover, the distributions are considered approximately
// (ε,δ) differentially private if there exists a δ' such that the distributions
// are (ε,δ') differentially private and |δ' - δ| is less than half of a given
// tolerance α. Otherwise if no δ' exists such that |δ' - δ| is less than α, the
// distributions are not considered approximately (ε, δ) differentially private.
// Assuming that α > (m / n)^0.5 * (1 + e^(2 * ε)), the error probability is at
// most (1 + e^(2 * ε)) / (n * (α - (m / n)^0.5 * (1 + e^(2 * ε)))^2), where m
// is the size of the support of the distributions and n is the expected value
// of a Poisson distribution from which the number of samples is drawn. See
// (broken link) for more information.
template <typename T>
bool VerifyApproximateDp(const std::vector<T>& samples_a,
const std::vector<T>& samples_b, double epsilon,
double delta, double delta_tolerance) {
DCHECK(samples_a.size() == samples_b.size())
<< "The sample sets must be of equal size.";
DCHECK(!samples_a.empty()) << "The sample sets must not be empty";
DCHECK(delta_tolerance > 0) << "The delta tolerance must be positive";
DCHECK(delta_tolerance < 1) << "The delta tolerance should be less than 1";
DCHECK(epsilon >= 0) << "Epsilon must not be negative";
DCHECK(delta >= 0) << "Delta must not be negative";
DCHECK(delta < 1) << "Delta should be less than 1";
absl::flat_hash_map<T, int64_t> histogram_a = BuildHistogram(samples_a);
absl::flat_hash_map<T, int64_t> histogram_b = BuildHistogram(samples_b);
double test_value_a = ComputeApproximateDpTestValue(
histogram_a, histogram_b, epsilon, samples_a.size());
double test_value_b = ComputeApproximateDpTestValue(
histogram_b, histogram_a, epsilon, samples_b.size());
return test_value_a < delta + delta_tolerance &&
test_value_b < delta + delta_tolerance;
}
// Generates number_of_samples samples from both sample_generator_a and
// sample_generator_b generators and decides whether two sets of random samples
// were likely drawn from a pair of discrete distributions that approximately
// satisfy (ε,δ) differential privacy. See (broken link) for more
// information. The test will fail if this pair of samples do not satisfy
// (ε,δ + delta_tolerance) differential privacy.
bool GenerateApproximateDpVote(std::function<double()> sample_generator_a,
std::function<double()> sample_generator_b,
int number_of_samples, double epsilon,
double delta, double delta_tolerance,
double granularity);
// Generates number_of_votes of votes from vote_generator to determine a
// majority. Stops early as soon as the majority is clear. Returns the majority.
bool RunBallot(std::function<bool()> vote_generator, int number_of_votes);
template <typename T>
std::optional<T> ReadProto(std::istream* proto_file) {
T tests;
std::string serialized_protobuf;
std::string line;
while (getline(*proto_file, line)) {
absl::StrAppend(&serialized_protobuf, line, "\n");
}
*proto_file >> serialized_protobuf;
if (!google::protobuf::TextFormat::ParseFromString(serialized_protobuf, &tests)) {
return std::optional<T>();
}
return tests;
}
template <typename T>
std::optional<T> ReadProto(const std::string& path) {
std::ifstream proto_file(path);
if (!proto_file.is_open()) {
return std::optional<T>();
}
return ReadProto<T>(&proto_file);
}
// Partitions the interval between lower and upper into num_buckets subintervals
// of equal size and return the index (from 0 to num_buckets - 1) of the
// subinterval that contains the specified sample.
// Samples outside the bounds will be assigned to the lowest or highest bin as
// appropriate. Samples that are exactly equal to bin boundaries will be
// assigned to the higher bin.
int Bucketize(double sample, double lower, double upper, int num_buckets);
} // namespace differential_privacy::testing
#endif // DIFFERENTIAL_PRIVACY_CPP_TESTING_STATISTICAL_TESTS_UTILS_H_