blob: 5220cad6c272dcd10f01145e588431b7043346c0 [file] [log] [blame]
//
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "algorithms/distributions.h"
#include <cmath>
#include <cstdlib>
#include <limits>
#include <memory>
#include <unordered_map>
#include "base/testing/status_matchers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_replace.h"
#include "algorithms/numerical-mechanisms-testing.h"
#include "algorithms/util.h"
namespace differential_privacy {
namespace internal {
namespace {
using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::Lt;
using ::differential_privacy::base::testing::StatusIs;
constexpr int64_t kNumSamples = 10000000;
constexpr int64_t kNumGeometricSamples = 1000000;
constexpr int64_t kGaussianSamples = 1000000;
constexpr double kOneOverLog2 = 1.44269504089;
constexpr double kNan = std::numeric_limits<double>::quiet_NaN();
constexpr double kInfinity = std::numeric_limits<double>::infinity();
double Skew(const std::vector<double>& samples, double mu, double sigma) {
double skew = std::accumulate(
samples.begin(), samples.end(), 0.0, [&mu](double lhs, double rhs) {
return lhs + (rhs - mu) * (rhs - mu) * (rhs - mu);
});
return skew / (samples.size() * sigma * sigma * sigma);
}
double Kurtosis(const std::vector<double>& samples, double mu, double var) {
double kurt = std::accumulate(samples.begin(), samples.end(), 0.0,
[mu](double lhs, double rhs) {
double m4 = (rhs - mu) * (rhs - mu);
m4 *= m4;
return lhs + m4;
});
int64_t n = samples.size();
kurt = (n + 1) * kurt / (n * var * var);
kurt -= 3 * (n - 1);
kurt *= static_cast<double>(n - 1) / (n - 2) / (n - 3);
return kurt;
}
TEST(LaplaceDistributionTest, ParameterValidation) {
LaplaceDistribution::Builder builder;
EXPECT_THAT(builder.SetEpsilon(-1).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be finite and positive")));
EXPECT_THAT(builder.SetEpsilon(0).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be finite and positive")));
EXPECT_THAT(builder.SetEpsilon(kNan).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be a valid numeric value")));
EXPECT_THAT(builder.SetEpsilon(kInfinity).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be finite and positive")));
builder.SetEpsilon(1);
EXPECT_THAT(builder.SetSensitivity(-1).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Sensitivity must be finite and positive")));
EXPECT_THAT(builder.SetSensitivity(0).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Sensitivity must be finite and positive")));
EXPECT_THAT(builder.SetSensitivity(kNan).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Sensitivity must be a valid numeric value")));
EXPECT_THAT(builder.SetSensitivity(kInfinity).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Sensitivity must be finite and positive")));
}
TEST(LaplaceDistributionTest, CheckStatisticsForGeoUnitValues) {
LaplaceDistribution::Builder builder;
std::unique_ptr<LaplaceDistribution> dist =
builder.SetEpsilon(1.0).SetSensitivity(1.0).Build().value();
std::vector<double> samples(kNumGeometricSamples);
std::generate(samples.begin(), samples.end(),
[dist = std::move(dist)]() { return dist->Sample(); });
double mean = Mean(samples);
double var = Variance(samples);
EXPECT_NEAR(0.0, mean, 0.01);
EXPECT_NEAR(2.0, var, 0.1);
EXPECT_NEAR(0.0, Skew(samples, mean, std::sqrt(var)), 0.1);
EXPECT_NEAR(3.0, Kurtosis(samples, mean, var), 0.1);
}
TEST(LaplaceDistributionTest, CheckStatisticsForGeoSpecificDistribution) {
double sensitivity = kOneOverLog2;
LaplaceDistribution::Builder builder;
std::unique_ptr<LaplaceDistribution> dist =
builder.SetEpsilon(1.0).SetSensitivity(sensitivity).Build().value();
std::vector<double> samples(kNumGeometricSamples);
std::generate(samples.begin(), samples.end(),
[dist = std::move(dist)]() { return dist->Sample(); });
double mean = Mean(samples);
double var = Variance(samples);
EXPECT_NEAR(0.0, mean, 0.01);
EXPECT_NEAR(2.0 * sensitivity * sensitivity, var, 0.1);
EXPECT_NEAR(0.0, Skew(samples, mean, std::sqrt(var)), 0.1);
EXPECT_NEAR(3.0, Kurtosis(samples, mean, var), 0.1);
}
TEST(LaplaceDistributionTest, Cdf) {
EXPECT_EQ(LaplaceDistribution::cdf(5, 0), .5);
EXPECT_EQ(LaplaceDistribution::cdf(1, -1), .5 * std::exp(-1));
EXPECT_EQ(LaplaceDistribution::cdf(1, 1), 1 - .5 * std::exp(-1));
}
TEST(LaplaceDistributionTest, GetVarianceReturnsExpectedValue) {
// Use epsilon = 1 and l1 sensitivity = 2 => 2 * 2^2 = 8
absl::StatusOr<std::unique_ptr<LaplaceDistribution>> laplace_distribution =
LaplaceDistribution::Builder().SetEpsilon(1).SetSensitivity(2).Build();
ASSERT_OK(laplace_distribution);
EXPECT_NEAR(laplace_distribution->get()->GetVariance(), 8, 1e-6);
}
TEST(LaplaceCalculateGranularityTest, InvalidParameters) {
const double valid_param = kOneOverLog2;
std::vector<double> invalid_params = {0, -0.1, -1, -10, kInfinity};
for (double invalid_param : invalid_params) {
EXPECT_THAT(
LaplaceDistribution::CalculateGranularity(invalid_param, valid_param),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be finite and positive")));
EXPECT_THAT(
LaplaceDistribution::CalculateGranularity(valid_param, invalid_param),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Sensitivity must be finite and positive")));
}
// Test NaN
EXPECT_THAT(LaplaceDistribution::CalculateGranularity(kNan, valid_param),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be a valid numeric value")));
EXPECT_THAT(LaplaceDistribution::CalculateGranularity(valid_param, kNan),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Sensitivity must be a valid numeric value")));
// Test epsilon < 2^-50. See (broken link)
EXPECT_THAT(LaplaceDistribution::CalculateGranularity(
1.0 / (int64_t{1} << 51), valid_param),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Epsilon must be at least 2^-50")));
EXPECT_THAT(LaplaceDistribution::CalculateGranularity(
1.0 / (int64_t{1} << 50), valid_param),
StatusIs(absl::StatusCode::kOk));
}
TEST(LaplaceDistributionTest, GranularityIsBetweenZeroAndDiversity) {
const double epsilon = 1.1;
const double sensitivity = 2.0;
absl::StatusOr<std::unique_ptr<LaplaceDistribution>> ld =
LaplaceDistribution::Builder()
.SetEpsilon(epsilon)
.SetSensitivity(sensitivity)
.Build();
ASSERT_OK(ld);
const double granularity = ld.value()->GetGranularity();
EXPECT_THAT(granularity, Gt(0));
const double diversity = sensitivity / epsilon;
EXPECT_THAT(granularity, Lt(diversity));
}
TEST(GaussDistributionTest, ParameterValidation) {
GaussianDistribution::Builder builder;
EXPECT_THAT(
builder.SetStddev(-1).Build(),
StatusIs(
absl::StatusCode::kInvalidArgument,
HasSubstr("Standard deviation must be finite and non-negative")));
EXPECT_THAT(
builder.SetStddev(kNan).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Standard deviation must be a valid numeric value")));
EXPECT_THAT(
builder.SetStddev(kInfinity).Build(),
StatusIs(
absl::StatusCode::kInvalidArgument,
HasSubstr("Standard deviation must be finite and non-negative")));
EXPECT_OK(builder.SetStddev(0).Build());
}
TEST(GaussDistributionTest, CheckStatisticsForUnitValues) {
GaussianDistribution::Builder builder;
std::unique_ptr<GaussianDistribution> dist =
builder.SetStddev(1.0).Build().value();
std::vector<double> samples(kGaussianSamples);
std::generate(samples.begin(), samples.end(),
[&dist]() { return dist->Sample(); });
EXPECT_NEAR(0.0, Mean(samples), 0.01);
EXPECT_NEAR(1.0, Variance(samples), 0.1);
}
TEST(GaussDistributionTest, CheckStatisticsForSpecificDistribution) {
double stddev = kOneOverLog2;
GaussianDistribution::Builder builder;
std::unique_ptr<GaussianDistribution> dist =
builder.SetStddev(stddev).Build().value();
std::vector<double> samples(kGaussianSamples);
std::generate(samples.begin(), samples.end(),
[&dist]() { return dist->Sample(); });
EXPECT_NEAR(0.0, Mean(samples), 0.01);
EXPECT_NEAR(stddev * stddev, Variance(samples), 0.1);
}
TEST(GaussDistributionTest, CheckStatisticsForSpecificScaledDistribution) {
double stddev = kOneOverLog2;
double scale = 3.0;
GaussianDistribution::Builder builder;
std::unique_ptr<GaussianDistribution> dist =
builder.SetStddev(stddev).Build().value();
std::vector<double> samples(kGaussianSamples);
std::generate(samples.begin(), samples.end(),
[&dist, scale]() { return dist->Sample(scale); });
EXPECT_NEAR(0.0, Mean(samples), 0.01 * scale);
EXPECT_NEAR(stddev * stddev * scale * scale, Variance(samples), 0.1 * scale);
}
TEST(GaussDistributionTest, StandardDeviationGetter) {
double stddev = kOneOverLog2;
GaussianDistribution::Builder builder;
std::unique_ptr<GaussianDistribution> dist =
builder.SetStddev(stddev).Build().value();
EXPECT_EQ(dist->Stddev(), stddev);
}
TEST(GaussDistributionTest, Cdf) {
EXPECT_NEAR(GaussianDistribution::cdf(1, 0), 0.5, 0.000001);
EXPECT_NEAR(GaussianDistribution::cdf(5, 0), 0.5, 0.000001);
EXPECT_NEAR(GaussianDistribution::cdf(1, -1), 0.15865525, 0.000001);
EXPECT_NEAR(GaussianDistribution::cdf(1, -5), 2.8665157e-7, 1e-13);
EXPECT_NEAR(GaussianDistribution::cdf(1, -9), 1.1285884e-19, 1e-25);
EXPECT_NEAR(GaussianDistribution::cdf(5, -1), 0.42074029, 0.000001);
EXPECT_NEAR(GaussianDistribution::cdf(1, 1), 0.84134475, 0.000001);
EXPECT_NEAR(GaussianDistribution::cdf(5, 1), 0.57925971, 0.000001);
EXPECT_DEATH(GaussianDistribution::cdf(0, 1), "");
EXPECT_DEATH(GaussianDistribution::cdf(-1, 1), "");
}
TEST(GaussDistributionTest, Quantile) {
EXPECT_NEAR(GaussianDistribution::Quantile(1, 0.5), 0, 0.000001);
EXPECT_NEAR(GaussianDistribution::Quantile(5, 0.5), 0, 0.000001);
EXPECT_NEAR(GaussianDistribution::Quantile(1, 0.15865525), -1, 0.000001);
EXPECT_NEAR(GaussianDistribution::Quantile(1, 2.8665157e-7), -5, 0.000001);
// No test for 1.1285884e-19 as the precision is too low, and it evaluates to
// +inf.
EXPECT_NEAR(GaussianDistribution::Quantile(5, 0.42074029), -1, 0.000001);
EXPECT_NEAR(GaussianDistribution::Quantile(1, 0.84134475), 1, 0.000001);
EXPECT_NEAR(GaussianDistribution::Quantile(5, 0.57925971), 1, 0.000001);
EXPECT_DEATH(GaussianDistribution::Quantile(0, 1), "");
EXPECT_DEATH(GaussianDistribution::Quantile(-1, 1), "");
}
TEST(GeometricDistributionTest, ParameterValidation) {
GeometricDistribution::Builder builder;
EXPECT_THAT(builder.SetLambda(-1).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Lambda must be finite and non-negative")));
EXPECT_THAT(builder.SetLambda(kNan).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Lambda must be a valid numeric value")));
EXPECT_THAT(builder.SetLambda(kInfinity).Build(),
StatusIs(absl::StatusCode::kInvalidArgument,
HasSubstr("Lambda must be finite and non-negative")));
EXPECT_OK(builder.SetLambda(0).Build());
}
TEST(GeometricDistributionTest, SmallProbabilityStats) {
GeometricDistribution::Builder builder;
std::unique_ptr<GeometricDistribution> dist =
builder.SetLambda(-1.0 * std::log(1.0 - 1e-6)).Build().value();
std::vector<int64_t> samples(kNumGeometricSamples);
std::generate(samples.begin(), samples.end(),
[dist = std::move(dist)]() { return dist->Sample() + 1; });
EXPECT_NEAR(1000000, Mean(samples), 10000);
EXPECT_NEAR(999999.5, std::sqrt(Variance(samples)), 10000);
}
TEST(GeometricDistributionTest, LargeProbabilityStats) {
GeometricDistribution::Builder builder;
std::unique_ptr<GeometricDistribution> dist =
builder.SetLambda(-1.0 * std::log(1.0 - 0.5)).Build().value();
std::vector<int64_t> samples(kNumGeometricSamples);
std::generate(samples.begin(), samples.end(),
[dist = std::move(dist)]() { return dist->Sample() + 1; });
EXPECT_NEAR(2, Mean(samples), 0.01);
EXPECT_NEAR(std::sqrt(2), std::sqrt(Variance(samples)), 0.05);
}
TEST(GeometricDistributionTest, Ratios) {
double p = 1e-2;
GeometricDistribution::Builder builder;
std::unique_ptr<GeometricDistribution> dist =
builder.SetLambda(-1.0 * std::log(1.0 - p)).Build().value();
std::vector<int64_t> counts(51, 0);
for (int i = 0; i < kNumGeometricSamples; ++i) {
int64_t sample = dist->Sample();
if (sample < counts.size()) {
++counts[sample];
}
}
std::vector<double> ratios;
for (int i = 0; i < counts.size() - 1; ++i) {
ratios.push_back(static_cast<double>(counts[i + 1]) /
static_cast<double>(counts[i]));
}
EXPECT_NEAR(p, Mean(ratios), p / 1e-2);
}
// For Binomial/Poisson RVs this is mult standard deviations since var= mean.
// Probability of failure with mult = 7 ~1e-23 if Gaussian approx holds, but it
// does not for low values of x, so we have to add another fudge factor.
// Individual sample failure probability ~1e-13 and we test a few million
// samples overall so the overall flakiness of these tests is < 1e-6.
double AllowedError(int x, double mult = 7) {
return (std::sqrt(x) + std::log1p(x)) * mult;
}
// Generates a std::string of the first N values from counts.
template <size_t N, typename T>
std::string FirstN(const std::vector<T>& counts) {
auto spans = absl::MakeConstSpan(counts);
if (spans.empty()) {
return "<empty>";
} else if (N < spans.size()) {
spans = spans.first(N);
}
return absl::StrJoin(spans, ", ");
}
// Calculates the probability mass function of the Geometric
// distribution at an integer x:
//
// PMF(x) = (1-p)^x * p
//
double PMF(const double p, const int x) { return p * std::pow(1.0 - p, x); }
// Returns a list of the expected number of times a geometric distribution
// with parameter lambda will return each integer. Only returns integers
// for which the expected value is >= 2. Always returns the first 10 integers.
std::vector<double> ExpectedCounts(const double p, const double one_minus_p,
const int num_samples) {
std::vector<double> result;
int i = 0;
double expected_samples = PMF(p, i) * num_samples;
while (i < 10 && expected_samples >= 2) {
result.push_back(expected_samples);
++i;
expected_samples = PMF(p, i) * num_samples;
}
return result;
}
class GeometricDistributionTest : public ::testing::TestWithParam<double> {};
TEST_P(GeometricDistributionTest, Distribution) {
auto lambda = GetParam();
GeometricDistribution::Builder builder;
std::unique_ptr<GeometricDistribution> distribution =
builder.SetLambda(lambda).Build().value();
// Choose bucket sizes so that the expected count in the first bucket is
// 150.
const int64_t kBucketSize =
std::ceil(-std::log1p(-150.0 / kNumGeometricSamples) / lambda);
const int64_t kMaxValue = std::max(100.0, 100 / lambda);
const size_t num_buckets = (kMaxValue / kBucketSize) + 1;
std::vector<size_t> counts(num_buckets, 0);
for (int i = 0; i < kNumGeometricSamples; ++i) {
int64_t val = distribution->Sample() / kBucketSize;
// Probability of exceeding < exp(-100) * num_iterations;
ASSERT_GE(val, 0);
ASSERT_LE(val, kMaxValue);
if (val < kMaxValue) {
++counts[val];
}
}
double bucket_lambda = lambda * kBucketSize;
auto expected_buckets =
ExpectedCounts(-std::expm1(-bucket_lambda), std::exp(-bucket_lambda),
kNumGeometricSamples);
for (size_t i = 0; i < expected_buckets.size() && i < counts.size(); i++) {
ASSERT_NEAR(counts[i], expected_buckets[i],
AllowedError(expected_buckets[i]))
<< i;
}
// For small lambdas, try to get large enough buckets to have some
// statistically powerful results. Choose bucket sizes so that the expected
// probability in the first bucket is 0.2 which is a reasonably large
// fraction.
const int kBigBucketSize = std::ceil(-std::log1p(-0.2) / bucket_lambda);
// Otherwise the smaller buckets are themselves big enough;
if (kBigBucketSize > 10) {
bucket_lambda *= kBigBucketSize;
expected_buckets =
ExpectedCounts(-std::expm1(-bucket_lambda), std::exp(-bucket_lambda),
kNumGeometricSamples);
std::vector<int> sampled(counts.size() / kBigBucketSize + 1, 0);
for (int i = 0; i < counts.size(); ++i) {
sampled[i / kBigBucketSize] += counts[i];
}
for (size_t i = 0; i < expected_buckets.size() && i < sampled.size(); i++) {
EXPECT_NEAR(sampled[i], expected_buckets[i],
AllowedError(expected_buckets[i]))
<< i;
}
}
}
std::vector<double> GenParams() {
return std::vector<double>({
30,
3,
1e-2,
1e-3,
1e-4,
1e-5,
1e-7,
1e-12,
1e-15,
});
}
std::string ParamName(const ::testing::TestParamInfo<double>& info) {
const auto& p = info.param;
std::string name = absl::StrCat("L_", p);
return absl::StrReplaceAll(name, {{"-", "_"}, {".", "_"}});
}
INSTANTIATE_TEST_SUITE_P(All, GeometricDistributionTest,
::testing::ValuesIn(GenParams()), ParamName);
TEST(GeometricDistribution, ImpossibleDoubles) {
// Using std::geometric_distribution<int64_t> would fail this test, since it
// can't generate large odd values.
GeometricDistribution::Builder builder;
std::unique_ptr<GeometricDistribution> distribution =
builder.SetLambda(1e-15).Build().value();
double count = 0;
constexpr int kIter = 1000000;
for (int i = 0; i < kIter; ++i) {
int64_t val = distribution->Sample();
ASSERT_GE(val, 0);
if (val > (1LL << 53) && val % 2) {
++count;
}
}
// 61 ~ (exp(-2**53 * 1e-15))/2 * 1e6.
EXPECT_GT(count, 0);
}
} // namespace
} // namespace internal
} // namespace differential_privacy