blob: 189f2a807b2e789b711d2b8a1a57667112a7a4d9 [file] [log] [blame]
//
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_PARTITION_SELECTION_H_
#define DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_PARTITION_SELECTION_H_
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include "base/statusor.h"
#include "algorithms/distributions.h"
#include "algorithms/numerical-mechanisms.h"
#include "algorithms/rand.h"
#include "algorithms/util.h"
#include "base/status_macros.h"
namespace differential_privacy {
// Provides a common abstraction for PartitionSelectionStrategy. Each partition
// selection strategy class has a builder with which it can be instantiated, and
// calling ShouldKeep will return true if a partition with the given number of
// users should be kept based on the values the partition selection strategy
// was instantiated with (while ShouldKeep will return false if the partition
// should have been dropped).
class PartitionSelectionStrategy {
public:
// Builder base class
class Builder {
public:
virtual ~Builder() = default;
Builder& SetEpsilon(double epsilon) {
epsilon_ = epsilon;
return *this;
}
Builder& SetDelta(double delta) {
delta_ = delta;
return *this;
}
Builder& SetMaxPartitionsContributed(int64_t max_partitions_contributed) {
max_partitions_contributed_ = max_partitions_contributed;
return *this;
}
virtual base::StatusOr<std::unique_ptr<PartitionSelectionStrategy>>
Build() = 0;
protected:
// Convenience methods to check if the Builder variables are set & valid
absl::Status EpsilonIsSetAndValid() {
return PartitionSelectionStrategy::EpsilonIsSetAndValid(epsilon_);
}
absl::Status DeltaIsSetAndValid() {
return PartitionSelectionStrategy::DeltaIsSetAndValid(delta_);
}
absl::Status MaxPartitionsContributedIsSetAndValid() {
return PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed_);
}
absl::optional<double> GetEpsilon() { return epsilon_; }
absl::optional<double> GetDelta() { return delta_; }
absl::optional<int64_t> GetMaxPartitionsContributed() {
return max_partitions_contributed_;
}
absl::optional<double> delta_;
private:
absl::optional<double> epsilon_;
absl::optional<int64_t> max_partitions_contributed_;
};
virtual ~PartitionSelectionStrategy() = default;
double GetEpsilon() const { return epsilon_; }
double GetDelta() const { return delta_; }
int64_t GetMaxPartitionsContributed() const {
return max_partitions_contributed_;
}
// ShouldKeep returns true when a partition with a given number of users
// should be kept and false otherwise.
virtual bool ShouldKeep(double num_users) = 0;
protected:
PartitionSelectionStrategy(double epsilon, double delta,
int64_t max_partitions_contributed,
double adjusted_delta)
: epsilon_(epsilon),
delta_(delta),
max_partitions_contributed_(max_partitions_contributed),
adjusted_delta_(adjusted_delta) {}
// Checks if epsilon is set and valid.
static absl::Status EpsilonIsSetAndValid(absl::optional<double> epsilon) {
RETURN_IF_ERROR(ValidateIsFiniteAndPositive(epsilon, "Epsilon"));
return absl::OkStatus();
}
// Checks if delta is set and valid.
static absl::Status DeltaIsSetAndValid(absl::optional<double> delta) {
RETURN_IF_ERROR(ValidateIsInInclusiveInterval(delta, 0, 1, "Delta"));
return absl::OkStatus();
}
// Checks if the max number of partitions contributed to is set and valid.
static absl::Status MaxPartitionsContributedIsSetAndValid(
absl::optional<int64_t> max_partitions_contributed) {
RETURN_IF_ERROR(ValidateIsPositive(
max_partitions_contributed,
"Max number of partitions a user can contribute to"));
return absl::OkStatus();
}
double GetAdjustedDelta() const { return adjusted_delta_; }
// We must derive an adjusted delta, to be used as the probability of keeping
// a single partition with one user, from delta, the probability we keep any
// of the partitions contributed to by a single user. Since the probability
// we drop a partition with a single user is 1 - adjusted_delta_, and raising
// this expression to the power of the max number of partitions one user can
// contribute to will get us delta, we can solve to get the following formula.
static base::StatusOr<double> CalculateAdjustedDelta(
double delta, int64_t max_partitions_contributed) {
RETURN_IF_ERROR(PartitionSelectionStrategy::DeltaIsSetAndValid(delta));
RETURN_IF_ERROR(
PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed));
// Numerically stable equivalent of
// 1- pow(1 - delta, 1 / max_partitions_contributed).
if (delta == 1) { // Avoid NaN from log1p(-1) -> log(0)
return 1;
}
return -expm1(log1p(-delta) / max_partitions_contributed);
}
// Inverse of CalculateAdjustedDelta()
static base::StatusOr<double> CalculateUnadjustedDelta(
double adjusted_delta, int64_t max_partitions_contributed) {
RETURN_IF_ERROR(
PartitionSelectionStrategy::DeltaIsSetAndValid(adjusted_delta));
RETURN_IF_ERROR(
PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed));
// Numerically stable equivalent of
// 1 - pow(1 - adjusted_delta, max_partitions_contributed).
if (adjusted_delta == 1) { // Avoid NaN from log1p(-1) -> log(0)
return 1;
}
return -expm1(max_partitions_contributed * log1p(-adjusted_delta));
}
private:
double epsilon_;
double delta_;
int max_partitions_contributed_;
double adjusted_delta_;
};
// PreAggPartitionSelection implements magic partition selection - instead of
// calculating a specific threshold to determine whether or not a partition
// should be kept, magic partition selection uses a formula derived from the
// original probablistic definition of differential privacy to generate the
// probability with which a partition should be kept. The math is shown in
// https://arxiv.org/pdf/2006.03684.pdf.
class PreaggPartitionSelection : public PartitionSelectionStrategy {
public:
// Builder for PreaggPartitionSelection
class Builder : public PartitionSelectionStrategy::Builder {
public:
base::StatusOr<std::unique_ptr<PartitionSelectionStrategy>> Build()
override {
RETURN_IF_ERROR(EpsilonIsSetAndValid());
RETURN_IF_ERROR(DeltaIsSetAndValid());
RETURN_IF_ERROR(MaxPartitionsContributedIsSetAndValid());
ASSIGN_OR_RETURN(
double adjusted_delta,
CalculateAdjustedDelta(GetDelta().value(),
GetMaxPartitionsContributed().value()));
std::unique_ptr<PartitionSelectionStrategy> magic_selection =
absl::WrapUnique(new PreaggPartitionSelection(
GetEpsilon().value(), GetDelta().value(),
GetMaxPartitionsContributed().value(), adjusted_delta));
return magic_selection;
}
};
virtual ~PreaggPartitionSelection() = default;
double GetAdjustedEpsilon() const { return adjusted_epsilon_; }
double GetFirstCrossover() const { return crossover_1_; }
double GetSecondCrossover() const { return crossover_2_; }
bool ShouldKeep(double num_users) override {
// generate a random number between 0 and 1
double rand_num = UniformDouble();
// only keep partition if random number < expected probability of keep
return (rand_num <= ProbabilityOfKeep(num_users));
}
protected:
PreaggPartitionSelection(double epsilon, double delta, int max_partitions,
double adjusted_delta)
: PartitionSelectionStrategy(epsilon, delta, max_partitions,
adjusted_delta),
adjusted_epsilon_(epsilon / static_cast<double>(max_partitions)) {
crossover_1_ =
1 +
floor(log1p(tanh(adjusted_epsilon_ / 2) * (1 / adjusted_delta - 1)) /
adjusted_epsilon_);
crossover_2_ =
crossover_1_ + floor((1.0 / adjusted_epsilon_) *
log1p((expm1(adjusted_epsilon_) / adjusted_delta) *
(1 - ProbabilityOfKeep(crossover_1_))));
}
private:
double adjusted_epsilon_;
double crossover_1_;
double crossover_2_;
// ProbabilityOfKeep returns the probability with which a partition with n
// users should be kept, Thm. 1 of https://arxiv.org/pdf/2006.03684.pdf
double ProbabilityOfKeep(double n) const {
const double adjusted_delta = GetAdjustedDelta();
if (n == 0) {
return 0;
} else if (n <= crossover_1_) {
return ((expm1(n * adjusted_epsilon_) / expm1(adjusted_epsilon_)) *
adjusted_delta);
} else if (n > crossover_1_ && n <= crossover_2_) {
const double m = n - crossover_1_;
const double p_crossover = ProbabilityOfKeep(crossover_1_);
return p_crossover -
(1 - p_crossover + (adjusted_delta / expm1(adjusted_epsilon_))) *
expm1(-m * adjusted_epsilon_);
} else {
return 1;
}
}
};
// LaplacePartitionSelection calculates a threshold based on the CDF of the
// Laplace distribution, delta, epsilon, and the max number of partitions a
// single user can contribute to. If the number of users in a partition
// + Laplace noise is greater than this threshold, the partition should be kept.
class LaplacePartitionSelection : public PartitionSelectionStrategy {
public:
// Builder for LaplacePartitionSelection
class Builder : public PartitionSelectionStrategy::Builder {
public:
Builder& SetLaplaceMechanism(
std::unique_ptr<LaplaceMechanism::Builder> laplace_builder) {
laplace_builder_ = std::move(laplace_builder);
return *this;
}
base::StatusOr<std::unique_ptr<PartitionSelectionStrategy>> Build()
override {
RETURN_IF_ERROR(EpsilonIsSetAndValid());
RETURN_IF_ERROR(DeltaIsSetAndValid());
RETURN_IF_ERROR(MaxPartitionsContributedIsSetAndValid());
if (laplace_builder_ == nullptr) {
laplace_builder_ = absl::make_unique<LaplaceMechanism::Builder>();
}
double epsilon = GetEpsilon().value();
double delta = GetDelta().value();
int64_t max_partitions_contributed = GetMaxPartitionsContributed().value();
ASSIGN_OR_RETURN(
double adjusted_delta,
CalculateAdjustedDelta(delta, max_partitions_contributed));
ASSIGN_OR_RETURN(
double threshold,
CalculateThreshold(epsilon, delta, max_partitions_contributed));
std::unique_ptr<NumericalMechanism> mechanism_;
ASSIGN_OR_RETURN(mechanism_,
laplace_builder_->SetEpsilon(epsilon)
.SetL0Sensitivity(max_partitions_contributed)
.SetLInfSensitivity(1)
.Build());
std::unique_ptr<PartitionSelectionStrategy> laplace =
absl::WrapUnique(new LaplacePartitionSelection(
epsilon, delta, max_partitions_contributed, adjusted_delta,
threshold, std::move(mechanism_)));
return laplace;
}
private:
std::unique_ptr<LaplaceMechanism::Builder> laplace_builder_;
};
virtual ~LaplacePartitionSelection() = default;
bool ShouldKeep(double num_users) override {
return mechanism_->NoisedValueAboveThreshold(num_users, threshold_);
}
static base::StatusOr<double> CalculateDelta(
double epsilon, double threshold, int64_t max_partitions_contributed) {
RETURN_IF_ERROR(PartitionSelectionStrategy::EpsilonIsSetAndValid(epsilon));
RETURN_IF_ERROR(
PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed));
if (threshold < 1) {
return CalculateUnadjustedDelta(
1 - (std::exp(
(threshold - 1) /
CalculateDiversity(epsilon, max_partitions_contributed)) /
2),
max_partitions_contributed);
} else {
return CalculateUnadjustedDelta(
(std::exp((1 - threshold) /
CalculateDiversity(epsilon, max_partitions_contributed))) /
2,
max_partitions_contributed);
}
}
static base::StatusOr<double> CalculateThreshold(
double epsilon, double delta, int64_t max_partitions_contributed) {
RETURN_IF_ERROR(PartitionSelectionStrategy::EpsilonIsSetAndValid(epsilon));
RETURN_IF_ERROR(PartitionSelectionStrategy::DeltaIsSetAndValid(delta));
RETURN_IF_ERROR(
PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed));
ASSIGN_OR_RETURN(double adjusted_delta,
CalculateAdjustedDelta(delta, max_partitions_contributed));
if (delta > 0.5) {
return 1 + CalculateDiversity(epsilon, max_partitions_contributed) *
std::log(2 * (1 - adjusted_delta));
} else {
return 1 - CalculateDiversity(epsilon, max_partitions_contributed) *
(std::log(2 * adjusted_delta));
}
}
int64_t GetL1Sensitivity() const { return l1_sensitivity_; }
double GetDiversity() const { return diversity_; }
double GetThreshold() const { return threshold_; }
protected:
LaplacePartitionSelection(double epsilon, double delta,
int64_t max_partitions_contributed,
double adjusted_delta, double threshold,
std::unique_ptr<NumericalMechanism> laplace)
: PartitionSelectionStrategy(epsilon, delta, max_partitions_contributed,
adjusted_delta),
l1_sensitivity_(max_partitions_contributed),
diversity_(CalculateDiversity(epsilon, l1_sensitivity_)),
threshold_(threshold),
mechanism_(std::move(laplace)) {}
static double CalculateDiversity(double epsilon, int64_t l1_sensitivity) {
return l1_sensitivity / epsilon;
}
private:
int64_t l1_sensitivity_;
double diversity_;
double threshold_;
std::unique_ptr<NumericalMechanism> mechanism_;
};
// GaussianPartitionSelection calculates a threshold based on the CDF of the
// Gaussian distribution, delta, epsilon, and the max number of partitions a
// single user can contribute to. If the number of users in a partition
// + Gaussian noise is greater than this threshold, the partition should be
// kept.
class GaussianPartitionSelection : public PartitionSelectionStrategy {
public:
// Builder for GaussianPartitionSelection
class Builder : public PartitionSelectionStrategy::Builder {
public:
Builder& SetGaussianMechanism(
std::unique_ptr<GaussianMechanism::Builder> gaussian_builder) {
gaussian_builder_ = std::move(gaussian_builder);
return *this;
}
base::StatusOr<std::unique_ptr<PartitionSelectionStrategy>> Build()
override {
RETURN_IF_ERROR(EpsilonIsSetAndValid());
RETURN_IF_ERROR(DeltaIsSetAndValid());
RETURN_IF_ERROR(MaxPartitionsContributedIsSetAndValid());
if (gaussian_builder_ == nullptr) {
gaussian_builder_ = absl::make_unique<GaussianMechanism::Builder>();
}
double epsilon = GetEpsilon().value();
double delta = GetDelta().value();
int64_t max_partitions_contributed = GetMaxPartitionsContributed().value();
double threshold_delta = delta / 2.;
double noise_delta = delta - threshold_delta;
std::unique_ptr<NumericalMechanism> mechanism_;
ASSIGN_OR_RETURN(mechanism_,
gaussian_builder_->SetEpsilon(epsilon)
.SetL0Sensitivity(max_partitions_contributed)
.SetLInfSensitivity(1)
.SetDelta(noise_delta)
.Build());
ASSIGN_OR_RETURN(double threshold,
CalculateThreshold(epsilon, noise_delta, threshold_delta,
max_partitions_contributed));
ASSIGN_OR_RETURN(
double adjusted_threshold_delta,
CalculateAdjustedDelta(threshold_delta, max_partitions_contributed));
std::unique_ptr<PartitionSelectionStrategy> gaussian =
absl::WrapUnique(new GaussianPartitionSelection(
epsilon, delta, threshold_delta, noise_delta,
max_partitions_contributed, adjusted_threshold_delta, threshold,
std::move(mechanism_)));
return gaussian;
}
private:
std::unique_ptr<GaussianMechanism::Builder> gaussian_builder_;
};
virtual ~GaussianPartitionSelection() = default;
double GetThresholdDelta() const { return threshold_delta_; }
double GetNoiseDelta() const { return noise_delta_; }
bool ShouldKeep(double num_users) override {
return mechanism_->NoisedValueAboveThreshold(num_users, threshold_);
}
// CalculateThresholdDelta returns the threshold_delta for a threshold k. This
// is the inverse of CalculateThreshold.
static base::StatusOr<double> CalculateThresholdDelta(
double epsilon, double noise_delta, double threshold,
int64_t max_partitions_contributed) {
RETURN_IF_ERROR(PartitionSelectionStrategy::EpsilonIsSetAndValid(epsilon));
RETURN_IF_ERROR(DeltaIsSetAndValid(noise_delta));
RETURN_IF_ERROR(
PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed));
double sigma = GaussianMechanism::CalculateStddev(
epsilon, noise_delta, max_partitions_contributed);
const double max_contribution = 1;
const double adjusted_threshold_delta =
1 - internal::GaussianDistribution::cdf(sigma,
threshold - max_contribution);
return CalculateUnadjustedDelta(adjusted_threshold_delta,
max_partitions_contributed);
}
// CalculateThreshold returns the smallest threshold k to use in a
// differentially private histogram with added Gaussian noise.
//
// See
// https://github.com/google/differential-privacy/blob/main/common_docs/Delta_For_Thresholding.pdf
// for details on the math underlying this.
static base::StatusOr<double> CalculateThreshold(
double epsilon, double noise_delta, double threshold_delta,
int64_t max_partitions_contributed) {
RETURN_IF_ERROR(PartitionSelectionStrategy::EpsilonIsSetAndValid(epsilon));
RETURN_IF_ERROR(DeltaIsSetAndValid(noise_delta));
RETURN_IF_ERROR(DeltaIsSetAndValid(threshold_delta));
RETURN_IF_ERROR(
PartitionSelectionStrategy::MaxPartitionsContributedIsSetAndValid(
max_partitions_contributed));
double sigma = GaussianMechanism::CalculateStddev(
epsilon, noise_delta, max_partitions_contributed);
ASSIGN_OR_RETURN(
double adjusted_threshold_delta,
CalculateAdjustedDelta(threshold_delta, max_partitions_contributed));
const double max_contribution = 1;
return max_contribution + internal::GaussianDistribution::Quantile(
sigma, 1 - adjusted_threshold_delta);
}
double GetThreshold() const { return threshold_; }
protected:
GaussianPartitionSelection(double epsilon, double delta,
double threshold_delta, double noise_delta,
int64_t max_partitions_contributed,
double adjusted_delta, double threshold,
std::unique_ptr<NumericalMechanism> gaussian)
: PartitionSelectionStrategy(epsilon, delta, max_partitions_contributed,
adjusted_delta),
threshold_delta_(threshold_delta),
noise_delta_(noise_delta),
threshold_(threshold),
mechanism_(std::move(gaussian)) {}
// CalculateDelta computes the smallest δ such that the Gaussian mechanism
// with fixed standard deviation σ is (ε,δ)-differentially private. The
// calculation is based on Theorem 8 of Balle and Wang's "Improving the
// Gaussian Mechanism for Differential Privacy: Analytical Calibration and
// Optimal Denoising" (https://arxiv.org/abs/1805.06530v2).
static double CalculateDelta(double sigma, double epsilon,
int64_t max_partitions_contributed) {
const double l2_sensitivity = std::sqrt(max_partitions_contributed);
return GaussianMechanism::CalculateDelta(sigma, epsilon, l2_sensitivity);
}
private:
double threshold_delta_;
double noise_delta_;
double threshold_;
std::unique_ptr<NumericalMechanism> mechanism_;
};
} // namespace differential_privacy
#endif // DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_PARTITION_SELECTION_H_