//
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef DIFFERENTIAL_PRIVACY_ALGORITHMS_BINARY_SEARCH_H_
#define DIFFERENTIAL_PRIVACY_ALGORITHMS_BINARY_SEARCH_H_
#include "base/percentile.h"
#include "google/protobuf/any.pb.h"
#include "absl/status/status.h"
#include "base/statusor.h"
#include "algorithms/algorithm.h"
#include "algorithms/numerical-mechanisms.h"
#include "proto/util.h"
#include "base/canonical_errors.h"
#include "base/status_macros.h"
// Differentially private binary search.
//
// The Bayesian search implementation uses binary search-like iterations. A
// probability map is kept that partitions the search space into intervals and
// assigns to each interval a probability that the desired quantile lies in
// it. At each iteration, we find the value such that we estimate there's a
// 50% chance the target is lower and a 50% chance the target is higher. Noisy
// counts of the number of entries above and below that value are obtained.
// Then, based on the noisy counts, the probabilities that the desired
// quantile is below or above the current value are computed and used to
// update the probability map.
//
// Visual Example:
// e.g., for a dataset {1, 3, 6, 15, 18, 21, 24}, and a search range [0, 32],
// to find the median we might go through the following steps:
//
//   | 1  3  6  15  18  21  24 |   count_less     count_greater_eq
//   |--------------------------|
//   | 0  0  0  0   m  1  1  1 |   4 + Lap(1/e)    3 + Lap(1/e)
//   | 0  0  0  m   1  1  1  1 |   3 + Lap(1/e)    4 + Lap(1/e)
//   | 0  0  0  m   1  1  1  1 |   3 + Lap(1/e)    4 + Lap(1/e)
//   | ...                      |
//
// Each dataset entry is marked 0 if it is counted below the current search
// value m and 1 otherwise; successive iterations move m toward the median.
//
// Output:
// Median: 14
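//
// In pseudocode, each iteration of BayesianSearch() below roughly does the
// following (a sketch of the loop in this file, not additional API):
//
//   m           = split point where the weight map says P(target < m) ~ 1/2
//   noisy_less  = Laplace-noised count of entries below m
//   noisy_more  = Laplace-noised count of entries at or above m
//   update_left = BayesianProbabilityLeft(local_budget, noisy_less, noisy_more)
//   scale bucket weights below m by update_left and those at or above m by
//   1 - update_left, renormalize, and split the median bucket at the new m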
namespace differential_privacy {
// Bayesian search creates a map entry for each iteration. Bound the number of
// iterations to prevent running out of memory.
const size_t kMaxBayesianIterations = 10000;
// Bayesian search default fraction of the privacy budget used per iteration.
const double kDefaultLocalBudgetFraction = .01;
// Bayesian search maximum fraction of the privacy budget used per iteration.
const double kMaxLocalBudgetFraction = .1;
// If the Bayesian update probability is closer than this constant to 50%,
// then we should increase budget use on the next iteration to get a stronger
// signal about whether the percentile is above or below the midpoint.
const double kProbabilityTooUncertain = .4;
// If the Bayesian update probability is farther than this constant from 50%,
// then we should decrease budget use on the next iteration because we're
// already getting a very strong signal about whether the percentile is above
// or below the midpoint.
const double kProbabilityTooCertain = .49;
// Within this distance of a removable singularity, we use the limit value at
// the singularity.
const double kSingularityTolerance = 1e-6;
template <typename T>
class BinarySearch : public Algorithm<T> {
public:
void AddEntry(const T& t) override {
// REF:
// https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function
if (!std::isnan(static_cast<double>(t))) {
quantiles_->Add(t);
}
}
Summary Serialize() override {
BinarySearchSummary bs_summary;
quantiles_->SerializeToProto(bs_summary.mutable_input());
Summary summary;
summary.mutable_data()->PackFrom(bs_summary);
return summary;
}
absl::Status Merge(const Summary& summary) override {
if (!summary.has_data()) {
return absl::InternalError(
"Cannot merge summary with no binary search data.");
}
BinarySearchSummary bs_summary;
if (!summary.data().UnpackTo(&bs_summary)) {
return absl::InternalError(
"Binary search summary unable to be unpacked.");
}
quantiles_->MergeFromProto(bs_summary.input());
return absl::OkStatus();
}
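  // Example use of Serialize() and Merge() to combine two partial
  // computations; search_a and search_b here are hypothetical instances of a
  // concrete subclass:
  //
  //   Summary summary = search_a->Serialize();
  //   absl::Status status = search_b->Merge(summary);
  //   // On an OK status, search_b now reflects the inputs of both.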
int64_t MemoryUsed() override {
int64_t memory = sizeof(BinarySearch<T>);
if (mechanism_) {
memory += mechanism_->MemoryUsed();
}
if (quantiles_) {
memory += quantiles_->Memory();
}
return memory;
}
protected:
BinarySearch(
double epsilon, T lower, T upper, double quantile,
std::unique_ptr<LaplaceMechanism> mechanism,
std::unique_ptr<base::Percentile<T>> input_sketch)
: Algorithm<T>(epsilon),
quantile_(quantile),
upper_(upper),
lower_(lower),
mechanism_(std::move(mechanism)),
quantiles_(std::move(input_sketch)) {}
void ResetState() override { quantiles_->Reset(); }
base::StatusOr<Output> GenerateResult(double privacy_budget,
double noise_interval_level) override {
DCHECK_GT(privacy_budget, 0.0)
<< "Privacy budget should be greater than zero.";
if (privacy_budget == 0.0) return Output();
return BayesianSearch(privacy_budget, noise_interval_level);
}
private:
base::StatusOr<Output> BayesianSearch(double privacy_budget,
double noise_interval_level) {
// If the bounds are equal, we return the only possible value with total
// confidence.
if (lower_ == upper_) {
Output output = MakeOutput<T>(lower_);
ConfidenceInterval* ci =
output.mutable_error_report()->mutable_noise_confidence_interval();
ci->set_lower_bound(lower_);
ci->set_upper_bound(lower_);
ci->set_confidence_level(noise_interval_level);
return output;
}
// Start the local_budget at a fraction of the total budget.
double local_budget = privacy_budget * kDefaultLocalBudgetFraction;
double remaining_budget = privacy_budget;
double max_local_budget = privacy_budget * kMaxLocalBudgetFraction;
    // Stores the probability that the target value is in each subrange. Since
    // the map is sorted, it contains a sequence of key-value pairs (k_i, v_i)
    // for i = 1, 2, ..., n, where n is the map size. For i = 1, ..., n-1, the
    // subrange [k_i, k_(i+1)) has probability v_i of containing the target
    // value; [k_n, upper_] has probability v_n of containing the target
    // value.
std::map<double, double> weight;
double m = lower_ / 2.0 + upper_ / 2.0;
weight[lower_] = .5;
weight[m] = .5;
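    // E.g., for the visual example above with bounds [0, 32], the initial map
    // is {0: .5, 16: .5}: [0, 16) and [16, 32] each have probability .5 of
    // containing the target.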
// Keep doing search iterations while we have enough budget left.
int iterations = 0;
while (remaining_budget - local_budget > 0 &&
iterations < kMaxBayesianIterations) {
++iterations;
// Find noisy counts for number of values above and below m. A single
// input only contributes to one of the two counts.
ASSIGN_OR_RETURN(const double percentile, Percentile(m));
double noisy_less = mechanism_->AddNoise(
percentile * quantiles_->num_values(), local_budget);
double noisy_more = mechanism_->AddNoise(
(1 - percentile) * quantiles_->num_values(), local_budget);
double noised_size = noisy_less + noisy_more;
// For extreme percentiles, we want to push the result toward the range of
// the input data.
if (quantile_ < kSingularityTolerance) {
noisy_less -= GetDatapoints(noised_size);
} else if ((1 - quantile_) < kSingularityTolerance) {
noisy_more -= GetDatapoints(noised_size);
}
// Calculate update multipliers.
double update_left =
BayesianProbabilityLeft(local_budget, noisy_less, noisy_more);
// Adjust the local budget based on certainty.
remaining_budget -= local_budget;
local_budget = std::min(UpdateLocalBudget(local_budget, update_left),
max_local_budget);
// Apply update multipliers.
UpdateWeight(&weight, m, update_left);
// Find the subrange to split the bucket and its weight in two.
double sum_w = 0.0;
double lower_bound = static_cast<double>(lower_);
double w = 0;
auto it = weight.begin();
for (; it != weight.end(); it++) {
sum_w += it->second;
lower_bound = it->first;
w = it->second;
if (sum_w >= .5) {
break;
}
}
double upper_bound = static_cast<double>(upper_);
if (it != weight.end() && ++it != weight.end()) {
upper_bound = it->first;
}
// Split the bucket into two assuming uniform distribution of probability
// within the bucket. The bucket starting at lower_bound will retain the
// weight proportional to its length. The bucket starting at the new
// split-point will get the remaining weight. Do not split the bucket if
// m is lower_bound or upper_bound.
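      // Worked instance: if the bucket [8, 16) has weight w = .4 and the
      // cumulative weight through it is sum_w = .6, then
      // m = (.5 - .6 + .4) / .4 * (16 - 8) + 8 = 14.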
m = (.5 - sum_w + w) / w * (upper_bound - lower_bound) + lower_bound;
if (lower_bound < m && m < upper_bound) {
weight[lower_bound] =
w * (m - lower_bound) / (upper_bound - lower_bound);
weight[m] = w * (upper_bound - m) / (upper_bound - lower_bound);
}
}
// Round the result instead of truncation, and ensure the result is within
// the valid bounds of T (to prevent overflows or underflows).
if (std::is_integral<T>::value) {
m = Clamp<double>(std::numeric_limits<T>::lowest(),
std::numeric_limits<T>::max(), std::round(m));
}
// Return 95% confidence interval of the error.
Output output = MakeOutput<T>(m);
*(output.mutable_error_report()->mutable_noise_confidence_interval()) =
ErrorConfidenceInterval(noise_interval_level, weight, m);
return output;
}
// The "datapoints" is used to buffer the noisy less and noisy more
// count for finding extreme quantiles (at 0 and 1). It approximately can be
// thought of as you're looking for a value within datapoints away from the
// desired quantile. If "datapoints" is too small, then we are likely to
// obtain a very inaccurate result, because values between
// the search space upper bound and the largest element cannot be
// discriminated.
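  // For example, a noised size of 200 buffers max(2.0, min(45.0, 28.0)) = 28
  // datapoints, while a noised size of 10 is clamped up to 2.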
double GetDatapoints(double noised_size) {
return std::max(2.0, std::min(45.0, 14.0 * noised_size / 100.0));
}
  // Given a noisy less-than count L and a noisy greater-or-equal count U for
  // some value in a set, where the noise on these counts was generated by
  // this mechanism with the local privacy_budget, find the probability that
  // the percentile-p element of the set is to the left of the investigated
  // value. The tolerance is the distance from removable singularities within
  // which to use the value at the singularity.
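  // Sanity check (a worked instance, not additional API): with p = .5 and
  // L == U, the L >= U branch below returns 1 + exp(0) * (-.5 + 0) = .5,
  // i.e., equal noisy counts leave us maximally uncertain.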
virtual double BayesianProbabilityLeft(double privacy_budget, double L,
double U) {
double p = quantile_;
double b = privacy_budget / mechanism_->GetDiversity();
// Removable singularity at p=1/2.
if (std::abs(p - .5) < kSingularityTolerance) {
if (L < U) {
return -.25 * exp(b * (L - U)) * (-2 + b * (L - U));
}
// L >= U.
return 1 + exp(b * (U - L)) * (-.5 + .25 * b * (U - L));
}
// Singularities at p = 0 and p = 1. We use a simplified method.
if (std::abs(p) < kSingularityTolerance) {
if (L <= 0) {
return exp(b * L) / 2;
} else {
return 1 - exp(-b * L) / 2;
}
}
if (std::abs(p - 1) < kSingularityTolerance) {
if (U <= 0) {
return 1 - exp(b * U) / 2;
} else {
return exp(-b * U) / 2;
}
}
if (L < p * (L + U)) {
double num1 = exp(b * (L + p * U / (p - 1)));
double num2 = exp(b * (L * (1 / p - 1) - U));
double denom = 2 * (-1 + 2 * p);
return (-1 * num1 + 2 * num1 * p - num1 * p * p + num2 * p * p) / denom;
}
// L >= p(L+U).
double num1 = exp(-b * (L + p * U / (p - 1)));
double num2 = exp(b * (L - L / p + U));
double denom = 2 * (-1 + 2 * p);
return (-2 + num1 * std::pow(-1 + p, 2) + 4 * p - num2 * p * p) / denom;
}
void UpdateWeight(std::map<double, double>* weight, double m,
double update_left) {
// Apply the multipliers. For buckets below, apply left update. For buckets
// above, apply right update. m is always the lower bound of some bucket.
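    // For example, with weight = {0: .5, 16: .5}, m = 16, and
    // update_left = .8, the bucket at 0 scales to .4 and the bucket at 16 to
    // .1; the normalization below then rescales them to .8 and .2.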
double sum_w = 0;
for (std::map<double, double>::iterator it = weight->begin();
it != weight->end(); it++) {
// Get bounds and weight of the bucket.
double lower_bound = it->first;
double w = it->second;
// Apply multiplier for the bucket according to position wrt m.
double new_w = w;
double update_right = 1 - update_left;
if (lower_bound < m) {
new_w *= update_left;
} else { // lower_bound >= m
new_w *= update_right;
}
(*weight)[lower_bound] = new_w;
sum_w += new_w;
}
// Normalize so weights sum to 1.
for (auto const& bucket : *weight) {
double lower_bound = bucket.first;
(*weight)[lower_bound] /= sum_w;
}
}
base::StatusOr<double> Percentile(double m) {
    // If there are no inputs, getting the relative rank will return an error.
    // Arbitrarily say the percentile is 1/2.
if (quantiles_->num_values() == 0) {
return .5;
}
std::pair<double, double> percent_pair;
percent_pair = quantiles_->GetRelativeRank(m);
// If T is integral, then m is a double between two integers. Take the upper
// percentile of the nearest lesser integer.
if (std::is_integral<T>::value) {
return percent_pair.second;
}
// Otherwise, T is a float, get the percentile normally, taking the average
// between the lower and upper percentiles for the value.
return (percent_pair.first + percent_pair.second) / 2;
}
  // Given the local budget and the probability that the desired value is on
  // the left, return the adjusted local budget for the next iteration.
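  // For example, update_left = .8 has certainty .3 < kProbabilityTooUncertain,
  // so the budget doubles (the caller caps it at max_local_budget), while
  // update_left = .999 has certainty .499 > kProbabilityTooCertain, so it
  // halves.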
double UpdateLocalBudget(double local_budget, double update_left) {
double certainty = std::abs(update_left - .5);
// If update_left is too close to 1/2, we need to increase the budget.
if (certainty < kProbabilityTooUncertain) {
return local_budget * 2;
}
// If update_left is too far from 1/2, we can decrease the budget.
if (certainty > kProbabilityTooCertain) {
return local_budget / 2;
}
return local_budget;
}
ConfidenceInterval ErrorConfidenceInterval(
double confidence_level, const std::map<double, double>& weight,
double result) {
ConfidenceInterval interval;
interval.set_confidence_level(confidence_level);
double sum_w = 0.0;
bool found_lower = false;
std::map<double, double>::const_iterator it;
for (it = weight.begin(); it != weight.end(); it++) {
sum_w += it->second;
if (!found_lower && sum_w >= .5 - confidence_level / 2) {
interval.set_upper_bound(result - it->first);
found_lower = true;
}
if (sum_w > (.5 + confidence_level / 2)) {
std::map<double, double>::const_iterator it_next = std::next(it, 1);
if (it_next == weight.end()) {
interval.set_lower_bound(result - upper_);
} else {
interval.set_lower_bound(result - it_next->first);
}
break;
}
}
return interval;
}
double quantile_;
T upper_;
T lower_;
std::unique_ptr<LaplaceMechanism> mechanism_;
std::unique_ptr<base::Percentile<T>> quantiles_;
};
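// A minimal sketch of exposing the protected constructor through a concrete
// subclass. The name DpMedian, the direct construction of LaplaceMechanism,
// and the default-constructed Percentile sketch are illustrative assumptions,
// not part of this header:
//
//   template <typename T>
//   class DpMedian : public BinarySearch<T> {
//    public:
//     DpMedian(double epsilon, T lower, T upper)
//         : BinarySearch<T>(epsilon, lower, upper, /*quantile=*/0.5,
//                           std::make_unique<LaplaceMechanism>(epsilon),
//                           std::make_unique<base::Percentile<T>>()) {}
//   };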
} // namespace differential_privacy
#endif // DIFFERENTIAL_PRIVACY_ALGORITHMS_BINARY_SEARCH_H_