blob: 120e21d16958ef58dfb5e9a76b497a92665ba04c [file] [log] [blame]
//
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef DIFFERENTIAL_PRIVACY_ALGORITHMS_BOUNDED_VARIANCE_H_
#define DIFFERENTIAL_PRIVACY_ALGORITHMS_BOUNDED_VARIANCE_H_
#include <limits>
#include <type_traits>
#include "google/protobuf/any.pb.h"
#include "absl/memory/memory.h"
#include "absl/status/status.h"
#include "base/statusor.h"
#include "algorithms/algorithm.h"
#include "algorithms/approx-bounds.h"
#include "algorithms/bounded-algorithm.h"
#include "algorithms/numerical-mechanisms.h"
#include "algorithms/util.h"
#include "proto/util.h"
namespace differential_privacy {
// Incrementally provides a differentially private variance for values in the
// range [lower..upper]. Values outside of this range will be clamped so they
// lie in the range. The output will also be clamped between 0 and (upper -
// lower)^2. Since the result is guaranteed to be positive, this algorithm can
// be used to compute a differentially private standard deviation.
//
// The algorithm uses O(1) memory and runs in O(n) time where n is the size of
// the dataset, making it a fast and efficient. The amount of noise added grows
// quadratically in (upper - lower) and decreases linearly in n, so it might not
// produce good results unless n >> (upper - lower)^2.
//
// The algorithm is a variation of the algorithm for differentially private mean
// from "Differential Privacy: From Theory to Practice", section 2.5.5:
// https://books.google.com/books?id=WFttDQAAQBAJ&pg=PA24#v=onepage&q&f=false
template <typename T, std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
class BoundedVariance : public Algorithm<T> {
public:
// Builder for BoundedVariance algorithm.
class Builder
: public BoundedAlgorithmBuilder<T, BoundedVariance<T>, Builder> {
using AlgorithmBuilder =
differential_privacy::AlgorithmBuilder<T, BoundedVariance<T>, Builder>;
using BoundedBuilder =
BoundedAlgorithmBuilder<T, BoundedVariance<T>, Builder>;
public:
// For integral type, check for no overflow in subtraction squared.
template <typename T2 = T,
std::enable_if_t<std::is_integral<T2>::value>* = nullptr>
static absl::Status CheckBounds(T lower, T upper) {
if (lower > upper) {
return absl::InvalidArgumentError(
"Lower cannot be greater than upper.");
}
T subtract_result, square_result;
if (!SafeSubtract(upper, lower, &subtract_result) ||
!SafeSquare(subtract_result, &square_result)) {
return absl::InvalidArgumentError(
"Sensitivity calculation caused integer overflow.");
}
if (upper > sqrt(std::numeric_limits<T>::max()) ||
lower < -1 * sqrt(std::numeric_limits<T>::max())) {
return absl::InvalidArgumentError(
"Squaring the bounds caused overflow.");
}
return absl::OkStatus();
}
template <typename T2 = T,
std::enable_if_t<std::is_floating_point<T2>::value>* = nullptr>
static absl::Status CheckBounds(T lower, T upper) {
if (lower > upper) {
return absl::InvalidArgumentError(
"Lower cannot be greater than upper.");
}
return absl::OkStatus();
}
private:
base::StatusOr<std::unique_ptr<BoundedVariance<T>>> BuildBoundedAlgorithm()
override {
// We have to check epsilon now, otherwise the split during ApproxBounds
// construction might make the error message confusing.
RETURN_IF_ERROR(
GetValueIfSetAndPositive(AlgorithmBuilder::GetEpsilon(), "Epsilon")
.status());
// Ensure that either bounds are manually set or ApproxBounds is made.
RETURN_IF_ERROR(BoundedBuilder::BoundsSetup());
// If manual bounding, check bounds and construct mechanism so we can fail
// on build if sensitivity is inappropriate.
std::unique_ptr<NumericalMechanism> sum_mechanism = nullptr;
std::unique_ptr<NumericalMechanism> sos_mechanism = nullptr;
if (BoundedBuilder::BoundsAreSet()) {
RETURN_IF_ERROR(CheckBounds(BoundedBuilder::GetLower().value(),
BoundedBuilder::GetUpper().value()));
ASSIGN_OR_RETURN(
sum_mechanism,
BuildSumMechanism(
AlgorithmBuilder::GetMechanismBuilderClone(),
BoundedBuilder::GetRemainingEpsilon().value(),
AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1),
AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1),
BoundedBuilder::GetLower().value(),
BoundedBuilder::GetUpper().value()));
ASSIGN_OR_RETURN(
sos_mechanism,
BuildSumOfSquaresMechanism(
AlgorithmBuilder::GetMechanismBuilderClone(),
BoundedBuilder::GetRemainingEpsilon().value(),
AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1),
AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1),
BoundedBuilder::GetLower().value(),
BoundedBuilder::GetUpper().value()));
}
std::unique_ptr<NumericalMechanism> count_mechanism;
ASSIGN_OR_RETURN(
count_mechanism,
AlgorithmBuilder::GetMechanismBuilderClone()
->SetEpsilon(BoundedBuilder::GetRemainingEpsilon().value())
.SetL0Sensitivity(
AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1))
.SetLInfSensitivity(
AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(
1))
.Build());
// Construct bounded variance.
auto mech_builder = AlgorithmBuilder::GetMechanismBuilderClone();
return absl::WrapUnique(new BoundedVariance(
BoundedBuilder::GetRemainingEpsilon().value(),
BoundedBuilder::GetLower().value_or(0),
BoundedBuilder::GetUpper().value_or(0),
AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1),
AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1),
std::move(mech_builder), std::move(sum_mechanism),
std::move(sos_mechanism), std::move(count_mechanism),
std::move(BoundedBuilder::MoveApproxBoundsPointer())));
}
};
void AddEntry(const T& t) override { AddMultipleEntries(t, 1); }
Summary Serialize() override {
// Create BoundedVarianceSummary.
BoundedVarianceSummary bv_summary;
bv_summary.set_count(raw_count_);
for (T x : pos_sum_) {
SetValue(bv_summary.add_pos_sum(), x);
}
for (T x : neg_sum_) {
SetValue(bv_summary.add_neg_sum(), x);
}
for (double x : pos_sum_of_squares_) {
bv_summary.add_pos_sum_of_squares(x);
}
for (double x : neg_sum_of_squares_) {
bv_summary.add_neg_sum_of_squares(x);
}
if (approx_bounds_) {
Summary approx_bounds_summary = approx_bounds_->Serialize();
approx_bounds_summary.data().UnpackTo(
bv_summary.mutable_bounds_summary());
}
// Create Summary.
Summary summary;
summary.mutable_data()->PackFrom(bv_summary);
return summary;
}
absl::Status Merge(const Summary& summary) override {
if (!summary.has_data()) {
return absl::InternalError(
"Cannot merge summary with no bounded variance data.");
}
// Unpack bounded variance summary.
BoundedVarianceSummary bv_summary;
if (!summary.data().UnpackTo(&bv_summary)) {
return absl::InternalError(
"Bounded variance summary unable to be unpacked.");
}
if ((approx_bounds_ != nullptr) != bv_summary.has_bounds_summary()) {
return absl::InternalError(
"Merged BoundedVariance must have the same bounding strategy.");
}
if (pos_sum_.size() != bv_summary.pos_sum_size() ||
neg_sum_.size() != bv_summary.neg_sum_size() ||
pos_sum_of_squares_.size() != bv_summary.pos_sum_of_squares_size() ||
neg_sum_of_squares_.size() != bv_summary.neg_sum_of_squares_size()) {
return absl::InternalError(
"Merged BoundedVariance must have the same amount of partial "
"sum or sum of squares values as this BoundedVariance.");
}
// Add count and partial values to current ones.
SafeAdd(raw_count_, bv_summary.count(), &raw_count_);
for (int i = 0; i < pos_sum_.size(); ++i) {
SafeAdd(pos_sum_[i], GetValue<T>(bv_summary.pos_sum(i)), &pos_sum_[i]);
pos_sum_of_squares_[i] += bv_summary.pos_sum_of_squares(i);
}
for (int i = 0; i < neg_sum_.size(); ++i) {
SafeAdd(neg_sum_[i], GetValue<T>(bv_summary.neg_sum(i)), &neg_sum_[i]);
neg_sum_of_squares_[i] += bv_summary.neg_sum_of_squares(i);
}
// Merge approx bounds if auto-clamping.
if (approx_bounds_) {
Summary approx_bounds_summary;
approx_bounds_summary.mutable_data()->PackFrom(
bv_summary.bounds_summary());
RETURN_IF_ERROR(approx_bounds_->Merge(approx_bounds_summary));
}
return absl::OkStatus();
}
int64_t MemoryUsed() override {
int64_t memory = sizeof(BoundedVariance<T>) +
sizeof(T) * (pos_sum_.capacity() + neg_sum_.capacity()) +
sizeof(double) * (pos_sum_of_squares_.capacity() +
neg_sum_of_squares_.capacity());
if (approx_bounds_) {
memory += approx_bounds_->MemoryUsed();
}
if (sum_mechanism_) {
memory += sum_mechanism_->MemoryUsed();
}
if (mechanism_builder_) {
memory += sizeof(*mechanism_builder_);
}
return memory;
}
double GetEpsilon() const override {
if (approx_bounds_) {
return approx_bounds_->GetEpsilon() + Algorithm<T>::GetEpsilon();
}
return Algorithm<T>::GetEpsilon();
}
// Returns the epsilon used to calculate approximate bounds. If approximate
// bounds are not used, returns 0.
double GetBoundingEpsilon() const {
if (approx_bounds_) {
return approx_bounds_->GetEpsilon();
}
return 0;
}
// Returns the epsilon used to calculate the noisy mean. If bounds are
// specified explicitly, this will be the total epsilon used by the algorithm.
double GetAggregationEpsilon() const { return Algorithm<T>::GetEpsilon(); }
private:
BoundedVariance(const double epsilon, const T lower, const T upper,
const double l0_sensitivity,
const double max_contributions_per_partition,
std::unique_ptr<NumericalMechanismBuilder> mechanism_builder,
std::unique_ptr<NumericalMechanism> sum_mechanism,
std::unique_ptr<NumericalMechanism> sos_mechanism,
std::unique_ptr<NumericalMechanism> count_mechanism,
std::unique_ptr<ApproxBounds<T>> approx_bounds = nullptr)
: Algorithm<T>(epsilon),
raw_count_(0),
lower_(lower),
upper_(upper),
mechanism_builder_(std::move(mechanism_builder)),
l0_sensitivity_(l0_sensitivity),
max_contributions_per_partition_(max_contributions_per_partition),
sum_mechanism_(std::move(sum_mechanism)),
sos_mechanism_(std::move(sos_mechanism)),
count_mechanism_(std::move(count_mechanism)),
approx_bounds_(std::move(approx_bounds)) {
// If automatically determining bounds, we need partial values for each bin
// of the ApproxBounds logarithmic histogram. Otherwise, we only need to
// store one already-clamped value.
if (approx_bounds_) {
pos_sum_.resize(approx_bounds_->NumPositiveBins(), 0);
neg_sum_.resize(approx_bounds_->NumPositiveBins(), 0);
pos_sum_of_squares_.resize(approx_bounds_->NumPositiveBins(), 0);
neg_sum_of_squares_.resize(approx_bounds_->NumPositiveBins(), 0);
} else {
pos_sum_.push_back(0);
pos_sum_of_squares_.push_back(0);
}
}
base::StatusOr<Output> GenerateResult(double privacy_budget,
double noise_interval_level) override {
DCHECK_GT(privacy_budget, 0.0)
<< "Privacy budget should be greater than zero.";
if (privacy_budget == 0.0) return Output();
double remaining_budget = privacy_budget;
Output output;
// We need these values to find the final variance.
double sum = 0;
double sos = 0; // Sum of squares.
if (approx_bounds_) {
// Get bounds with a fraction of the privacy budget.
double bounds_budget = remaining_budget / 2;
remaining_budget -= bounds_budget;
ASSIGN_OR_RETURN(Output bounds, approx_bounds_->PartialResult(
bounds_budget, noise_interval_level));
lower_ = GetValue<T>(bounds.elements(0).value());
upper_ = GetValue<T>(bounds.elements(1).value());
RETURN_IF_ERROR(Builder::CheckBounds(lower_, upper_));
// To find the sum, pass the identity function as the transform.
sum = approx_bounds_->template ComputeFromPartials<T>(
pos_sum_, neg_sum_, [](T x) { return x; }, lower_, upper_,
raw_count_);
// To find sum of squares, pass the square function.
sos = approx_bounds_->template ComputeFromPartials<double>(
pos_sum_of_squares_, neg_sum_of_squares_, [](T x) { return x * x; },
lower_, upper_, raw_count_);
// Populate the bounding report with ApproxBounds information.
*(output.mutable_error_report()->mutable_bounding_report()) =
approx_bounds_->GetBoundingReport(lower_, upper_);
// Clear the mechanism. The sensitivity might have changed.
sum_mechanism_.reset();
sos_mechanism_.reset();
} else {
// In this case, lower and upper were manually set. The clamped partial
// values are stored and do not need processing.
sum = pos_sum_[0];
sos = pos_sum_of_squares_[0];
}
// From this point lower_ and upper_ are guaranteed to be set, either from
// ApproxBounds results or manually at construction. Construct mechanism
// with the correct noise if needed.
if (!sum_mechanism_) {
ASSIGN_OR_RETURN(
sum_mechanism_,
BuildSumMechanism(mechanism_builder_->Clone(),
Algorithm<T>::GetEpsilon(), l0_sensitivity_,
max_contributions_per_partition_, lower_, upper_));
}
if (!sos_mechanism_) {
ASSIGN_OR_RETURN(sos_mechanism_,
BuildSumOfSquaresMechanism(
mechanism_builder_->Clone(),
Algorithm<T>::GetEpsilon(), l0_sensitivity_,
max_contributions_per_partition_, lower_, upper_));
}
T sum_midpoint = lower_ + (upper_ - lower_) / 2;
T sos_midpoint = MidpointOfSquares(lower_, upper_);
double count_budget = remaining_budget / 4;
remaining_budget -= count_budget;
double noised_sum_count =
count_mechanism_->AddNoise(raw_count_, count_budget);
remaining_budget -= count_budget;
double noised_sos_count =
count_mechanism_->AddNoise(raw_count_, count_budget);
// Exact output is sum_of_squares/count - sum*sum/(count*count).
double sum_budget = remaining_budget / 2;
remaining_budget -= sum_budget;
double normalized_sum = sum_mechanism_->AddNoise(
sum - static_cast<double>(raw_count_) * sum_midpoint, sum_budget);
double normalized_sos = sos_mechanism_->AddNoise(
sos - static_cast<double>(raw_count_) * sos_midpoint, remaining_budget);
double mean;
if (noised_sum_count <= 1) {
mean = sum_midpoint;
} else {
mean = normalized_sum / noised_sum_count + sum_midpoint;
}
double mean_of_square;
if (noised_sos_count <= 1) {
mean_of_square = sos_midpoint;
} else {
mean_of_square = normalized_sos / noised_sos_count + sos_midpoint;
}
double noised_variance = mean_of_square - pow(mean, 2);
AddToOutput<double>(
&output, Clamp<double>(0.0, IntervalLengthSquared(lower_, upper_) / 4,
noised_variance));
return output;
}
void ResetState() override {
std::fill(pos_sum_.begin(), pos_sum_.end(), 0);
std::fill(pos_sum_of_squares_.begin(), pos_sum_of_squares_.end(), 0);
std::fill(neg_sum_.begin(), neg_sum_.end(), 0);
std::fill(neg_sum_of_squares_.begin(), neg_sum_of_squares_.end(), 0);
raw_count_ = 0;
if (approx_bounds_) {
approx_bounds_->Reset();
sum_mechanism_ = nullptr;
sos_mechanism_ = nullptr;
}
}
static double IntervalLengthSquared(T lower, T upper) {
return std::pow(static_cast<double>(upper - lower), 2);
}
// Returns the midpoint of the range of f(x) = x^2 where the domains of f is
// [lower, upper].
static double MidpointOfSquares(T lower, T upper) {
DCHECK_GE(upper, lower);
if (0 > lower && 0 < upper) {
return std::max(lower * lower, upper * upper) / 2;
}
return lower * lower + (upper * upper - lower * lower) / 2;
}
// Returns the width of the range of f(x) = x^2 where the domain of f is
// [lower, upper].
static double RangeOfSquares(T lower, T upper) {
if (0 > lower && 0 < upper) {
return std::max(lower * lower, upper * upper);
}
return std::abs(upper * upper - lower * lower);
}
static base::StatusOr<std::unique_ptr<NumericalMechanism>> BuildSumMechanism(
std::unique_ptr<NumericalMechanismBuilder> mechanism_builder,
const double epsilon, const double l0_sensitivity,
const double max_contributions_per_partition, const T lower,
const T upper) {
return mechanism_builder->SetEpsilon(epsilon)
.SetL0Sensitivity(l0_sensitivity)
.SetLInfSensitivity(max_contributions_per_partition *
static_cast<double>(upper - lower) / 2.0)
.Build();
}
static base::StatusOr<std::unique_ptr<NumericalMechanism>>
BuildSumOfSquaresMechanism(
std::unique_ptr<NumericalMechanismBuilder> mechanism_builder,
const double epsilon, const double l0_sensitivity,
const double max_contributions_per_partition, const T lower,
const T upper) {
return mechanism_builder->SetEpsilon(epsilon)
.SetL0Sensitivity(l0_sensitivity)
.SetLInfSensitivity(max_contributions_per_partition *
(RangeOfSquares(lower, upper) / 2))
.Build();
}
void AddMultipleEntries(const T& t, uint64_t num_of_entries) {
// Drop value if it is NaN.
// REF:
// https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function
if (std::isnan(static_cast<double>(t))) {
return;
}
// Count is unaffected by clamping.
SafeAdd<uint64_t>(raw_count_, num_of_entries, &raw_count_);
// If bounds exist, clamp and record. Otherwise, store partial results and
// feed input into ApproxBounds algorithm.
if (!approx_bounds_) {
CHECK_EQ(
AddManualBoundsEntries(Clamp<T>(lower_, upper_, t), num_of_entries),
absl::OkStatus());
} else {
approx_bounds_->AddMultipleEntries(t, num_of_entries);
// Add to partial sums and sum of squares.
auto difference_of_squares = [](T val1, T val2) {
// Lessen the chance of becoming inf/-inf by calculating it like this.
return (static_cast<double>(val1) + val2) *
(static_cast<double>(val1) - val2);
};
if (t >= 0) {
approx_bounds_->template AddMultipleEntriesToPartialSums<T>(
&pos_sum_, t, num_of_entries);
approx_bounds_->template AddMultipleEntriesToPartials<double>(
&pos_sum_of_squares_, t, num_of_entries, difference_of_squares);
} else {
approx_bounds_->template AddMultipleEntriesToPartialSums<T>(
&neg_sum_, t, num_of_entries);
approx_bounds_->template AddMultipleEntriesToPartials<double>(
&neg_sum_of_squares_, t, num_of_entries, difference_of_squares);
}
}
}
absl::Status AddManualBoundsEntries(const T& t, uint64_t num_of_entries) {
if (approx_bounds_) {
return absl::InternalError(
"AddManualBoundsEntry() can only be used when bounds were set "
"manually.");
}
SafeAdd(pos_sum_[0],
Clamp<T>(std::numeric_limits<T>::lowest(),
std::numeric_limits<T>::max(), t * num_of_entries),
&pos_sum_[0]);
pos_sum_of_squares_[0] += pow(t, 2) * num_of_entries;
return absl::OkStatus();
}
absl::Status AddManualBoundsEntry(const T& t) {
return AddManualBoundsEntries(t, 1);
}
// Friend class for testing only
friend class BoundedVarianceTestPeer;
// Vectors of partial values stored for automatic clamping.
std::vector<T> pos_sum_, neg_sum_;
std::vector<double> pos_sum_of_squares_, neg_sum_of_squares_;
uint64_t raw_count_;
T lower_, upper_;
// Used to construct mechanism once bounds are obtained.
std::unique_ptr<NumericalMechanismBuilder> mechanism_builder_;
const double l0_sensitivity_;
const int max_contributions_per_partition_;
std::unique_ptr<NumericalMechanism> sum_mechanism_;
std::unique_ptr<NumericalMechanism> sos_mechanism_;
std::unique_ptr<NumericalMechanism> count_mechanism_;
// If this is not nullptr, we are automatically determining bounds. Otherwise,
// lower and upper contain the manually set bounds.
std::unique_ptr<ApproxBounds<T>> approx_bounds_;
};
} // namespace differential_privacy
#endif // DIFFERENTIAL_PRIVACY_ALGORITHMS_BOUNDED_VARIANCE_H_