| // |
| // Copyright 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #ifndef DIFFERENTIAL_PRIVACY_ALGORITHMS_BOUNDED_VARIANCE_H_ |
| #define DIFFERENTIAL_PRIVACY_ALGORITHMS_BOUNDED_VARIANCE_H_ |
| |
| #include <limits> |
| #include <type_traits> |
| |
| #include "google/protobuf/any.pb.h" |
| #include "absl/memory/memory.h" |
| #include "absl/status/status.h" |
| #include "base/statusor.h" |
| #include "algorithms/algorithm.h" |
| #include "algorithms/approx-bounds.h" |
| #include "algorithms/bounded-algorithm.h" |
| #include "algorithms/numerical-mechanisms.h" |
| #include "algorithms/util.h" |
| #include "proto/util.h" |
| |
| namespace differential_privacy { |
| |
| // Incrementally provides a differentially private variance for values in the |
| // range [lower..upper]. Values outside of this range will be clamped so they |
| // lie in the range. The output will also be clamped between 0 and (upper - |
| // lower)^2. Since the result is guaranteed to be positive, this algorithm can |
| // be used to compute a differentially private standard deviation. |
| // |
| // The algorithm uses O(1) memory and runs in O(n) time where n is the size of |
| // the dataset, making it a fast and efficient. The amount of noise added grows |
| // quadratically in (upper - lower) and decreases linearly in n, so it might not |
| // produce good results unless n >> (upper - lower)^2. |
| // |
| // The algorithm is a variation of the algorithm for differentially private mean |
| // from "Differential Privacy: From Theory to Practice", section 2.5.5: |
| // https://books.google.com/books?id=WFttDQAAQBAJ&pg=PA24#v=onepage&q&f=false |
| template <typename T, std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr> |
| class BoundedVariance : public Algorithm<T> { |
| public: |
| // Builder for BoundedVariance algorithm. |
| class Builder |
| : public BoundedAlgorithmBuilder<T, BoundedVariance<T>, Builder> { |
| using AlgorithmBuilder = |
| differential_privacy::AlgorithmBuilder<T, BoundedVariance<T>, Builder>; |
| using BoundedBuilder = |
| BoundedAlgorithmBuilder<T, BoundedVariance<T>, Builder>; |
| |
| public: |
| // For integral type, check for no overflow in subtraction squared. |
| template <typename T2 = T, |
| std::enable_if_t<std::is_integral<T2>::value>* = nullptr> |
| static absl::Status CheckBounds(T lower, T upper) { |
| if (lower > upper) { |
| return absl::InvalidArgumentError( |
| "Lower cannot be greater than upper."); |
| } |
| T subtract_result, square_result; |
| if (!SafeSubtract(upper, lower, &subtract_result) || |
| !SafeSquare(subtract_result, &square_result)) { |
| return absl::InvalidArgumentError( |
| "Sensitivity calculation caused integer overflow."); |
| } |
| if (upper > sqrt(std::numeric_limits<T>::max()) || |
| lower < -1 * sqrt(std::numeric_limits<T>::max())) { |
| return absl::InvalidArgumentError( |
| "Squaring the bounds caused overflow."); |
| } |
| return absl::OkStatus(); |
| } |
| |
| template <typename T2 = T, |
| std::enable_if_t<std::is_floating_point<T2>::value>* = nullptr> |
| static absl::Status CheckBounds(T lower, T upper) { |
| if (lower > upper) { |
| return absl::InvalidArgumentError( |
| "Lower cannot be greater than upper."); |
| } |
| return absl::OkStatus(); |
| } |
| |
| private: |
| base::StatusOr<std::unique_ptr<BoundedVariance<T>>> BuildBoundedAlgorithm() |
| override { |
| // We have to check epsilon now, otherwise the split during ApproxBounds |
| // construction might make the error message confusing. |
| RETURN_IF_ERROR( |
| GetValueIfSetAndPositive(AlgorithmBuilder::GetEpsilon(), "Epsilon") |
| .status()); |
| |
| // Ensure that either bounds are manually set or ApproxBounds is made. |
| RETURN_IF_ERROR(BoundedBuilder::BoundsSetup()); |
| |
| // If manual bounding, check bounds and construct mechanism so we can fail |
| // on build if sensitivity is inappropriate. |
| std::unique_ptr<NumericalMechanism> sum_mechanism = nullptr; |
| std::unique_ptr<NumericalMechanism> sos_mechanism = nullptr; |
| if (BoundedBuilder::BoundsAreSet()) { |
| RETURN_IF_ERROR(CheckBounds(BoundedBuilder::GetLower().value(), |
| BoundedBuilder::GetUpper().value())); |
| ASSIGN_OR_RETURN( |
| sum_mechanism, |
| BuildSumMechanism( |
| AlgorithmBuilder::GetMechanismBuilderClone(), |
| BoundedBuilder::GetRemainingEpsilon().value(), |
| AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1), |
| AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1), |
| BoundedBuilder::GetLower().value(), |
| BoundedBuilder::GetUpper().value())); |
| ASSIGN_OR_RETURN( |
| sos_mechanism, |
| BuildSumOfSquaresMechanism( |
| AlgorithmBuilder::GetMechanismBuilderClone(), |
| BoundedBuilder::GetRemainingEpsilon().value(), |
| AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1), |
| AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1), |
| BoundedBuilder::GetLower().value(), |
| BoundedBuilder::GetUpper().value())); |
| } |
| |
| std::unique_ptr<NumericalMechanism> count_mechanism; |
| ASSIGN_OR_RETURN( |
| count_mechanism, |
| AlgorithmBuilder::GetMechanismBuilderClone() |
| ->SetEpsilon(BoundedBuilder::GetRemainingEpsilon().value()) |
| .SetL0Sensitivity( |
| AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1)) |
| .SetLInfSensitivity( |
| AlgorithmBuilder::GetMaxContributionsPerPartition().value_or( |
| 1)) |
| .Build()); |
| |
| // Construct bounded variance. |
| auto mech_builder = AlgorithmBuilder::GetMechanismBuilderClone(); |
| return absl::WrapUnique(new BoundedVariance( |
| BoundedBuilder::GetRemainingEpsilon().value(), |
| BoundedBuilder::GetLower().value_or(0), |
| BoundedBuilder::GetUpper().value_or(0), |
| AlgorithmBuilder::GetMaxPartitionsContributed().value_or(1), |
| AlgorithmBuilder::GetMaxContributionsPerPartition().value_or(1), |
| std::move(mech_builder), std::move(sum_mechanism), |
| std::move(sos_mechanism), std::move(count_mechanism), |
| std::move(BoundedBuilder::MoveApproxBoundsPointer()))); |
| } |
| }; |
| |
| void AddEntry(const T& t) override { AddMultipleEntries(t, 1); } |
| |
| Summary Serialize() override { |
| // Create BoundedVarianceSummary. |
| BoundedVarianceSummary bv_summary; |
| bv_summary.set_count(raw_count_); |
| for (T x : pos_sum_) { |
| SetValue(bv_summary.add_pos_sum(), x); |
| } |
| for (T x : neg_sum_) { |
| SetValue(bv_summary.add_neg_sum(), x); |
| } |
| for (double x : pos_sum_of_squares_) { |
| bv_summary.add_pos_sum_of_squares(x); |
| } |
| for (double x : neg_sum_of_squares_) { |
| bv_summary.add_neg_sum_of_squares(x); |
| } |
| if (approx_bounds_) { |
| Summary approx_bounds_summary = approx_bounds_->Serialize(); |
| approx_bounds_summary.data().UnpackTo( |
| bv_summary.mutable_bounds_summary()); |
| } |
| |
| // Create Summary. |
| Summary summary; |
| summary.mutable_data()->PackFrom(bv_summary); |
| return summary; |
| } |
| |
| absl::Status Merge(const Summary& summary) override { |
| if (!summary.has_data()) { |
| return absl::InternalError( |
| "Cannot merge summary with no bounded variance data."); |
| } |
| |
| // Unpack bounded variance summary. |
| BoundedVarianceSummary bv_summary; |
| if (!summary.data().UnpackTo(&bv_summary)) { |
| return absl::InternalError( |
| "Bounded variance summary unable to be unpacked."); |
| } |
| if ((approx_bounds_ != nullptr) != bv_summary.has_bounds_summary()) { |
| return absl::InternalError( |
| "Merged BoundedVariance must have the same bounding strategy."); |
| } |
| if (pos_sum_.size() != bv_summary.pos_sum_size() || |
| neg_sum_.size() != bv_summary.neg_sum_size() || |
| pos_sum_of_squares_.size() != bv_summary.pos_sum_of_squares_size() || |
| neg_sum_of_squares_.size() != bv_summary.neg_sum_of_squares_size()) { |
| return absl::InternalError( |
| "Merged BoundedVariance must have the same amount of partial " |
| "sum or sum of squares values as this BoundedVariance."); |
| } |
| |
| // Add count and partial values to current ones. |
| SafeAdd(raw_count_, bv_summary.count(), &raw_count_); |
| for (int i = 0; i < pos_sum_.size(); ++i) { |
| SafeAdd(pos_sum_[i], GetValue<T>(bv_summary.pos_sum(i)), &pos_sum_[i]); |
| pos_sum_of_squares_[i] += bv_summary.pos_sum_of_squares(i); |
| } |
| for (int i = 0; i < neg_sum_.size(); ++i) { |
| SafeAdd(neg_sum_[i], GetValue<T>(bv_summary.neg_sum(i)), &neg_sum_[i]); |
| neg_sum_of_squares_[i] += bv_summary.neg_sum_of_squares(i); |
| } |
| |
| // Merge approx bounds if auto-clamping. |
| if (approx_bounds_) { |
| Summary approx_bounds_summary; |
| approx_bounds_summary.mutable_data()->PackFrom( |
| bv_summary.bounds_summary()); |
| RETURN_IF_ERROR(approx_bounds_->Merge(approx_bounds_summary)); |
| } |
| |
| return absl::OkStatus(); |
| } |
| |
| int64_t MemoryUsed() override { |
| int64_t memory = sizeof(BoundedVariance<T>) + |
| sizeof(T) * (pos_sum_.capacity() + neg_sum_.capacity()) + |
| sizeof(double) * (pos_sum_of_squares_.capacity() + |
| neg_sum_of_squares_.capacity()); |
| if (approx_bounds_) { |
| memory += approx_bounds_->MemoryUsed(); |
| } |
| if (sum_mechanism_) { |
| memory += sum_mechanism_->MemoryUsed(); |
| } |
| if (mechanism_builder_) { |
| memory += sizeof(*mechanism_builder_); |
| } |
| return memory; |
| } |
| |
| double GetEpsilon() const override { |
| if (approx_bounds_) { |
| return approx_bounds_->GetEpsilon() + Algorithm<T>::GetEpsilon(); |
| } |
| return Algorithm<T>::GetEpsilon(); |
| } |
| |
| // Returns the epsilon used to calculate approximate bounds. If approximate |
| // bounds are not used, returns 0. |
| double GetBoundingEpsilon() const { |
| if (approx_bounds_) { |
| return approx_bounds_->GetEpsilon(); |
| } |
| return 0; |
| } |
| |
| // Returns the epsilon used to calculate the noisy mean. If bounds are |
| // specified explicitly, this will be the total epsilon used by the algorithm. |
| double GetAggregationEpsilon() const { return Algorithm<T>::GetEpsilon(); } |
| |
| private: |
| BoundedVariance(const double epsilon, const T lower, const T upper, |
| const double l0_sensitivity, |
| const double max_contributions_per_partition, |
| std::unique_ptr<NumericalMechanismBuilder> mechanism_builder, |
| std::unique_ptr<NumericalMechanism> sum_mechanism, |
| std::unique_ptr<NumericalMechanism> sos_mechanism, |
| std::unique_ptr<NumericalMechanism> count_mechanism, |
| std::unique_ptr<ApproxBounds<T>> approx_bounds = nullptr) |
| : Algorithm<T>(epsilon), |
| raw_count_(0), |
| lower_(lower), |
| upper_(upper), |
| mechanism_builder_(std::move(mechanism_builder)), |
| l0_sensitivity_(l0_sensitivity), |
| max_contributions_per_partition_(max_contributions_per_partition), |
| sum_mechanism_(std::move(sum_mechanism)), |
| sos_mechanism_(std::move(sos_mechanism)), |
| count_mechanism_(std::move(count_mechanism)), |
| approx_bounds_(std::move(approx_bounds)) { |
| // If automatically determining bounds, we need partial values for each bin |
| // of the ApproxBounds logarithmic histogram. Otherwise, we only need to |
| // store one already-clamped value. |
| if (approx_bounds_) { |
| pos_sum_.resize(approx_bounds_->NumPositiveBins(), 0); |
| neg_sum_.resize(approx_bounds_->NumPositiveBins(), 0); |
| pos_sum_of_squares_.resize(approx_bounds_->NumPositiveBins(), 0); |
| neg_sum_of_squares_.resize(approx_bounds_->NumPositiveBins(), 0); |
| } else { |
| pos_sum_.push_back(0); |
| pos_sum_of_squares_.push_back(0); |
| } |
| } |
| |
| base::StatusOr<Output> GenerateResult(double privacy_budget, |
| double noise_interval_level) override { |
| DCHECK_GT(privacy_budget, 0.0) |
| << "Privacy budget should be greater than zero."; |
| if (privacy_budget == 0.0) return Output(); |
| double remaining_budget = privacy_budget; |
| Output output; |
| |
| // We need these values to find the final variance. |
| double sum = 0; |
| double sos = 0; // Sum of squares. |
| |
| if (approx_bounds_) { |
| // Get bounds with a fraction of the privacy budget. |
| double bounds_budget = remaining_budget / 2; |
| remaining_budget -= bounds_budget; |
| ASSIGN_OR_RETURN(Output bounds, approx_bounds_->PartialResult( |
| bounds_budget, noise_interval_level)); |
| lower_ = GetValue<T>(bounds.elements(0).value()); |
| upper_ = GetValue<T>(bounds.elements(1).value()); |
| RETURN_IF_ERROR(Builder::CheckBounds(lower_, upper_)); |
| |
| // To find the sum, pass the identity function as the transform. |
| sum = approx_bounds_->template ComputeFromPartials<T>( |
| pos_sum_, neg_sum_, [](T x) { return x; }, lower_, upper_, |
| raw_count_); |
| |
| // To find sum of squares, pass the square function. |
| sos = approx_bounds_->template ComputeFromPartials<double>( |
| pos_sum_of_squares_, neg_sum_of_squares_, [](T x) { return x * x; }, |
| lower_, upper_, raw_count_); |
| |
| // Populate the bounding report with ApproxBounds information. |
| *(output.mutable_error_report()->mutable_bounding_report()) = |
| approx_bounds_->GetBoundingReport(lower_, upper_); |
| |
| // Clear the mechanism. The sensitivity might have changed. |
| sum_mechanism_.reset(); |
| sos_mechanism_.reset(); |
| } else { |
| // In this case, lower and upper were manually set. The clamped partial |
| // values are stored and do not need processing. |
| sum = pos_sum_[0]; |
| sos = pos_sum_of_squares_[0]; |
| } |
| |
| // From this point lower_ and upper_ are guaranteed to be set, either from |
| // ApproxBounds results or manually at construction. Construct mechanism |
| // with the correct noise if needed. |
| if (!sum_mechanism_) { |
| ASSIGN_OR_RETURN( |
| sum_mechanism_, |
| BuildSumMechanism(mechanism_builder_->Clone(), |
| Algorithm<T>::GetEpsilon(), l0_sensitivity_, |
| max_contributions_per_partition_, lower_, upper_)); |
| } |
| if (!sos_mechanism_) { |
| ASSIGN_OR_RETURN(sos_mechanism_, |
| BuildSumOfSquaresMechanism( |
| mechanism_builder_->Clone(), |
| Algorithm<T>::GetEpsilon(), l0_sensitivity_, |
| max_contributions_per_partition_, lower_, upper_)); |
| } |
| |
| T sum_midpoint = lower_ + (upper_ - lower_) / 2; |
| T sos_midpoint = MidpointOfSquares(lower_, upper_); |
| |
| double count_budget = remaining_budget / 4; |
| remaining_budget -= count_budget; |
| double noised_sum_count = |
| count_mechanism_->AddNoise(raw_count_, count_budget); |
| remaining_budget -= count_budget; |
| double noised_sos_count = |
| count_mechanism_->AddNoise(raw_count_, count_budget); |
| |
| // Exact output is sum_of_squares/count - sum*sum/(count*count). |
| double sum_budget = remaining_budget / 2; |
| remaining_budget -= sum_budget; |
| double normalized_sum = sum_mechanism_->AddNoise( |
| sum - static_cast<double>(raw_count_) * sum_midpoint, sum_budget); |
| double normalized_sos = sos_mechanism_->AddNoise( |
| sos - static_cast<double>(raw_count_) * sos_midpoint, remaining_budget); |
| |
| double mean; |
| if (noised_sum_count <= 1) { |
| mean = sum_midpoint; |
| } else { |
| mean = normalized_sum / noised_sum_count + sum_midpoint; |
| } |
| |
| double mean_of_square; |
| if (noised_sos_count <= 1) { |
| mean_of_square = sos_midpoint; |
| } else { |
| mean_of_square = normalized_sos / noised_sos_count + sos_midpoint; |
| } |
| |
| double noised_variance = mean_of_square - pow(mean, 2); |
| |
| AddToOutput<double>( |
| &output, Clamp<double>(0.0, IntervalLengthSquared(lower_, upper_) / 4, |
| noised_variance)); |
| return output; |
| } |
| |
| void ResetState() override { |
| std::fill(pos_sum_.begin(), pos_sum_.end(), 0); |
| std::fill(pos_sum_of_squares_.begin(), pos_sum_of_squares_.end(), 0); |
| std::fill(neg_sum_.begin(), neg_sum_.end(), 0); |
| std::fill(neg_sum_of_squares_.begin(), neg_sum_of_squares_.end(), 0); |
| raw_count_ = 0; |
| |
| if (approx_bounds_) { |
| approx_bounds_->Reset(); |
| sum_mechanism_ = nullptr; |
| sos_mechanism_ = nullptr; |
| } |
| } |
| |
| static double IntervalLengthSquared(T lower, T upper) { |
| return std::pow(static_cast<double>(upper - lower), 2); |
| } |
| |
| // Returns the midpoint of the range of f(x) = x^2 where the domains of f is |
| // [lower, upper]. |
| static double MidpointOfSquares(T lower, T upper) { |
| DCHECK_GE(upper, lower); |
| if (0 > lower && 0 < upper) { |
| return std::max(lower * lower, upper * upper) / 2; |
| } |
| return lower * lower + (upper * upper - lower * lower) / 2; |
| } |
| |
| // Returns the width of the range of f(x) = x^2 where the domain of f is |
| // [lower, upper]. |
| static double RangeOfSquares(T lower, T upper) { |
| if (0 > lower && 0 < upper) { |
| return std::max(lower * lower, upper * upper); |
| } |
| return std::abs(upper * upper - lower * lower); |
| } |
| |
| static base::StatusOr<std::unique_ptr<NumericalMechanism>> BuildSumMechanism( |
| std::unique_ptr<NumericalMechanismBuilder> mechanism_builder, |
| const double epsilon, const double l0_sensitivity, |
| const double max_contributions_per_partition, const T lower, |
| const T upper) { |
| return mechanism_builder->SetEpsilon(epsilon) |
| .SetL0Sensitivity(l0_sensitivity) |
| .SetLInfSensitivity(max_contributions_per_partition * |
| static_cast<double>(upper - lower) / 2.0) |
| .Build(); |
| } |
| |
| static base::StatusOr<std::unique_ptr<NumericalMechanism>> |
| BuildSumOfSquaresMechanism( |
| std::unique_ptr<NumericalMechanismBuilder> mechanism_builder, |
| const double epsilon, const double l0_sensitivity, |
| const double max_contributions_per_partition, const T lower, |
| const T upper) { |
| return mechanism_builder->SetEpsilon(epsilon) |
| .SetL0Sensitivity(l0_sensitivity) |
| .SetLInfSensitivity(max_contributions_per_partition * |
| (RangeOfSquares(lower, upper) / 2)) |
| .Build(); |
| } |
| |
| void AddMultipleEntries(const T& t, uint64_t num_of_entries) { |
| // Drop value if it is NaN. |
| // REF: |
| // https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function |
| if (std::isnan(static_cast<double>(t))) { |
| return; |
| } |
| |
| // Count is unaffected by clamping. |
| SafeAdd<uint64_t>(raw_count_, num_of_entries, &raw_count_); |
| |
| // If bounds exist, clamp and record. Otherwise, store partial results and |
| // feed input into ApproxBounds algorithm. |
| if (!approx_bounds_) { |
| CHECK_EQ( |
| AddManualBoundsEntries(Clamp<T>(lower_, upper_, t), num_of_entries), |
| absl::OkStatus()); |
| } else { |
| approx_bounds_->AddMultipleEntries(t, num_of_entries); |
| |
| // Add to partial sums and sum of squares. |
| auto difference_of_squares = [](T val1, T val2) { |
| // Lessen the chance of becoming inf/-inf by calculating it like this. |
| return (static_cast<double>(val1) + val2) * |
| (static_cast<double>(val1) - val2); |
| }; |
| if (t >= 0) { |
| approx_bounds_->template AddMultipleEntriesToPartialSums<T>( |
| &pos_sum_, t, num_of_entries); |
| approx_bounds_->template AddMultipleEntriesToPartials<double>( |
| &pos_sum_of_squares_, t, num_of_entries, difference_of_squares); |
| } else { |
| approx_bounds_->template AddMultipleEntriesToPartialSums<T>( |
| &neg_sum_, t, num_of_entries); |
| approx_bounds_->template AddMultipleEntriesToPartials<double>( |
| &neg_sum_of_squares_, t, num_of_entries, difference_of_squares); |
| } |
| } |
| } |
| |
| absl::Status AddManualBoundsEntries(const T& t, uint64_t num_of_entries) { |
| if (approx_bounds_) { |
| return absl::InternalError( |
| "AddManualBoundsEntry() can only be used when bounds were set " |
| "manually."); |
| } |
| SafeAdd(pos_sum_[0], |
| Clamp<T>(std::numeric_limits<T>::lowest(), |
| std::numeric_limits<T>::max(), t * num_of_entries), |
| &pos_sum_[0]); |
| pos_sum_of_squares_[0] += pow(t, 2) * num_of_entries; |
| return absl::OkStatus(); |
| } |
| |
| absl::Status AddManualBoundsEntry(const T& t) { |
| return AddManualBoundsEntries(t, 1); |
| } |
| |
| // Friend class for testing only |
| friend class BoundedVarianceTestPeer; |
| |
| // Vectors of partial values stored for automatic clamping. |
| std::vector<T> pos_sum_, neg_sum_; |
| std::vector<double> pos_sum_of_squares_, neg_sum_of_squares_; |
| uint64_t raw_count_; |
| T lower_, upper_; |
| |
| // Used to construct mechanism once bounds are obtained. |
| std::unique_ptr<NumericalMechanismBuilder> mechanism_builder_; |
| const double l0_sensitivity_; |
| const int max_contributions_per_partition_; |
| |
| std::unique_ptr<NumericalMechanism> sum_mechanism_; |
| std::unique_ptr<NumericalMechanism> sos_mechanism_; |
| std::unique_ptr<NumericalMechanism> count_mechanism_; |
| |
| // If this is not nullptr, we are automatically determining bounds. Otherwise, |
| // lower and upper contain the manually set bounds. |
| std::unique_ptr<ApproxBounds<T>> approx_bounds_; |
| }; |
| |
| } // namespace differential_privacy |
| |
| #endif // DIFFERENTIAL_PRIVACY_ALGORITHMS_BOUNDED_VARIANCE_H_ |