| // |
| // Copyright 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| #ifndef DIFFERENTIAL_PRIVACY_ALGORITHMS_APPROX_BOUNDS_H_ |
| #define DIFFERENTIAL_PRIVACY_ALGORITHMS_APPROX_BOUNDS_H_ |
| |
| #include <stdint.h> |
| #include <stdlib.h> |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <functional> |
| #include <limits> |
| #include <memory> |
| #include <optional> |
| #include <type_traits> |
| #include <utility> |
| #include <vector> |
| |
| #include <cstdint> |
| #include "google/protobuf/any.pb.h" |
| #include "absl/memory/memory.h" |
| #include "absl/status/status.h" |
| #include "base/statusor.h" |
| #include "algorithms/algorithm.h" |
| #include "algorithms/numerical-mechanisms.h" |
| #include "algorithms/util.h" |
| #include "proto/util.h" |
| #include "base/status_macros.h" |
| |
| namespace differential_privacy { |
| |
| // Find the approximate bounds of a set of numbers using logarithmic histogram |
| // bins. Like other algorithms, ApproxBounds assumes that it only gets one input |
| // per user. |
| // |
| // The algorithm takes as parameters num_bins, scale, and base to construct |
| // a logarithmic histogram with num_bins number of bins. Scale and base |
| // determine bin boundaries. Two histograms are created: one for positives and |
| // another for negatives. |
| // |
| // Without loss of generality, bin i contains the number of inputs whose most |
| // significant bit represents a number that lies in the range |
| // (scale * base^(i-1), scale * base^i]. There are two exceptions: |
| // - Positive bin 0 has boundaries [0, scale * base^0]. Negative bin 0 does |
| // not contain 0. |
| // - When the highest included positive number in the histogram is the max |
| // numeric limit for the data type, the lowest negative bin, instead of |
| // containing [-1 * max_numeric_limit, x), will contain |
| // [min_numeric_limit, x). This is because the min_numeric_limit is |
| // sometimes one greater in magnitude than the max_numeric_limit. |
| // |
| // To generate the output, first noise is added to each bin count. Then, the |
| // success_probability is used to determine a threshold count. The success |
| // probability is the probability that, given our dataset is empty, all bins |
| // have noised counts that are less than the threshold count. Therefore, |
| // increasing success_probability will increase the threshold and the |
| // probability that bounds are too tight. Alternatively, the threshold can be |
| // passed as a parameter directly. |
| // |
| // For the approx upper bound, we choose the rightmost bin that succeeds the |
| // threshold count and return its upper boundary. Similarly, for the approx |
| // lower bound we choose the leftmost bin that succeeds the threshold count and |
| // return its lower boundary. If the success_probability is too high it is |
| // possible that no bin is greater than the threshold. In this case, we reduce |
| // the success probability (and thereby the threshold) and see whether any bins |
| // exceed the new threshold. We repeat this until a bin exceeds the threshold or |
| // the success probability becomes small enough that the bounds we would find |
| // are likely to be due to noise. If we still have not found bounds, we return |
| // an error status in the output. |
| // |
| // For example, if |
| // scale = 2, base = 1, num_bins = 4, inputs = {0, 0, 0, 0, 1, 3, 7, 8, 8, 8} |
| // We have histogram bins and counts |
| // [0, 1]: 5 |
| // (1, 2]: 0 |
| // (2, 4]: 1 |
| // (4, 8]: 4 |
| // Then if success_probability=.9 and epsilon=1 we will obtain approximately |
| // threshold=3.5. Since the count of bin (4, 8] > threshold, we return an |
| // approx max of 2^3 = 8. Since the count of bin [0,1] > threshold, we return an |
| // approx min of 0. |
| template <typename T> |
| class ApproxBounds : public Algorithm<T> { |
| static_assert(std::is_arithmetic<T>::value, |
| "ApproxBounds can only be used for arithmetic types"); |
| |
| public: |
| // Builder to construct ApproxBounds objects. |
| class Builder; |
| |
| void AddEntry(const T& input) override { AddMultipleEntries(input, 1); } |
| |
| // Serialize the positive and negative bin counts. |
| Summary Serialize() const override { |
| ApproxBoundsSummary am_summary; |
| *am_summary.mutable_pos_bin_count() = {pos_bins_.begin(), pos_bins_.end()}; |
| *am_summary.mutable_neg_bin_count() = {neg_bins_.begin(), neg_bins_.end()}; |
| Summary summary; |
| summary.mutable_data()->PackFrom(am_summary); |
| return summary; |
| } |
| |
| // Retrieve positive and negative bin counts from summary and add them. |
| absl::Status Merge(const Summary& summary) override { |
| if (!summary.has_data()) { |
| return absl::InternalError( |
| "Cannot merge summary with no histogram data."); |
| } |
| ApproxBoundsSummary am_summary; |
| if (!summary.data().UnpackTo(&am_summary)) { |
| return absl::InternalError( |
| "Approximate bounds summary unable to be unpacked."); |
| } |
| |
| if (pos_bins_.size() != am_summary.pos_bin_count_size() || |
| neg_bins_.size() != am_summary.neg_bin_count_size()) { |
| return absl::InternalError( |
| "Merged approximate max summary must have the same number of " |
| "bin counts as this histogram."); |
| } |
| |
| // Add bin count from summary to each bin. |
| for (int i = 0; i < pos_bins_.size(); ++i) { |
| pos_bins_[i] += am_summary.pos_bin_count(i); |
| neg_bins_[i] += am_summary.neg_bin_count(i); |
| } |
| return absl::OkStatus(); |
| } |
| |
| int64_t MemoryUsed() override { |
| int64_t memory = sizeof(ApproxBounds<T>) + |
| sizeof(int64_t) * neg_bins_.capacity() + |
| sizeof(int64_t) * pos_bins_.capacity() + |
| sizeof(T) * noisy_neg_bins_.capacity() + |
| sizeof(T) * noisy_pos_bins_.capacity(); |
| if (mechanism_) { |
| memory += mechanism_->MemoryUsed(); |
| } |
| return memory; |
| } |
| |
| // Return the number of positive bins. This function and the following |
| // functions are exposed for calling classes of ApproxBounds to store partial |
| // values corresponding to bins. Virtual for testing. |
| int64_t NumPositiveBins() { return pos_bins_.size(); } |
| |
| // Find the most significant bit of the magnitude of the value. For the |
| // special case 0, return 0. This is used as the bin index. |
| int MostSignificantBit(T value) { |
| // Handle 0 value seperately since log(0) is undefined. |
| if (value == 0) { |
| return 0; |
| } |
| |
| // Clamp infinities to highest and lowest value. |
| value = Clamp(std::numeric_limits<T>::lowest(), |
| std::numeric_limits<T>::max(), value); |
| |
| // Sometimes the minimum numeric limit has greater magnitude than the |
| // maximum. In this case clamp its magnitude at the maximum numeric limit to |
| // find msb. In reality our negative bin will accommodate the value. |
| T abs; |
| if (value <= -1 * std::numeric_limits<T>::max()) { |
| abs = std::numeric_limits<T>::max(); |
| } else { |
| abs = std::abs(value); |
| } |
| |
| // Calculate the most significant bit and clamp to a valid bin index. |
| int msb = std::ceil((std::log(abs) - std::log(scale_)) / std::log(base_)); |
| int bin_index = |
| std::max(0, std::min(msb, static_cast<int>(pos_bins_.size() - 1))); |
| |
| // Floating-point precision errors mean that for some bin boundaries, we'll |
| // end up calculating the larger-magnitude bin rather than the smaller. |
| if ((value > 0 && value <= PosLeftBinBoundary(bin_index)) || |
| (value < 0 && value >= NegLeftBinBoundary(bin_index))) { |
| return std::max(0, bin_index - 1); |
| } |
| return bin_index; |
| } |
| |
| // Splits the value into the elements of the partials vector, each of which |
| // corresponds to a bin in ApproxBounds. make_partial is a function that given |
| // two numbers, returns the partial value corresponding to if those numbers |
| // are the bounds. For example, if we were storing partial sums, make_partials |
| // would be the difference function. |
| // |
| // We split the value into partial sums which corresponds to each bucket of |
| // ApproxBounds. For example, consider value = 7 and the bins (0, 1], (1, 2], |
| // (2, 4], (4, 8], and the four corresponding negative bins. We would store |
| // partial sums: |
| // (0, 1]: 1 |
| // (1, 2]: 1 |
| // (2, 4]; 2 |
| // (4, 8]: min(3, 4) = 3 |
| // |
| // Then later, say our bounds are [0, 4]. We can then sum the partial values |
| // that lie in bins that are included in the bounds. In our case it is bins |
| // (0, 1], (1, 2], (2, 4]. So 1 + 1 + 2 = 4. This is the same result if our |
| // value 7 was initially clamped between [0, 4]. |
| template <typename T2> |
| void AddToPartials(std::vector<T2>* partials, T value, |
| std::function<T2(T, T)> make_partial) { |
| AddMultipleEntriesToPartials<T2>(partials, value, 1, make_partial); |
| } |
| |
| template <typename T2> |
| void AddToPartialSums(std::vector<T2>* sums, T value) { |
| AddMultipleEntriesToPartialSums<T2>(sums, value, 1); |
| } |
| |
| // Given two vectors of partial values, add the partials in the bins between |
| // the boundaries corresponding to lower and upper to get the clamped value. |
| // The value_transform and count parameters are used to calculate the |
| // contribution of values clamped below lower or above upper, if applicable. |
| template <typename T2> |
| base::StatusOr<T2> ComputeFromPartials(const std::vector<T2>& pos_partials, |
| const std::vector<T2>& neg_partials, |
| std::function<T2(T)> value_transform, |
| T lower, T upper, int64_t count) { |
| RETURN_IF_ERROR(ValidateIsNonNegative(count, "Count")); |
| |
| // Find value by adding the partial values corresponding to bins that are |
| // between the lower and upper bound. ApproxBounds will always return a |
| // bin boundary as lower and upper bounds. |
| int lower_msb = MostSignificantBit(lower); |
| int upper_msb = MostSignificantBit(upper); |
| |
| // Find value from its per-bin partials. |
| T2 value = 0; |
| if (lower <= 0 && 0 <= upper) { |
| // If 0 is in [lower, upper], then we sum partial values corresponding |
| // to 0 to upper, and also from 0 to lower in the negative vectors. |
| if (lower < 0) { |
| for (int i = 0; i <= lower_msb; ++i) { |
| value += neg_partials[i]; |
| } |
| } |
| if (upper > 0) { |
| for (int i = 0; i <= upper_msb; ++i) { |
| value += pos_partials[i]; |
| } |
| } |
| } else if (upper < 0) { |
| // If lower and upper are negative, each value is clamped so that they |
| // contributed at most upper. Anything less they contributed is stored |
| // in partial values between lower and upper, which we add. |
| value += count * value_transform(upper); |
| for (int i = upper_msb + 1; i <= lower_msb; ++i) { |
| value += neg_partials[i]; |
| } |
| } else { // 0 < lower <= upper |
| // If lower and upper are both positive, each value is clamped to it |
| // contributed at least lower. Anything more contributed is stored |
| // between lower and upper in positive vectors, which we add. |
| value += count * value_transform(lower); |
| for (int i = lower_msb + 1; i <= upper_msb; ++i) { |
| value += pos_partials[i]; |
| } |
| } |
| return value; |
| } |
| |
| // Get additional private information in the form of a BoundingReport. Will |
| // populate any fields possible and leave the rest blank. |
| BoundingReport GetBoundingReport(T lower, T upper) { |
| BoundingReport report; |
| SetValue<T>(report.mutable_lower_bound(), lower); |
| SetValue<T>(report.mutable_upper_bound(), upper); |
| base::StatusOr<double> count = NumInputs(); |
| base::StatusOr<double> count_outside = NumInputsOutside(lower, upper); |
| if (count.ok()) { |
| report.set_num_inputs(count.value()); |
| } |
| if (count_outside.ok()) { |
| report.set_num_outside(count_outside.value()); |
| } |
| return report; |
| } |
| |
| // Return the number of positive bins for testing. |
| int64_t GetNumPosBinsForTesting() { return pos_bins_.size(); } |
| |
| // Returns a pointer to the mechanism for testing. Does not transfer |
| // ownership. |
| NumericalMechanism* GetMechanismForTesting() { return mechanism_.get(); } |
| |
| protected: |
| ApproxBounds(double epsilon, int64_t num_bins, double scale, double base, |
| double success_probability, bool has_user_set_threshold, |
| std::unique_ptr<NumericalMechanism> mechanism) |
| : Algorithm<T>(epsilon), |
| pos_bins_(num_bins, 0), |
| neg_bins_(num_bins, 0), |
| bin_boundaries_(num_bins, 0), |
| scale_(scale), |
| base_(base), |
| success_probability_(success_probability), |
| has_user_set_threshold_(has_user_set_threshold), |
| mechanism_(std::move(mechanism)) { |
| // Cache the bin boundary magnitudes for performance. Note that casting |
| // numeric limits lead to inconsistencies. |
| auto get_boundary = [boundary = scale_, base = base_]() mutable { |
| if (boundary >= std::numeric_limits<T>::max() / base) { |
| return std::numeric_limits<T>::max(); |
| } |
| double this_boundary = boundary; |
| boundary *= base; |
| return static_cast<T>(this_boundary); |
| }; |
| std::generate(bin_boundaries_.begin(), bin_boundaries_.end(), get_boundary); |
| } |
| |
| // Returns an output containing approximate min as the first element and |
| // approximate max as the second element. If not enough inputs exist to pass |
| // the threshold, populate the output with an error status. |
| base::StatusOr<Output> GenerateResult(double noise_interval_level) override { |
| // Populate noisy versions of the histogram bins. |
| noisy_pos_bins_ = AddNoise(pos_bins_); |
| noisy_neg_bins_ = AddNoise(neg_bins_); |
| |
| double success_probability = success_probability_; |
| |
| std::optional<Output> output; |
| int bounding_attempts = 0; |
| int max_bounding_attempts = 30; |
| do { |
| double threshold = mechanism_->Quantile( |
| std::pow(success_probability, 1.0 / (2 * pos_bins_.size()))); |
| |
| output = findBounds(threshold); |
| |
| if (has_user_set_threshold_) { |
| // The user asked for a specific threshold, so don't try again with a |
| // looser threshold. |
| break; |
| } |
| |
| double failure_probability = 1 - success_probability; |
| success_probability = 1 - 10 * failure_probability; |
| bounding_attempts++; |
| } while (!output.has_value() && |
| success_probability > kMinSuccessProbability && |
| bounding_attempts < max_bounding_attempts); |
| |
| // Record error status if approx min or max was not found. |
| if (!output.has_value() || output->elements_size() < 2) { |
| return absl::FailedPreconditionError( |
| "Bin count threshold was too large to find approximate " |
| "bounds. Either run over a larger dataset or decrease " |
| "success_probability and try again."); |
| } |
| |
| return *output; |
| } |
| |
| // Finds approximate bounds by comparing noised bin counts to a threshold. |
| // This method does not add any noise (it assumes that noisy_pos_bins_ and |
| // noisy_neg_bins_ have been initialised) so calling this method multiple |
| // times with different thresholds is DP: the noised histogram is itself DP. |
| std::optional<Output> findBounds(double threshold) { |
| std::optional<T> lowerBound = findLowerBound(threshold); |
| if (!lowerBound.has_value()) { |
| return std::nullopt; |
| } |
| |
| std::optional<T> upperBound = findUpperBound(threshold); |
| if (!upperBound.has_value()) { |
| return std::nullopt; |
| } |
| |
| Output output; |
| AddToOutput(&output, *lowerBound); |
| AddToOutput(&output, *upperBound); |
| return output; |
| } |
| |
| void ResetState() override { |
| std::fill(pos_bins_.begin(), pos_bins_.end(), 0); |
| std::fill(neg_bins_.begin(), neg_bins_.end(), 0); |
| } |
| |
| // Given a bin index, finds the larger-magnitude boundary of the corresponding |
| // bin for negative bin. |
| T NegRightBinBoundary(int bin_index) { |
| T pos_rbb = PosRightBinBoundary(bin_index); |
| if (pos_rbb == std::numeric_limits<T>::max()) { |
| return std::numeric_limits<T>::lowest(); |
| } |
| return -1 * pos_rbb; |
| } |
| |
| // Given a bin index, finds the larger-magnitude boundary of the corresponding |
| // bin for positive bin. |
| T PosRightBinBoundary(int bin_index) { return bin_boundaries_[bin_index]; } |
| |
| private: |
| // Adds input num_of_entries times to the bins. |
| void AddMultipleEntries(const T& input, int64_t num_of_entries) { |
| // REF: |
| // https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function |
| absl::Status status = |
| ValidateIsPositive(num_of_entries, "Number of entries"); |
| if (std::isnan(static_cast<double>(input)) || !status.ok()) { |
| return; |
| } |
| |
| // Place into correct bin according to most significant bit and sign. Note |
| // that MostSignificantBit returns 0 for 0. |
| int index = MostSignificantBit(input); |
| if (input >= 0) { |
| pos_bins_[index] += num_of_entries; |
| } else { // value < 0 |
| neg_bins_[index] += num_of_entries; |
| } |
| } |
| |
| // Adds value to partials (as described in comment for AddToPartials()) |
| // num_of_entries times. This function more efficiently adds multiple entries |
| // at once, instead of using AddToPartials() in a for-loop. |
| template <typename T2> |
| void AddMultipleEntriesToPartials(std::vector<T2>* partials, T value, |
| int64_t num_of_entries, |
| std::function<T2(T, T)> make_partial) { |
| // REF: |
| // https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function |
| absl::Status status = |
| ValidateIsPositive(num_of_entries, "Number of entries"); |
| if (std::isnan(static_cast<double>(value)) || !status.ok()) { |
| return; |
| } |
| |
| int msb = MostSignificantBit(value); |
| |
| // Each bin of the logarithmic histograms in ApproxBounds can be a candidate |
| // for auto-determined upper and lower bounds. Thus, we store a contribution |
| // of the value from the value for each bin. |
| for (int i = 0; i <= msb; ++i) { |
| // The maximum contribution to the bin is the partial between boundaries. |
| T2 partial = 0; |
| if (value >= 0) { |
| partial = make_partial(PosRightBinBoundary(i), PosLeftBinBoundary(i)); |
| } else { |
| partial = make_partial(NegRightBinBoundary(i), NegLeftBinBoundary(i)); |
| } |
| |
| if (i < msb) { |
| // For indices below the msb, add the maximum contribution |
| // (num_of_entries times) to the partial. |
| |
| (*partials)[i] += partial * num_of_entries; |
| } else { |
| // For i = msb, add the remaining contribution (num_of_entries times), |
| // but not more than the maximum contribution to the partial for this |
| // bin. This may occur if the msb was clamped by the ApproxBounds not |
| // having enough bins. |
| T2 remainder; |
| if (value > 0) { |
| remainder = make_partial(value, PosLeftBinBoundary(i)); |
| } else { |
| remainder = make_partial(value, NegLeftBinBoundary(i)); |
| } |
| if (std::abs(partial) < std::abs(remainder)) { |
| (*partials)[msb] += partial * num_of_entries; |
| } else { |
| (*partials)[msb] += remainder * num_of_entries; |
| } |
| } |
| } |
| } |
| |
| // Break value into its partial sums and store it into the sums vector. A |
| // specific use case of AddToPartials used in some algorithms. |
| template <typename T2> |
| void AddMultipleEntriesToPartialSums(std::vector<T2>* sums, T value, |
| int64_t num_of_entries) { |
| AddMultipleEntriesToPartials<T2>( |
| sums, value, num_of_entries, |
| [](T val1, T val2) { return val1 - val2; }); |
| } |
| |
| // Add noise to each member of bins and return noisy vector. |
| const std::vector<T> AddNoise(const std::vector<int64_t>& bins) { |
| std::vector<T> noisy_bins(bins.size()); |
| for (int i = 0; i < bins.size(); ++i) { |
| noisy_bins[i] = mechanism_->AddNoise(bins[i]); |
| } |
| return noisy_bins; |
| } |
| |
| // Given a bin index, finds the smaller-magnitude boundary of the |
| // corresponding bin for positive bin. |
| T PosLeftBinBoundary(int bin_index) { |
| if (bin_index == 0) { |
| return 0; |
| } |
| return PosRightBinBoundary(bin_index - 1); |
| } |
| |
| // Given a bin index, finds the smaller-magnitude boundary of the |
| // corresponding bin for negative bin. |
| T NegLeftBinBoundary(int bin_index) { |
| return -1 * PosLeftBinBoundary(bin_index); |
| } |
| |
| // Calculate the noisy number of inputs outside the two bounds from the |
| // most recent result generation. Inputs equal to either bound may or may not |
| // be part of the count. Input lower and upper are rounded to the nearest |
| // larger-magnitude bin boundary. |
| base::StatusOr<double> NumInputsOutside(T lower, T upper) { |
| // Check that noisy bins have been populated. |
| if (noisy_pos_bins_.empty()) { |
| return absl::InvalidArgumentError( |
| "Noisy histogram bins have not been created. Try generating " |
| "results first."); |
| } |
| |
| int lower_msb = MostSignificantBit(lower); |
| int upper_msb = MostSignificantBit(upper); |
| double num_outside = 0; |
| |
| // Add the count of inputs below lower. |
| int pos_i = 0; |
| int neg_i = noisy_neg_bins_.size(); |
| if (lower == 0) { |
| neg_i = -1; |
| } else if (lower < 0) { |
| neg_i = lower_msb; |
| } else { // lower > 0 |
| neg_i = -1; |
| pos_i = lower_msb + 1; |
| } |
| for (int i = noisy_neg_bins_.size() - 1; i > neg_i; --i) { |
| num_outside += noisy_neg_bins_[i]; |
| } |
| for (int i = 0; i < pos_i; ++i) { |
| num_outside += noisy_pos_bins_[i]; |
| } |
| |
| // Add the count of inputs above upper. |
| pos_i = noisy_pos_bins_.size(); |
| neg_i = -1; |
| if (upper == 0) { |
| pos_i = 0; |
| } else if (upper < 0) { |
| pos_i = 0; |
| neg_i = upper_msb; |
| } else { // upper > 0. |
| pos_i = upper_msb + 1; |
| } |
| for (int i = neg_i; i >= 0; --i) { |
| num_outside += noisy_neg_bins_[i]; |
| } |
| for (int i = pos_i; i < noisy_pos_bins_.size(); ++i) { |
| num_outside += noisy_pos_bins_[i]; |
| } |
| |
| return num_outside; |
| } |
| |
| // Get the noisy number of total inputs by summing all noisy histogram bins. |
| base::StatusOr<double> NumInputs() { |
| // Number of inputs outside of the empty set. |
| return NumInputsOutside(0, 0); |
| } |
| |
| // Friend class for testing only. |
| friend class ApproxBoundsTestPeer; |
| |
| // Needed for classes that rely on ApproxBounds::AddMultipleEntries() |
| template <typename T2> |
| friend class BoundedMeanWithApproxBounds; |
| template <typename T2> |
| friend class BoundedVarianceWithApproxBounds; |
| |
| private: |
| std::optional<T> findLowerBound(double threshold) { |
| // Find first bin above threshold for minimum. |
| for (int i = neg_bins_.size() - 1; i >= 0; --i) { |
| if (noisy_neg_bins_[i] >= threshold) { |
| return NegRightBinBoundary(i); |
| } |
| } |
| for (int i = 0; i < pos_bins_.size(); ++i) { |
| if (noisy_pos_bins_[i] >= threshold) { |
| return PosLeftBinBoundary(i); |
| } |
| } |
| return std::nullopt; |
| } |
| |
| std::optional<T> findUpperBound(double threshold) { |
| // Find first bin above threshold for maximum. |
| for (int i = pos_bins_.size() - 1; i >= 0; --i) { |
| if (noisy_pos_bins_[i] >= threshold) { |
| return PosRightBinBoundary(i); |
| } |
| } |
| |
| for (int i = 0; i < neg_bins_.size(); ++i) { |
| if (noisy_neg_bins_[i] >= threshold) { |
| return NegLeftBinBoundary(i); |
| } |
| } |
| |
| return std::nullopt; |
| } |
| |
| // Count the values in each logarithmic bin for positives and negatives. |
| std::vector<int64_t> pos_bins_; |
| std::vector<int64_t> neg_bins_; |
| |
| // Noisy DP counts of the positive and negative bins. Populated upon |
| // generating the result. |
| std::vector<T> noisy_pos_bins_; |
| std::vector<T> noisy_neg_bins_; |
| |
| // The bin boundary magnitudes, starting from lowest positive magnitude. |
| std::vector<T> bin_boundaries_; |
| |
| // Multiplicative factor for inputs |
| double scale_; |
| |
| // Base of the logarithm. |
| double base_; |
| |
| // The desired probability that, when the dataset is empty, no bin counts are |
| // above the threshold for determing whether a bin is empty. |
| double success_probability_; |
| |
| // The minimum allowed success probability when relaxing the success |
| // probability. If approx bounds fails to find bounds, it will reduce the |
| // success probability and, if the reduced success probability is still |
| // greater than kMinSuccessProbability, attempt to find bounds with the |
| // reduced success probability. Having a minimum success probability ensures |
| // we fail rather than returning bounds that are just due to noised empty |
| // bins. |
| static constexpr double kMinSuccessProbability = 1 - 1e-3; |
| |
| // Whether the user chose a specific threshold for determining whether a bin |
| // is empty, rather than using a value computed from success_probability_. |
| bool has_user_set_threshold_; |
| |
| // Mechanism for adding noise to buckets. |
| std::unique_ptr<NumericalMechanism> mechanism_; |
| }; |
| |
| template <typename T> |
| class ApproxBounds<T>::Builder { |
| public: |
| ApproxBounds<T>::Builder& SetEpsilon(double epsilon) { |
| epsilon_ = epsilon; |
| return *this; |
| } |
| |
| // This is just a stub that will be implemented once we have support for |
| // Gaussian. |
| ApproxBounds<T>::Builder& SetDelta(double delta) { return *this; } |
| |
| ApproxBounds<T>::Builder& SetMaxPartitionsContributed( |
| int max_partitions_contributed) { |
| max_partitions_contributed_ = max_partitions_contributed; |
| return *this; |
| } |
| |
| ApproxBounds<T>::Builder& SetMaxContributionsPerPartition( |
| int max_contributions_per_partition) { |
| max_contributions_per_partition_ = max_contributions_per_partition; |
| return *this; |
| } |
| |
| ApproxBounds<T>::Builder& SetLaplaceMechanism( |
| std::unique_ptr<NumericalMechanismBuilder> builder) { |
| mechanism_builder_ = std::move(builder); |
| return *this; |
| } |
| |
| ApproxBounds<T>::Builder& SetNumBins(int64_t num_bins) { |
| num_bins_ = num_bins; |
| return *this; |
| } |
| |
| ApproxBounds<T>::Builder& SetScale(double scale) { |
| scale_ = scale; |
| return *this; |
| } |
| |
| ApproxBounds<T>::Builder& SetBase(double base) { |
| base_ = base; |
| return *this; |
| } |
| |
| // Set exactly one of success_probability or k threshold. |
| ApproxBounds<T>::Builder& SetSuccessProbability(double success_probability) { |
| success_probability_ = success_probability; |
| threshold_.reset(); |
| return *this; |
| } |
| |
| // Set exactly one of success_probability or k threshold. Not recommended |
| // for use in non-test code: if you know enough about your sample |
| // distribution to choose a value for this parameter, then you probably know |
| // enough to choose sensible bounds for your sample. |
| ABSL_DEPRECATED("Use SetThresholdForTest instead") |
| ApproxBounds<T>::Builder& SetThreshold(double threshold) { |
| return SetThresholdForTest(threshold); |
| } |
| |
| // Set exactly one of success_probability or k threshold. Not recommended |
| // for use in non-test code: if you know enough about your sample |
| // distribution to choose a value for this parameter, then you probably know |
| // enough to choose sensible bounds for your sample. |
| ApproxBounds<T>::Builder& SetThresholdForTest(double threshold) { |
| threshold_ = threshold; |
| return *this; |
| } |
| |
| base::StatusOr<std::unique_ptr<ApproxBounds<T>>> Build() { |
| if (!epsilon_.has_value()) { |
| epsilon_ = DefaultEpsilon(); |
| LOG(WARNING) << "Default epsilon of " << epsilon_.value() |
| << " is being used. Consider setting your own epsilon based " |
| "on privacy considerations."; |
| } |
| |
| RETURN_IF_ERROR(ValidateEpsilon(epsilon_)); |
| RETURN_IF_ERROR( |
| ValidateMaxPartitionsContributed(max_partitions_contributed_)); |
| RETURN_IF_ERROR( |
| ValidateMaxContributionsPerPartition(max_contributions_per_partition_)); |
| |
| // Check the validity of the histogram parameters. num_bin and |
| // success_probability restrictions prevent undefined threshold |
| // calculation. |
| RETURN_IF_ERROR(ValidateIsPositive(num_bins_, "Number of bins")); |
| RETURN_IF_ERROR(ValidateIsFiniteAndPositive(scale_, "Scale")); |
| RETURN_IF_ERROR(ValidateIsFinite(base_, "Base")); |
| RETURN_IF_ERROR(ValidateIsGreaterThanOrEqualTo(base_, 1, "Base")); |
| |
| // TODO: Handle case where scale * base^num_bins > |
| // std::numeric_limits<T>::max, even though the ApproxBounds constructor |
| // addresses this. |
| if (threshold_.has_value()) { |
| RETURN_IF_ERROR(ValidateIsFinite(threshold_.value(), "k threshold")); |
| RETURN_IF_ERROR(ValidateIsNonNegative(threshold_.value(), "k threshold")); |
| } else { |
| RETURN_IF_ERROR(ValidateIsInExclusiveInterval(success_probability_, 0, 1, |
| "Success probability")); |
| } |
| |
| ASSIGN_OR_RETURN(std::unique_ptr<NumericalMechanism> mechanism, |
| mechanism_builder_->SetEpsilon(epsilon_.value()) |
| .SetL0Sensitivity(max_partitions_contributed_) |
| .SetLInfSensitivity(max_contributions_per_partition_) |
| .Build()); |
| |
| if (threshold_.has_value()) { |
| // If the user specified a threshold rather than a success probability, |
| // then calculate the success probability that corresponds to the |
| // threshold. |
| success_probability_ = |
| std::pow(mechanism->Cdf(threshold_.value()), 2 * num_bins_); |
| } |
| |
| // Create ApproxBounds. |
| return absl::WrapUnique(new ApproxBounds( |
| epsilon_.value(), num_bins_, scale_, base_, success_probability_, |
| threshold_.has_value(), std::move(mechanism))); |
| } |
| |
| private: |
| // Default scale depends on the input type T. |
| static double DefaultScaleForT() { |
| if (std::is_integral<T>::value) { |
| return 1.0; |
| } else { |
| return std::numeric_limits<T>::min(); |
| } |
| } |
| |
| absl::optional<double> epsilon_; |
| int max_partitions_contributed_ = 1; |
| int max_contributions_per_partition_ = 1; |
| std::unique_ptr<NumericalMechanismBuilder> mechanism_builder_ = |
| absl::make_unique<LaplaceMechanism::Builder>(); |
| |
| absl::optional<double> threshold_; |
| double scale_ = DefaultScaleForT(); |
| double base_ = 2.0; |
| double success_probability_ = 1 - std::pow(10, -9); |
| |
| // Take the subtraction of two logarithms to prevent overflows. |
| int64_t num_bins_ = |
| std::ceil((std::log(std::numeric_limits<T>::max()) - std::log(scale_)) / |
| std::log(base_)) + |
| 1; |
| }; |
| |
| } // namespace differential_privacy |
| |
| #endif // DIFFERENTIAL_PRIVACY_ALGORITHMS_APPROX_BOUNDS_H_ |