blob: 42bcb0b5267eca906afd9ee61fa600d64e63a4b8 [file] [log] [blame]
//
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILE_TREE_H_
#define DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILE_TREE_H_
#include <cmath>
#include <cstdlib>
#include <limits>
#include <unordered_map>
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "algorithms/algorithm.h"
#include "algorithms/bounded-algorithm.h"
#include "algorithms/internal/count-tree.h"
#include "algorithms/numerical-mechanisms.h"
#include "proto/util.h"
#include "proto/confidence-interval.pb.h"
#include "proto/summary.pb.h"
#include "base/status_macros.h"
namespace differential_privacy {
// A small tolerance on the quantile we're searching for. We'll be aiming to
// return a value that's within this tolerance of the chosen quantile. This is
// a post-processing parameter with no privacy implications.
constexpr double kNumericalTolerance = 1.0e-6;
// Default tree parameters. Will result in splitting the input space into 16^4
// = 65536 equal buckets. Using a larger height or branching factor will
// split the input space more finely, resulting in greater precision but also
// increasing space used. Increasing the height will increase the amount of
// noise that is added. These parameters were selected based on experiments.
constexpr int kDefaultTreeHeight = 4;
constexpr int kDefaultBranchingFactor = 16;
// Fraction a node needs to contribute to the total count of itself and its
// siblings to be considered during the search for a particular quantile. The
// idea of alpha is to filter out noisy empty nodes. This is a post processing
// parameter with no privacy implications.
constexpr double kAlpha = 0.005;
// Calculates differentially private quantiles using a tree-based data
// structure. See a full writeup of the algorithm at:
// https://github.com/google/differential-privacy/blob/main/common_docs/Differentially_Private_Quantile_Trees.pdf
//
// This algorithm can be used to calculate an arbitrarily large number of
// quantiles with no loss in accuracy or additional expenditure of privacy
// budget.
//
// This is not an Algorithm, and does not behave in the same way as other
// algorithms. For a quantile implementation that follows the Algorithm
// interface, see multi-quantile.h.
template <typename T>
class QuantileTree {
public:
class Builder;
class Privatized;
void AddEntry(const T& input) { AddMultipleEntries(input, 1); }
// Removes all input from the QuantileTree. After calling this method, the
// QuantileTree will be equivalent to one that is newly initialized with no
// input added.
void Reset() { tree_.ClearNodes(); }
struct DPParams {
double epsilon;
double delta;
int max_contributions_per_partition;
int max_partitions_contributed_to;
std::unique_ptr<NumericalMechanismBuilder> mechanism_builder;
};
// Returns a private version of the quantile tree, which can be used to get
// differentially private quantiles. Each call to this method expends the
// epsilon and delta specified in the params.
absl::StatusOr<Privatized> MakePrivate(const DPParams& params) {
ASSIGN_OR_RETURN(
std::unique_ptr<NumericalMechanism> mech,
params.mechanism_builder->SetEpsilon(params.epsilon)
.SetDelta(params.delta)
.SetL0Sensitivity(params.max_partitions_contributed_to *
tree_.GetHeight())
.SetLInfSensitivity(params.max_contributions_per_partition)
.Build());
return Privatized(upper_, lower_, std::move(mech), tree_);
}
BoundedQuantilesSummary Serialize() {
BoundedQuantilesSummary to_return = tree_.Serialize();
to_return.set_lower(lower_);
to_return.set_upper(upper_);
return to_return;
}
absl::Status Merge(const BoundedQuantilesSummary& summary) {
if (static_cast<double>(lower_) != summary.lower() ||
static_cast<double>(upper_) != summary.upper()) {
return absl::InternalError(absl::StrCat(
"Bounds mismatch. Tree: [", lower_, ", ", upper_, "] ",
", summary: [", summary.lower(), ", ", summary.upper(), "]"));
}
return tree_.Merge(summary);
}
int64_t MemoryUsed() {
return sizeof(QuantileTree) - sizeof(internal::CountTree) +
tree_.MemoryUsed();
}
int GetHeight() { return tree_.GetHeight(); }
int GetBranchingFactor() { return tree_.GetBranchingFactor(); }
private:
QuantileTree(T lower, T upper, int tree_height, int branching_factor)
: lower_(lower), upper_(upper), tree_(tree_height, branching_factor) {}
int getLeafIndex(T input) {
double leaf_fraction =
static_cast<double>(input - lower_) / (upper_ - lower_);
return tree_.GetNthLeaf(leaf_fraction * (tree_.GetNumberOfLeaves() - 1));
}
void AddMultipleEntries(const T& input, const int64_t times) {
// REF:
// https://stackoverflow.com/questions/61646166/how-to-resolve-fpclassify-ambiguous-call-to-overloaded-function
if (std::isnan(static_cast<double>(input))) {
return;
}
if (times <= 0) {
return;
}
int currentNode = getLeafIndex(Clamp(lower_, upper_, input));
while (currentNode > tree_.GetRoot()) {
tree_.IncrementNodeBy(currentNode, times);
currentNode = tree_.Parent(currentNode);
}
}
T lower_;
T upper_;
internal::CountTree tree_;
friend class QuantileTreeTestPeer;
};
// A private version of a quantile tree. Used for calculating differentially
// private quantiles. It will contain raw data internally, but only
// differentially private results can be accessed.
template <typename T>
class QuantileTree<T>::Privatized {
public:
absl::StatusOr<double> GetQuantile(double quantile) {
if (quantile < 0 || quantile > 1) {
return absl::InvalidArgumentError(absl::StrCat(
"Requested quantile must be in [0, 1] but was ", quantile));
}
quantile = ClampQuantile(quantile);
int current_node = raw_tree_.GetRoot();
while (!raw_tree_.IsLeaf(current_node)) {
int left_most_child = raw_tree_.LeftMostChild(current_node);
int right_most_child = raw_tree_.RightMostChild(current_node);
double total_count = 0.0;
for (int i = left_most_child; i <= right_most_child; ++i) {
total_count += GetNoisedCount(i);
}
// All child nodes appear to be empty. No need to continue down the tree.
if (total_count <= 0) break;
// Remove nodes that make up less than an alpha fraction of the total -
// these are likely empty.
double corrected_total_count = 0.0;
for (int i = left_most_child; i <= right_most_child; ++i) {
corrected_total_count +=
GetNoisedCount(i) >= total_count * kAlpha ? GetNoisedCount(i) : 0.0;
}
// All child nodes have a negligible noisy count. We can't tell whether
// they have any elements in them, and if so how many, so we can stop
// and pick the middle of this range.
if (corrected_total_count <= 0) break;
double partial_count = 0.0;
for (int i = left_most_child; i <= right_most_child; ++i) {
double count = GetNoisedCount(i);
// Ignore nodes we think are empty.
partial_count += count >= total_count * kAlpha ? count : 0.0;
if (partial_count / corrected_total_count >=
quantile - kNumericalTolerance) {
quantile =
(quantile - (partial_count - count) / corrected_total_count) /
(count / corrected_total_count);
quantile = std::min(std::max(quantile, 0.0), 1.0);
current_node = i;
break;
}
}
}
double to_return = (1 - quantile) * GetSubtreeLowerBound(current_node) +
quantile * GetSubtreeUpperBound(current_node);
return to_return;
}
absl::StatusOr<ConfidenceInterval> ComputeNoiseConfidenceInterval(
double quantile, double confidence_interval_level) {
if (quantile < 0 || quantile > 1) {
return absl::InvalidArgumentError(
absl::StrCat("Quantile must be in [0, 1], but was ", quantile));
}
quantile = ClampQuantile(quantile);
ASSIGN_OR_RETURN(double lower_bound,
ComputeNoiseConfidenceIntervalBound(
quantile, confidence_interval_level,
ConfidenceIntervalBoundType::LOWER));
ASSIGN_OR_RETURN(double upper_bound,
ComputeNoiseConfidenceIntervalBound(
quantile, confidence_interval_level,
ConfidenceIntervalBoundType::UPPER));
ConfidenceInterval confidence_interval;
confidence_interval.set_lower_bound(lower_bound);
confidence_interval.set_upper_bound(upper_bound);
confidence_interval.set_confidence_level(confidence_interval_level);
return confidence_interval;
}
private:
friend class QuantileTree<T>;
Privatized(T upper, T lower, std::unique_ptr<NumericalMechanism> mechanism,
const internal::CountTree& raw_tree)
: raw_tree_(raw_tree),
upper_(upper),
lower_(lower),
mechanism_(std::move(mechanism)) {}
// These are used by the confidence interval computation algorithm.
enum class ConfidenceIntervalBoundType { LOWER, UPPER };
struct IndexAndQuantile {
int index;
double quantile;
IndexAndQuantile(int initial_index, double initial_quantile)
: index(initial_index), quantile(initial_quantile) {}
};
// Returns a pair of the index of the child node visited next in the quantile
// search together with the rank for the next iteration of the search. If all
// child nodes are considered empty, null is returned.
IndexAndQuantile GetNextIndexAndQuantile(
double quantile, int leftmost_child_index, int rightmost_child_index,
const std::unordered_map<int, double>& node_counts) {
double total_count = 0.0;
for (int i = leftmost_child_index; i <= rightmost_child_index; i++) {
total_count += std::max(0.0, node_counts.at(i));
}
double corrected_total_count = 0.0;
for (int i = leftmost_child_index; i <= rightmost_child_index; i++) {
// Treat child nodes contributing less than a gamma fraction to the total
// count as empty subtrees.
double node_count_i = node_counts.at(i);
corrected_total_count +=
node_count_i >= total_count * kAlpha ? node_count_i : 0.0;
}
if (corrected_total_count == 0.0) {
// Either all counts are 0.0 or no child node contributes more than an
// alpha fraction to the total count (the latter can only happen when
// alpha > 1 / branching factor, which is not the case for the default
// branching factor). This means that all child nodes are considered
// empty.
return IndexAndQuantile(-1, -1);
}
// Determine the child node whose subtree contains the bound.
double partial_count = 0.0;
for (int i = leftmost_child_index; true; i++) {
double count = node_counts.at(i);
// Skip child nodes contributing less than gamma to the total count.
if (count < total_count * kAlpha) {
continue;
}
partial_count += count;
// Check if the bound is in the current child's subtree.
if (partial_count / corrected_total_count <
quantile - kNumericalTolerance) {
continue;
}
double next_quantile =
(quantile - (partial_count - count) / corrected_total_count) /
(count / corrected_total_count);
// Clamping rank to a value between 0.0 and 1.0. Note that rank can
// become greater than 1 because of the numerical tolerance. Values
// less than 0.0 should not occur. The respective clamping is set in
// place to be on the safe side.
next_quantile = std::min(std::max(0.0, next_quantile), 1.0);
return IndexAndQuantile(i, next_quantile);
}
}
// The following computation of a lower or upper interval bound is based on
// the same search algorithm used to compute the respective quantile. The
// difference is that instead of using the noised node counts to determine the
// direction of the search, the algorithm uses the confidence intervals bounds
// of the node counts.
absl::StatusOr<double> ComputeNoiseConfidenceIntervalBound(
double quantile, double confidence_interval_level,
const ConfidenceIntervalBoundType& bound_type) {
// Let b be the branching factor and h the height of the tree. The search
// for a quantile queries at most b * h node counts. Assigning a confidence
// interval with error probability
// alpha_per_count = 1-confidence_interval_level^(1 / (b * h))
// to each of these counts guarantees that the true counts are contained
// within these confidence intervals with error probability
// 1 - (1 - alpha_per_count)^(b * h)
// = 1 - (1 - (1 - (confidence_interval_level)^(1 / (b * h))))^(b * h)
// = 1 - confidence_interval_level,
// which matches the specified error probability.
double alpha_per_count =
1 - std::pow(
confidence_interval_level,
1.0 / (raw_tree_.GetBranchingFactor() * raw_tree_.GetHeight()));
// Confidence interval of a node count of 0 with error probability
// confidence_level_per_count. All other node count confidence intervals are
// computed by shifting this interval, which is faster than calling
// ComputeConfidenceInterval() for each node count individually.
// privacy_budget is set to the maximum of 1.0 because computing the noise
// confidence intervals for quantiles does not consume any privacy budget.
ASSIGN_OR_RETURN(ConfidenceInterval zero_confidence_interval,
mechanism_->NoiseConfidenceInterval(1 - alpha_per_count));
// Value of the bound that is being computed. The value is set to the
// tightest bound possible and loosened successively as needed.
double bound =
bound_type == ConfidenceIntervalBoundType::LOWER ? upper_ : lower_;
int index = raw_tree_.GetRoot();
// Search for the index of the leaf node containing the desired bound,
// starting at the root.
while (index < raw_tree_.GetLeftMostLeaf()) {
int leftmost_child_index = raw_tree_.LeftMostChild(index);
int rightmost_child_index = raw_tree_.RightMostChild(index);
// Index of the node visited next in the search. The value is set to the
// tightest index possible and loosened successively as needed.
int next_index = bound_type == ConfidenceIntervalBoundType::LOWER
? std::numeric_limits<int>::max()
: std::numeric_limits<int>::min();
// Quantile used in the next iteration of the search. The value will be
// set with the first update of next_index.
double next_quantile = -1.0;
std::unordered_map<int, ConfidenceInterval> child_confidence_intervals;
for (int i = leftmost_child_index; i <= rightmost_child_index; i++) {
ConfidenceInterval ci;
ci.set_lower_bound(noised_tree_[i] +
zero_confidence_interval.lower_bound());
ci.set_upper_bound(noised_tree_[i] +
zero_confidence_interval.upper_bound());
child_confidence_intervals[i] = ci;
}
// Let [l_i, u_i] denote the confidence interval of child node i. To find
// a lower bound b for the quantiles that can be reached via a particular
// configuration of counts c_i such that l_i ≤ c_i ≤ u_i, the counts to
// left of b should be as large as possible while the counts to the right
// of b should be as small as possible. Thus, we set
// c_i = u_i if i <= j and c_i = l_i if i > j
// for some index j. Similarly, an upper bound can be obtained by setting
// c_i = l_i if i <= j and c_i = u_i if i > j.
//
// Because we don't know the index j in advance, we go through all
// possible indices j and pick whichever yields the smallest lower bound
// or largest upper bound.
for (int j = leftmost_child_index - 1; j <= rightmost_child_index; j++) {
std::unordered_map<int, double> count_bounds;
for (int i = leftmost_child_index; i <= j; i++) {
count_bounds[i] = bound_type == ConfidenceIntervalBoundType::LOWER
? child_confidence_intervals[i].upper_bound()
: child_confidence_intervals[i].lower_bound();
}
for (int i = j + 1; i <= rightmost_child_index; i++) {
count_bounds[i] = bound_type == ConfidenceIntervalBoundType::LOWER
? child_confidence_intervals[i].lower_bound()
: child_confidence_intervals[i].upper_bound();
}
IndexAndQuantile next_index_and_quantile =
GetNextIndexAndQuantile(quantile, leftmost_child_index,
rightmost_child_index, count_bounds);
if (next_index_and_quantile.index == -1) {
// All child nodes are considred empty. Update the bound with a linear
// interpolation of the smallest and largest value associated with the
// current node if the result yields a looser bound.
if (bound_type == ConfidenceIntervalBoundType::LOWER) {
bound =
std::min(bound, (1 - quantile) * GetSubtreeLowerBound(index) +
quantile * GetSubtreeUpperBound(index));
} else {
bound =
std::max(bound, (1 - quantile) * GetSubtreeLowerBound(index) +
quantile * GetSubtreeUpperBound(index));
}
} else {
// Update nextIndex and nextRank if this results in a looser bound.
if ((bound_type == ConfidenceIntervalBoundType::LOWER &&
next_index_and_quantile.index <= next_index) ||
(bound_type == ConfidenceIntervalBoundType::UPPER &&
next_index_and_quantile.index >= next_index)) {
if (next_index_and_quantile.index != next_index) {
next_index = next_index_and_quantile.index;
next_quantile = next_index_and_quantile.quantile;
} else if ((bound_type == ConfidenceIntervalBoundType::LOWER &&
next_index_and_quantile.quantile < next_quantile) ||
(bound_type == ConfidenceIntervalBoundType::UPPER &&
next_index_and_quantile.quantile > next_quantile)) {
next_quantile = next_index_and_quantile.quantile;
}
}
}
}
// Check if the current node was considered empty for all values of j
// (this is the case when nextRank is still its initial invalid value of
// -1). If so, the search can be stopped and the bound returned.
// Otherwise, continue the search in the next node with the respective
// new rank.
if (next_quantile == -1) {
return bound;
}
index = next_index;
quantile = next_quantile;
}
// The search has reached a leaf node. In this case, we either return a
// linear interpolation between the smallest and the largest value
// associated with the leaf node, or the bound computed so far should it be
// looser.
double linear_interpolation = (1 - quantile) * GetSubtreeLowerBound(index) +
quantile * GetSubtreeUpperBound(index);
return bound_type == ConfidenceIntervalBoundType::LOWER
? std::min(bound, linear_interpolation)
: std::max(bound, linear_interpolation);
}
double GetNoisedCount(int index) {
if (noised_tree_.find(index) == noised_tree_.end()) {
noised_tree_[index] = mechanism_->AddNoise(raw_tree_.GetNodeCount(index));
}
return noised_tree_[index];
}
double GetSubtreeLowerBound(int index) {
int leaf_index =
raw_tree_.LeftMostInSubtree(index) - raw_tree_.GetLeftMostLeaf();
double quantile =
static_cast<double>(leaf_index) / raw_tree_.GetNumberOfLeaves();
return quantile * upper_ + (1 - quantile) * lower_;
}
double GetSubtreeUpperBound(int index) {
int leaf_index =
raw_tree_.RightMostInSubtree(index) - raw_tree_.GetLeftMostLeaf() + 1;
double quantile =
static_cast<double>(leaf_index) / raw_tree_.GetNumberOfLeaves();
return quantile * upper_ + (1 - quantile) * lower_;
}
// Clamps a quantile to a value between 0.005 and 0.995. This mitigates the
// inaccuracy of the quantile tree mechanism when finding a quantile close to
// 0 or 1.
static double ClampQuantile(double quantile) {
return std::min(std::max(0.005, quantile), 0.995);
}
const T upper_;
const T lower_;
std::unique_ptr<NumericalMechanism> mechanism_;
const internal::CountTree raw_tree_;
std::unordered_map<int64_t, int64_t> noised_tree_;
};
template <typename T>
class QuantileTree<T>::Builder {
public:
Builder() = default;
Builder& SetTreeHeight(int tree_height) {
tree_height_ = tree_height;
return *static_cast<Builder*>(this);
}
Builder& SetBranchingFactor(int branching_factor) {
branching_factor_ = branching_factor;
return *static_cast<Builder*>(this);
}
Builder& SetLower(T lower) {
lower_ = lower;
return *static_cast<Builder*>(this);
}
Builder& SetUpper(T upper) {
upper_ = upper;
return *static_cast<Builder*>(this);
}
absl::StatusOr<std::unique_ptr<QuantileTree<T>>> Build() {
if (!tree_height_.has_value()) {
tree_height_ = kDefaultTreeHeight;
}
if (!branching_factor_.has_value()) {
branching_factor_ = kDefaultBranchingFactor;
}
if (!lower_.has_value() || !upper_.has_value()) {
return absl::InvalidArgumentError(
"Lower and upper bounds must both be set.");
}
if (tree_height_.value() < 1) {
return absl::InvalidArgumentError(absl::StrCat(
"Tree height must be at least 1, but was ", tree_height_.value()));
}
if (branching_factor_.value() < 2) {
return absl::InvalidArgumentError(
absl::StrCat("Branching factor must be at least 2, but was ",
branching_factor_.value()));
}
if (lower_.value() >= upper_.value()) {
return absl::InvalidArgumentError(
absl::StrCat("Lower bound must be less than upper bound, but lower: ",
lower_.value(), " >= upper: ", upper_.value()));
}
return std::unique_ptr<QuantileTree>(
new QuantileTree(lower_.value(), upper_.value(), tree_height_.value(),
branching_factor_.value()));
}
private:
absl::optional<int> tree_height_;
absl::optional<int> branching_factor_;
absl::optional<T> lower_;
absl::optional<T> upper_;
};
} // namespace differential_privacy
#endif // DIFFERENTIAL_PRIVACY_CPP_ALGORITHMS_QUANTILE_TREE_H_