blob: b2019a9b253a12558937b2f81dc963913783c1ec [file] [log] [blame]
# Copyright 2020 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementing Privacy Loss Distribution.
This file implements the privacy loss distribution (PLD) and its basic
functionalities. The main feature of PLD is that it allows for accurate
computation of privacy parameters under composition. Please refer to the
supplementary material below for more details:
../../common_docs/Privacy_Loss_Distributions.pdf
"""
import collections
import logging
import math
from typing import Mapping, Optional, Callable, Any
import numpy as np
from dp_accounting import common
from dp_accounting import privacy_loss_mechanism
class BasicPrivacyLossDistribution(object):
"""Class for single privacy loss distributions and computation involving them.
The privacy loss distribution (PLD) of two discrete distributions, the upper
distribution mu_upper and the lower distribution mu_lower, is defined as a
distribution on real numbers generated by first sampling an outcome o
according to mu_upper and then outputting the privacy loss
ln(mu_upper(o) / mu_lower(o)) where mu_lower(o) and mu_upper(o) are the
probability masses of o in mu_lower and mu_upper respectively. This class
allows one to create and manipulate privacy loss distributions.
PLD allows one to (approximately) compute the epsilon-hockey stick divergence
between mu_upper and mu_lower, which is defined as
sum_{o} [mu_upper(o) - e^{epsilon} * mu_lower(o)]_+. This quantity in turn
governs the parameter delta of (eps, delta)-differential privacy of the
corresponding protocol. (See Observation 1 in the supplementary material.)
The above definitions extend to continuous distributions. The PLD of two
continuous distributions mu_upper and mu_lower is defined as a distribution on
real numbers generated by first sampling an outcome o according to mu_upper
and then outputting the privacy loss ln(f_{mu_upper}(o) / f_{mu_lower}(o))
where f_{mu_lower}(o) and f_{mu_upper}(o) are the probability density
functions at o in mu_lower and mu_upper respectively. Moreover, for continuous
distributions the epsilon-hockey stick divergence is defined as
integral [f_{mu_upper}(o) - e^{epsilon} * f_{mu_lower}(o)]_+ do.
Attributes:
rounded_probability_mass_function: the probability mass function for the
privacy loss distribution where each value is rounded to be an integer
multiple of value_discretization_interval. To avoid floating point errors
in the values, the keys here are the integer multipliers. For example,
suppose that the probability mass function assigns mass of 0.1 to the
value 2 * value_discretization_interval, then the dictionary will have
(key: value) pair (2: 0.1).
value_discretization_interval: the interval length for which the values of
the privacy loss distribution are discretized. In particular, the values
are always integer multiples of value_discretization_interval. Smaller
value results in more accurate estimates of the privacy loss, at the cost
of increased run-time / memory usage.
infinity_mass: The probability mass of mu_upper over all the outcomes that
can occur only in mu_upper but not in mu_lower.(These outcomes result in
privacy loss ln(mu_upper(o) / mu_lower(o)) of infinity.)
pessimistic_estimate: whether the rounding is done in such a way that the
resulting epsilon-hockey stick divergence computation gives an upper
estimate to the real value.
"""
def __init__(self,
rounded_probability_mass_function: Mapping[int, float],
value_discretization_interval: float,
infinity_mass: float,
pessimistic_estimate: bool = True):
self.rounded_probability_mass_function = rounded_probability_mass_function
self.value_discretization_interval = value_discretization_interval
self.infinity_mass = infinity_mass
self.pessimistic_estimate = pessimistic_estimate
def get_delta_for_epsilon(self, epsilon: float) -> float:
"""Computes the epsilon-hockey stick divergence between mu_upper, mu_lower.
When this privacy loss distribution corresponds to a mechanism, the
epsilon-hockey stick divergence gives the value of delta for which the
mechanism is (epsilon, delta)-differentially private. (See Observation 1 in
the supplementary material.)
Args:
epsilon: the epsilon in epsilon-hockey stick divergence.
Returns:
A non-negative real number which is the epsilon-hockey stick divergence
between the upper (mu_upper) and the lower (mu_lower) distributions
corresponding to this privacy loss distribution.
"""
# The epsilon-hockey stick divergence of mu_upper with respect to mu_lower
# is equal to (the sum over all the values in the privacy loss distribution
# of the probability mass at value times max(0, 1 - e^{epsilon - value}) )
# plus the infinity_mass.
shifted_privacy_losses = []
probability_masses = []
for i in self.rounded_probability_mass_function:
val = i * self.value_discretization_interval
if val > epsilon and self.rounded_probability_mass_function[i] > 0:
shifted_privacy_losses.append(epsilon - val)
probability_masses.append(self.rounded_probability_mass_function[i])
return self.infinity_mass + np.dot(
(1 - np.exp(shifted_privacy_losses)), probability_masses)
def get_epsilon_for_delta(self, delta: float) -> float:
"""Computes epsilon for which hockey stick divergence is at most delta.
This function computes the smallest non-negative epsilon for which the
epsilon-hockey stick divergence between mu_upper, mu_lower is at most delta.
When this privacy loss distribution corresponds to a mechanism and the
rounding is pessimistic, the returned value corresponds to an epsilon for
which the mechanism is (epsilon, delta)-differentially private. (See
Observation 1 in the supplementary material.)
Args:
delta: the target epsilon-hockey stick divergence.
Returns:
A non-negative real number which is the smallest epsilon such that the
epsilon-hockey stick divergence between the upper (mu_upper) and the
lower (mu_lower) distributions is at most delta. When no such finite
epsilon exists, return math.inf.
"""
if self.infinity_mass > delta:
return math.inf
mass_upper = self.infinity_mass
mass_lower = 0
for i in sorted(
self.rounded_probability_mass_function.keys(), reverse=True):
val = i * self.value_discretization_interval
if (mass_upper > delta and mass_lower > 0 and math.log(
(mass_upper - delta) / mass_lower) >= val):
# Epsilon is greater than or equal to val.
break
mass_upper += self.rounded_probability_mass_function[i]
mass_lower += (math.exp(-val) * self.rounded_probability_mass_function[i])
if mass_upper >= delta and mass_lower == 0:
# This only occurs when val is very large, which results in exp(-val)
# being treated as zero.
return max(0, val)
if mass_upper <= mass_lower + delta:
return 0
else:
return math.log((mass_upper - delta) / mass_lower)
def validate_composable(
self, basic_pld: 'BasicPrivacyLossDistribution'):
"""Verifies that a given Basic PLD can be composed with this Basic PLD.
The two privacy loss distributions must have the same discretization
interval and estimate type for the composition to be allowed.
Args:
basic_pld: the privacy loss distribution to be composed
with the current privacy loss distribution.
Raises:
ValueError if the value_discretization_interval or estimate_type of the
two PLDs are different.
"""
if (self.value_discretization_interval !=
basic_pld.value_discretization_interval):
raise ValueError(
f'Discretization intervals are different: '
f'{self.value_discretization_interval}'
f'{basic_pld.value_discretization_interval}')
if (self.pessimistic_estimate !=
basic_pld.pessimistic_estimate):
raise ValueError(f'Estimation types are different: '
f'{self.pessimistic_estimate}'
f'{basic_pld.pessimistic_estimate}')
def compose(
self,
basic_pld: 'BasicPrivacyLossDistribution',
tail_mass_truncation: float = 1e-15) -> 'BasicPrivacyLossDistribution':
"""Computes a privacy loss distribution resulting from composing two PLDs.
Args:
basic_pld: the privacy loss distribution to be composed
with the current privacy loss distribution. The two must have the same
value_discretization_interval.
tail_mass_truncation: an upper bound on the tails of the probability mass
of the PLD that might be truncated.
Returns:
A privacy loss distribution which is the result of composing the two.
"""
self.validate_composable(basic_pld)
# The probability mass function of the resulting distribution is simply the
# convolutaion of the two input probability mass functions.
new_rounded_probability_mass_function = common.convolve_dictionary(
self.rounded_probability_mass_function,
basic_pld.rounded_probability_mass_function,
tail_mass_truncation=tail_mass_truncation)
new_infinity_mass = (
self.infinity_mass + basic_pld.infinity_mass -
(self.infinity_mass * basic_pld.infinity_mass))
if self.pessimistic_estimate:
# In the pessimistic case, the truncated probability mass needs to be
# treated as if it were infinity.
new_infinity_mass += tail_mass_truncation
return BasicPrivacyLossDistribution(
new_rounded_probability_mass_function,
self.value_discretization_interval,
new_infinity_mass,
pessimistic_estimate=self.pessimistic_estimate)
def get_delta_for_epsilon_for_composed_pld(
self, basic_pld: 'BasicPrivacyLossDistribution',
epsilon: float) -> float:
"""Computes delta for given epsilon for the result of composing this PLD and a given PLD.
The output of this function should be the same as first composing this PLD
and basic_pld, and then call get_delta_for_epsilon on the
resulting PLD. The main advantage is that this function is faster.
Args:
basic_pld: the privacy loss distribution to be composed
with the current privacy loss distribution. The two must have the same
value_discretization_interval.
epsilon: the epsilon in epsilon-hockey stick divergence.
Returns:
A non-negative real number which is the epsilon-hockey stick divergence
of the privacy loss distribution which is the result of composing this PLD
with basic_pld.
"""
self.validate_composable(basic_pld)
this_offset, this_probability_mass_function = common.dictionary_to_list(
self.rounded_probability_mass_function)
this_exponentiated_privacy_loss_values = np.exp(
self.value_discretization_interval * np.arange(
this_offset, this_offset + len(this_probability_mass_function)))
other_offset, other_probability_mass_function = common.dictionary_to_list(
basic_pld.rounded_probability_mass_function)
other_exponentiated_privacy_loss_values = np.exp(
basic_pld.value_discretization_interval * np.arange(
other_offset, other_offset + len(other_probability_mass_function)))
exp_epsilon = math.exp(epsilon)
# Compute the hockey stick divergence using equation (2) in the
# supplementary material. other_cumulative_upper_mass below represents the
# summation in equation (3) and other_cumulative_lower_mass represents the
# summation in equation (4).
other_cumulative_upper_mass = 0
other_cumulative_lower_mass = 0
current_index = len(other_probability_mass_function) - 1
divergence = 0
for this_exponentiated_privacy_loss, this_probability_mass in zip(
this_exponentiated_privacy_loss_values, this_probability_mass_function):
cutoff = exp_epsilon / this_exponentiated_privacy_loss
while current_index >= 0 and other_exponentiated_privacy_loss_values[
current_index] > cutoff:
other_cumulative_upper_mass += other_probability_mass_function[
current_index]
other_cumulative_lower_mass += (
other_probability_mass_function[current_index] /
other_exponentiated_privacy_loss_values[current_index])
current_index -= 1
divergence += this_probability_mass * (
other_cumulative_upper_mass - cutoff * other_cumulative_lower_mass)
# The probability that the composed privacy loss is infinite
composed_infinity_mass = 1 - (1 - self.infinity_mass) * (
1 - basic_pld.infinity_mass)
return divergence + composed_infinity_mass
def self_compose(
self,
num_times: int,
tail_mass_truncation: float = 1e-15) -> 'BasicPrivacyLossDistribution':
"""Computes PLD resulting from repeated composing the PLD with itself.
Args:
num_times: the number of times to compose this PLD with itself.
tail_mass_truncation: an upper bound on the tails of the probability mass
of the PLD that might be truncated. Currently only supports for
pessimistic estimates.
Returns:
A privacy loss distribution which is the result of the composition.
"""
if not self.pessimistic_estimate:
# Currently support truncation only for pessimistic estimates.
tail_mass_truncation = 0
new_rounded_probability_mass_function = common.self_convolve_dictionary(
self.rounded_probability_mass_function,
num_times,
tail_mass_truncation=tail_mass_truncation)
new_infinity_mass = (1 - ((1 - self.infinity_mass)**num_times))
new_infinity_mass += tail_mass_truncation
return BasicPrivacyLossDistribution(
new_rounded_probability_mass_function,
self.value_discretization_interval,
new_infinity_mass,
pessimistic_estimate=self.pessimistic_estimate)
def _deprecation_warning(method_name: str):
logging.warning('PrivacyLossDistribution.%s() will be deprecated shortly. '
'Use factory method %s() instead.', method_name, method_name)
class PrivacyLossDistribution(object):
"""Class for privacy loss distributions and computation involving them.
The privacy loss distribution (PLD) of two discrete distributions, the upper
distribution mu_upper and the lower distribution mu_lower, is defined as a
distribution on real numbers generated by first sampling an outcome o
according to mu_upper and then outputting the privacy loss
ln(mu_upper(o) / mu_lower(o)) where mu_lower(o) and mu_upper(o) are the
probability masses of o in mu_lower and mu_upper respectively. This class
allows one to create and manipulate privacy loss distributions.
PLD allows one to (approximately) compute the epsilon-hockey stick divergence
between mu_upper and mu_lower, which is defined as
sum_{o} [mu_upper(o) - e^{epsilon} * mu_lower(o)]_+. This quantity in turn
governs the parameter delta of (eps, delta)-differential privacy of the
corresponding protocol. (See Observation 1 in the supplementary material.)
The above definitions extend to continuous distributions. The PLD of two
continuous distributions mu_upper and mu_lower is defined as a distribution on
real numbers generated by first sampling an outcome o according to mu_upper
and then outputting the privacy loss ln(f_{mu_upper}(o) / f_{mu_lower}(o))
where f_{mu_lower}(o) and f_{mu_upper}(o) are the probability density
functions at o in mu_lower and mu_upper respectively. Moreover, for continuous
distributions the epsilon-hockey stick divergence is defined as
integral [f_{mu_upper}(o) - e^{epsilon} * f_{mu_lower}(o)]_+ do.
A single privacy loss distribution is represented as an object of the class
BasicPrivacyLossDistribution. This class, on the other hand, holds the higher
level logic.
Namely, this class maintains up to two BasicPrivacyLossDistribution objects.
One for the 'add' adjacency type, which specifies the privacy loss
distribution for a mechanism M with mu_upper = M(D) and mu_lower = M(D'),
where D' contains one more datapoint than D.
And one for the 'remove' adjacency type, which specifies the privacy loss
distribution for a mechanism M, with mu_upper = M(D) and mu_lower = M(D'),
where D' contains one less datapoint than D.
In the case where both privacy loss distributions are the same, only one copy
is maintained.
While this class offers additional support with respect to the ADD/REMOVE
adjacency, this is not an inherent limitation; this class can also be used
in the case of other adjacencies such as Substitution.
Factory methods in this module provide a convenient way to generate objects of
this class associated to various mechanisms.
Attributes:
_basic_pld_remove: basic privacy loss distribution with respect to REMOVE
adjacency.
_basic_pld_add: basic privacy loss distribution with respect to ADD
adjacency.
_symmetric: When True, basic_pld_add is assumed to be the same as
basic_pld_remove.
_basic_pld: An alias for basic_pld_remove. Useful when symmetric is True.
"""
def __init__(
self,
rounded_probability_mass_function: Mapping[int, float],
value_discretization_interval: float,
infinity_mass: float,
pessimistic_estimate: bool = True,
rounded_probability_mass_function_add: Optional[Mapping[int,
float]] = None,
infinity_mass_add: Optional[float] = None,
symmetric: bool = True):
"""Initialization method for PrivacyLossDistribution.
Args:
rounded_probability_mass_function: rounded probability mass function of
the basic privacy loss distribution, with respect to REMOVE adjacency.
value_discretization_interval: the interval length for which the values of
the privacy loss distribution are discretized. In particular, the values
are always integer multiples of value_discretization_interval. Smaller
value results in more accurate estimates of the privacy loss, at the
cost of increased run-time / memory usage. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
infinity_mass: infinity_mass for basic privacy loss distribution with
respect to the REMOVE adjacency.
pessimistic_estimate: whether the rounding is done in such a way that the
resulting epsilon-hockey stick divergence computation gives an upper
estimate to the real value.
rounded_probability_mass_function_add: rounded probability mass function
of the basic privacy loss distribution,with respect to ADD adjacency.
infinity_mass_add: infinity_mass for basic privacy loss distribution with
respect to the ADD adjacency.
symmetric: When True, the basic privacy loss distribution with respect to
ADD adjacency is assumed to be the same as that for REMOVE adjacency.
Arguments rounded_probability_mass_function_add, infinity_mass_add are
ignored in this case.
"""
self._basic_pld = BasicPrivacyLossDistribution(
rounded_probability_mass_function, value_discretization_interval,
infinity_mass, pessimistic_estimate)
self._basic_pld_remove = self._basic_pld
self._symmetric = symmetric
if symmetric:
self._basic_pld_add = self._basic_pld_remove
if (rounded_probability_mass_function_add is not None or
infinity_mass_add is not None):
raise ValueError('Details about privacy loss distribution with respect'
'to ADD adjacency cannot be specified when symmetric')
else:
if (rounded_probability_mass_function_add is None or
infinity_mass_add is None):
raise ValueError('Details about privacy loss distribution with respect'
'to ADD adjacency should be specified when not '
'symmetric')
self._basic_pld_add = BasicPrivacyLossDistribution(
rounded_probability_mass_function_add, value_discretization_interval,
infinity_mass_add, pessimistic_estimate)
@classmethod
def from_basic_privacy_loss_distribution(
cls,
basic_pld: 'BasicPrivacyLossDistribution') -> 'PrivacyLossDistribution':
"""Constructs from a single BasicPrivacyLossDistribution object."""
return cls(basic_pld.rounded_probability_mass_function,
basic_pld.value_discretization_interval,
basic_pld.infinity_mass,
basic_pld.pessimistic_estimate)
@classmethod
def from_add_remove_basic_plds(
cls, basic_pld_remove: BasicPrivacyLossDistribution,
basic_pld_add: BasicPrivacyLossDistribution
) -> 'PrivacyLossDistribution':
"""Constructs from Basic PLDs for REMOVE and ADD adjacencies.
It is assumed that both basic PLDs have the same pessimistic estimate and
value_discretization_interval.
Args:
basic_pld_remove: basic privacy loss distribution with respect to REMOVE
adjacency.
basic_pld_add: basic privacy loss distribution with respect to ADD
adjacency.
Returns:
Asymmetric PrivacyLossDistribution object with respect to
ADD/REMOVE adjacency.
"""
basic_pld_remove.validate_composable(basic_pld_add)
return cls(basic_pld_remove.rounded_probability_mass_function,
basic_pld_remove.value_discretization_interval,
basic_pld_remove.infinity_mass,
basic_pld_remove.pessimistic_estimate,
basic_pld_add.rounded_probability_mass_function,
basic_pld_add.infinity_mass,
symmetric=False)
@classmethod
def identity(
cls,
value_discretization_interval: float = 1e-4) -> 'PrivacyLossDistribution':
"""Constructs an identity privacy loss distribution.
This class method will be deprecated shortly. Use factory method identity()
instead.
Args:
value_discretization_interval: the dicretization interval for the privacy
loss distribution. The values will be rounded up/down to be integer
multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time /
memory usage.
Returns:
The privacy loss distribution corresponding to an algorithm with no
privacy leak (i.e. output is independent of input).
"""
_deprecation_warning('identity')
return identity(value_discretization_interval)
@classmethod
def from_two_probability_mass_functions(
cls,
log_probability_mass_function_lower: Mapping[Any, float],
log_probability_mass_function_upper: Mapping[Any, float],
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
log_mass_truncation_bound: float = -math.inf
) -> 'PrivacyLossDistribution':
"""Constructs a privacy loss distribution from mu_lower and mu_upper.
This class method will be deprecated shortly. Use factory method
from_two_probability_mass_functions() instead.
Args:
log_probability_mass_function_lower: the probability mass function of
mu_lower represented as a dictionary where each key is an outcome o of
mu_lower and the corresponding value is the natural log of the
probability mass of mu_lower at o.
log_probability_mass_function_upper: the probability mass function of
mu_upper represented as a dictionary where each key is an outcome o of
mu_upper and the corresponding value is the natural log of the
probability mass of mu_upper at o.
pessimistic_estimate: whether the rounding is done in such a way that the
resulting epsilon-hockey stick divergence computation gives an upper
estimate to the real value.
value_discretization_interval: the dicretization interval for the privacy
loss distribution. The values will be rounded up/down to be integer
multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time /
memory usage.
log_mass_truncation_bound: when the log of the probability mass of the
upper distribution is below this bound, it is either (i) included in
infinity_mass in the case of pessimistic estimate or (ii) discarded
completely in the case of optimistic estimate. The larger
log_mass_truncation_bound is, the more error it may introduce in
divergence calculations.
Returns:
The privacy loss distribution constructed as specified.
"""
_deprecation_warning('from_two_probability_mass_functions')
return from_two_probability_mass_functions(
log_probability_mass_function_lower,
log_probability_mass_function_upper,
pessimistic_estimate,
value_discretization_interval,
log_mass_truncation_bound)
@classmethod
def create_from_additive_noise(
cls,
additive_noise_privacy_loss:
'privacy_loss_mechanism.AdditiveNoisePrivacyLoss',
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4) -> 'PrivacyLossDistribution':
"""Constructs the privacy loss distribution of an additive noise mechanism.
An additive noise mechanism for computing a scalar-valued function f is a
mechanism that outputs the sum of the true value of the function and a noise
drawn from a certain distribution mu. This function calculates the privacy
loss distribution for such an additive noise mechanism.
This class method will be deprecated shortly. Use factory method
create_from_additive_noise() instead.
Args:
additive_noise_privacy_loss: the privacy loss representation of the
mechanism.
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
Returns:
The privacy loss distribution constructed as specified.
"""
_deprecation_warning('create_from_additive_noise')
return create_from_additive_noise(additive_noise_privacy_loss,
pessimistic_estimate,
value_discretization_interval)
@classmethod
def create_from_cdf(
cls,
cdf: Callable[[float], float],
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
tail_mass_truncation: float = 1e-15) -> 'PrivacyLossDistribution':
"""Constructs the privacy loss distribution from its cumulative density function.
This class method will be deprecated shortly. Use factory method
create_from_cdf() instead.
Args:
cdf: the cumulative density function of the privacy loss distribution.
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
tail_mass_truncation: an upper bound on the tails of the probability mass
of the PLD that might be truncated.
Returns:
The privacy loss distribution constructed as specified.
"""
_deprecation_warning('create_from_cdf')
return create_from_cdf(cdf, pessimistic_estimate,
value_discretization_interval,
tail_mass_truncation)
@classmethod
def from_randomized_response(
cls,
noise_parameter: float,
num_buckets: int,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4) -> 'PrivacyLossDistribution':
"""Constructs the privacy loss distribution of Randomized Response.
The Randomized Response over k buckets with noise parameter p takes in an
input which is one of the k buckets. With probability 1 - p, it simply
outputs the input bucket. Otherwise, with probability p, it outputs a bucket
drawn uniformly at random from the k buckets.
This function calculates the privacy loss distribution for the
aforementioned Randomized Response with a given number of buckets, and a
given noise parameter.
Specifically, suppose that the original input is x and it is changed to x'.
Recall that the privacy loss distribution of the Randomized Response
mechanism is generated as follows: first pick o according to R(x), where
R(x) denote the output distribution of the Randomized Response mechanism
on input x. Then, the privacy loss is ln(Pr[R(x) = o] / Pr[R(x') = o]).
There are three cases here:
- When o = x, ln(Pr[R(x) = o] / Pr[R(x') = o]) =
ln(Pr[R(x) = x] / Pr[R(x') = x]). Here Pr[R(x) = x] = 1 - p + p / k
and Pr[R(x') = x] = p / k.
- When o = x', ln(Pr[R(x) = o] / Pr[R(x') = o]) =
ln(Pr[R(x') = x'] / Pr[R(x) = x']), which is just the negation of the
previous privacy loss.
- When o != x, x', the privacy loss is zero.
This class method will be deprecated shortly. Use factory method
from_randomized_response() instead.
Args:
noise_parameter: the probability that the Randomized Response outputs a
completely random bucket.
num_buckets: the total number of possible input values (which is equal to
the total number of possible output values).
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
Returns:
The privacy loss distribution constructed as specified.
"""
_deprecation_warning('from_randomized_response')
return from_randomized_response(noise_parameter, num_buckets,
pessimistic_estimate,
value_discretization_interval)
@classmethod
def from_laplace_mechanism(
cls,
parameter: float,
sensitivity: float = 1,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
sampling_prob: float = 1.0) -> 'PrivacyLossDistribution':
"""Computes the privacy loss distribution of the Laplace mechanism.
This class method will be deprecated shortly. Use factory method
from_laplace_mechanism() instead.
Args:
parameter: the parameter of the Laplace distribution.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the Laplace mechanism with
given parameters.
"""
_deprecation_warning('from_laplace_mechanism')
return from_laplace_mechanism(parameter, sensitivity, pessimistic_estimate,
value_discretization_interval, sampling_prob)
@classmethod
def from_gaussian_mechanism(
cls,
standard_deviation: float,
sensitivity: float = 1,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
log_mass_truncation_bound: float = -50,
sampling_prob: float = 1.0) -> 'PrivacyLossDistribution':
"""Creates the privacy loss distribution of the Gaussian mechanism.
This class method will be deprecated shortly. Use factory method
from_gaussian_mechanism() instead.
Args:
standard_deviation: the standard_deviation of the Gaussian distribution.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
log_mass_truncation_bound: the ln of the probability mass that might be
discarded from the noise distribution. The larger this number, the more
error it may introduce in divergence calculations.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the Gaussian mechanism with
given parameters.
"""
_deprecation_warning('from_gaussian_mechanism')
return from_gaussian_mechanism(standard_deviation, sensitivity,
pessimistic_estimate,
value_discretization_interval,
log_mass_truncation_bound,
sampling_prob)
@classmethod
def from_discrete_laplace_mechanism(
cls,
parameter: float,
sensitivity: int = 1,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
sampling_prob: float = 1.0) -> 'PrivacyLossDistribution':
"""Computes the privacy loss distribution of the Discrete Laplace mechanism.
This class method will be deprecated shortly. Use factory method
from_discrete_laplace_mechanism() instead.
Args:
parameter: the parameter of the discrete Laplace distribution.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the Discrete Laplace
mechanism with given parameters.
"""
_deprecation_warning('from_discrete_laplace_mechanism')
return from_discrete_laplace_mechanism(parameter, sensitivity,
pessimistic_estimate,
value_discretization_interval,
sampling_prob)
@classmethod
def from_discrete_gaussian_mechanism(
cls,
sigma: float,
sensitivity: int = 1,
truncation_bound: Optional[int] = None,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
sampling_prob: float = 1.0) -> 'PrivacyLossDistribution':
"""Creates the privacy loss distribution of the discrete Gaussian mechanism.
This class method will be deprecated shortly. Use factory method
from_discrete_gaussian_mechanism() instead.
Args:
sigma: the parameter of the discrete Gaussian distribution. Note that
unlike the (continuous) Gaussian distribution this is not equal to the
standard deviation of the noise.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
truncation_bound: bound for truncating the noise, i.e. the noise will only
have a support in [-truncation_bound, truncation_bound]. When not
specified, truncation_bound will be chosen in such a way that the mass
of the noise outside of this range is at most 1e-30.
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence
computation gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the discrete Gaussian
mechanism with given parameters.
"""
_deprecation_warning('from_discrete_gaussian_mechanism')
return from_discrete_gaussian_mechanism(sigma, sensitivity,
truncation_bound,
pessimistic_estimate,
value_discretization_interval,
sampling_prob)
@classmethod
def from_privacy_parameters(
cls,
privacy_parameters: common.DifferentialPrivacyParameters,
value_discretization_interval: float = 1e-4) -> 'PrivacyLossDistribution':
"""Constructs pessimistic PLD from epsilon and delta parameters.
When the mechanism is (epsilon, delta)-differentially private, the following
is a pessimistic estimate of its privacy loss distribution (see Section 3.5
of the supplementary material for more explanation):
- infinity with probability delta.
- epsilon with probability (1 - delta) / (1 + exp(-eps))
- -epsilon with probability (1 - delta) / (1 + exp(eps))
This class method will be deprecated shortly. Use factory method
from_privacy_parameters() instead.
Args:
privacy_parameters: the privacy guarantee of the mechanism.
value_discretization_interval: the length of the dicretization interval
for the privacy loss distribution. The values will be rounded up/down to
be integer multiples of this number. Smaller value results in more
accurate estimates of the privacy loss, at the cost of increased
run-time / memory usage.
Returns:
The privacy loss distribution constructed as specified.
"""
_deprecation_warning('from_privacy_parameters')
return from_privacy_parameters(privacy_parameters,
value_discretization_interval)
def get_delta_for_epsilon(self, epsilon: float) -> float:
"""Computes the epsilon-hockey stick divergence between mu_upper, mu_lower.
When this privacy loss distribution corresponds to a mechanism, the
epsilon-hockey stick divergence gives the value of delta for which the
mechanism is (epsilon, delta)-differentially private. (See Observation 1 in
the supplementary material.)
Args:
epsilon: the epsilon in epsilon-hockey stick divergence.
Returns:
A non-negative real number which is the epsilon-hockey stick divergence
between the upper (mu_upper) and the lower (mu_lower) distributions
corresponding to this privacy loss distribution.
"""
delta_remove = self._basic_pld_remove.get_delta_for_epsilon(epsilon)
if self._symmetric:
return delta_remove
delta_add = self._basic_pld_add.get_delta_for_epsilon(epsilon)
return max(delta_remove, delta_add)
def get_epsilon_for_delta(self, delta: float) -> float:
"""Computes epsilon for which hockey stick divergence is at most delta.
This function computes the smallest non-negative epsilon for which the
epsilon-hockey stick divergence between mu_upper, mu_lower is at most delta.
When this privacy loss distribution corresponds to a mechanism and the
rounding is pessimistic, the returned value corresponds to an epsilon for
which the mechanism is (epsilon, delta)-differentially private. (See
Observation 1 in the supplementary material.)
Args:
delta: the target epsilon-hockey stick divergence.
Returns:
A non-negative real number which is the smallest epsilon such that the
epsilon-hockey stick divergence between the upper (mu_upper) and the
lower (mu_lower) distributions is at most delta. When no such finite
epsilon exists, return math.inf.
"""
epsilon_remove = self._basic_pld_remove.get_epsilon_for_delta(delta)
if self._symmetric:
return epsilon_remove
epsilon_add = self._basic_pld_add.get_epsilon_for_delta(delta)
return max(epsilon_remove, epsilon_add)
def validate_composable(self,
privacy_loss_distribution: 'PrivacyLossDistribution'):
"""Verifies that a given PLD can be composed with this PLD.
The two privacy loss distributions must have the same discretization
interval and estimate type for the composition to be allowed.
Args:
privacy_loss_distribution: the privacy loss distribution to be composed
with the current privacy loss distribution.
Raises:
ValueError if the value_discretization_interval or estimate_type of the
two PLDs are different.
"""
self._basic_pld_remove.validate_composable(
privacy_loss_distribution._basic_pld_remove) # pylint:disable=protected-access
def compose(
self,
privacy_loss_distribution: 'PrivacyLossDistribution',
tail_mass_truncation: float = 1e-15,
) -> 'PrivacyLossDistribution':
"""Computes a privacy loss distribution resulting from composing two PLDs.
Args:
privacy_loss_distribution: the privacy loss distribution to be composed
with the current privacy loss distribution. The two must have the same
value_discretization_interval.
tail_mass_truncation: an upper bound on the tails of the probability mass
of the PLD that might be truncated.
Returns:
A privacy loss distribution which is the result of composing the two.
"""
basic_pld_remove = self._basic_pld_remove.compose(
privacy_loss_distribution._basic_pld_remove, tail_mass_truncation) # pylint:disable=protected-access
if self._symmetric and privacy_loss_distribution._symmetric: # pylint:disable=protected-access
return PrivacyLossDistribution.from_basic_privacy_loss_distribution(
basic_pld_remove)
basic_pld_add = self._basic_pld_add.compose(
privacy_loss_distribution._basic_pld_add, tail_mass_truncation) # pylint:disable=protected-access
return PrivacyLossDistribution.from_add_remove_basic_plds(
basic_pld_remove, basic_pld_add)
def get_delta_for_epsilon_for_composed_pld(
self, privacy_loss_distribution: 'PrivacyLossDistribution',
epsilon: float) -> float:
"""Computes delta for given epsilon for the result of composing this PLD and a given PLD.
The output of this function should be the same as first composing this PLD
and privacy_loss_distribution, and then call get_delta_for_epsilon on the
resulting PLD. The main advantage is that this function is faster.
Args:
privacy_loss_distribution: the privacy loss distribution to be composed
with the current privacy loss distribution. The two must have the same
value_discretization_interval.
epsilon: the epsilon in epsilon-hockey stick divergence.
Returns:
A non-negative real number which is the epsilon-hockey stick divergence
of the privacy loss distribution which is the result of composing this PLD
with privacy_loss_distribution.
"""
delta_remove = self._basic_pld_remove.get_delta_for_epsilon_for_composed_pld(
privacy_loss_distribution._basic_pld_remove, epsilon) # pylint:disable=protected-access
if self._symmetric and privacy_loss_distribution._symmetric: # pylint:disable=protected-access
return delta_remove
delta_add = self._basic_pld_add.get_delta_for_epsilon_for_composed_pld(
privacy_loss_distribution._basic_pld_add, epsilon) # pylint:disable=protected-access
return max(delta_remove, delta_add)
def self_compose(
self,
num_times: int,
tail_mass_truncation: float = 1e-15) -> 'PrivacyLossDistribution':
"""Computes PLD resulting from repeated composing the PLD with itself.
Args:
num_times: the number of times to compose this PLD with itself.
tail_mass_truncation: an upper bound on the tails of the probability mass
of the PLD that might be truncated. Currently only supports for
pessimistic estimates.
Returns:
A privacy loss distribution which is the result of the composition.
"""
basic_pld_remove = self._basic_pld_remove.self_compose(
num_times, tail_mass_truncation)
if self._symmetric:
return PrivacyLossDistribution.from_basic_privacy_loss_distribution(
basic_pld_remove)
basic_pld_add = self._basic_pld_add.self_compose(num_times,
tail_mass_truncation)
return PrivacyLossDistribution.from_add_remove_basic_plds(
basic_pld_remove, basic_pld_add)
def identity(
value_discretization_interval: float = 1e-4) -> PrivacyLossDistribution:
"""Constructs an identity privacy loss distribution.
Args:
value_discretization_interval: the dicretization interval for the privacy
loss distribution. The values will be rounded up/down to be integer
multiples of this number. Smaller value results in more accurate estimates
of the privacy loss, at the cost of increased run-time / memory usage.
Returns:
The privacy loss distribution corresponding to an algorithm with no
privacy leak (i.e. output is independent of input).
"""
return PrivacyLossDistribution({0: 1}, value_discretization_interval, 0)
def from_two_probability_mass_functions(
log_probability_mass_function_lower: Mapping[Any, float],
log_probability_mass_function_upper: Mapping[Any, float],
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
log_mass_truncation_bound: float = -math.inf) -> PrivacyLossDistribution:
"""Constructs a privacy loss distribution from mu_lower and mu_upper.
Args:
log_probability_mass_function_lower: the probability mass function of
mu_lower represented as a dictionary where each key is an outcome o of
mu_lower and the corresponding value is the natural log of the
probability mass of mu_lower at o.
log_probability_mass_function_upper: the probability mass function of
mu_upper represented as a dictionary where each key is an outcome o of
mu_upper and the corresponding value is the natural log of the
probability mass of mu_upper at o.
pessimistic_estimate: whether the rounding is done in such a way that the
resulting epsilon-hockey stick divergence computation gives an upper
estimate to the real value.
value_discretization_interval: the dicretization interval for the privacy
loss distribution. The values will be rounded up/down to be integer
multiples of this number. Smaller value results in more accurate estimates
of the privacy loss, at the cost of increased run-time / memory usage.
log_mass_truncation_bound: when the log of the probability mass of the upper
distribution is below this bound, it is either (i) included in
infinity_mass in the case of pessimistic estimate or (ii) discarded
completely in the case of optimistic estimate. The larger
log_mass_truncation_bound is, the more error it may introduce in
divergence calculations.
Returns:
The privacy loss distribution constructed as specified.
"""
infinity_mass = 0
for outcome in log_probability_mass_function_upper:
if log_probability_mass_function_lower.get(outcome, -math.inf) == -math.inf:
# When an outcome only appears in the upper distribution but not in the
# lower distribution, then it must be counted in infinity_mass as such
# an outcome contributes to the hockey stick divergence.
infinity_mass += math.exp(log_probability_mass_function_upper[outcome])
# Compute the (non-discretized) probability mass function for the privacy
# loss distribution.
probability_mass_function = {}
for outcome in log_probability_mass_function_lower:
if log_probability_mass_function_lower[outcome] == -math.inf:
# This outcome never occurs in mu_lower. This case was already included
# as infinity_mass above.
continue
elif (log_probability_mass_function_upper.get(outcome, -math.inf) >
log_mass_truncation_bound):
# When the probability mass of mu_upper at the outcome is greater than
# the threshold, add it to the distribution.
privacy_loss_value = (
log_probability_mass_function_upper[outcome] -
log_probability_mass_function_lower[outcome])
probability_mass_function[privacy_loss_value] = (
probability_mass_function.get(privacy_loss_value, 0) +
math.exp(log_probability_mass_function_upper[outcome]))
else:
if pessimistic_estimate:
# When the probability mass of mu_upper at the outcome is no more than
# the threshold and we would like to get a pessimistic estimate,
# account for this in infinity_mass.
infinity_mass += math.exp(
log_probability_mass_function_upper.get(outcome, -math.inf))
# Discretize the probability mass so that the values are integer multiples
# of value_discretization_interval
rounded_probability_mass_function = collections.defaultdict(lambda: 0)
round_fn = math.ceil if pessimistic_estimate else math.floor
for val in probability_mass_function:
rounded_probability_mass_function[round_fn(
val / value_discretization_interval)] += probability_mass_function[val]
return PrivacyLossDistribution(rounded_probability_mass_function,
value_discretization_interval,
infinity_mass,
pessimistic_estimate=pessimistic_estimate)
def create_from_additive_noise(
additive_noise_privacy_loss:
'privacy_loss_mechanism.AdditiveNoisePrivacyLoss',
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4) -> PrivacyLossDistribution:
"""Constructs the privacy loss distribution of an additive noise mechanism.
An additive noise mechanism for computing a scalar-valued function f is a
mechanism that outputs the sum of the true value of the function and a noise
drawn from a certain distribution mu. This function calculates the privacy
loss distribution for such an additive noise mechanism.
Args:
additive_noise_privacy_loss: the privacy loss representation of the
mechanism.
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
Returns:
The privacy loss distribution constructed as specified.
"""
round_fn = math.ceil if pessimistic_estimate else math.floor
tail_pld = additive_noise_privacy_loss.privacy_loss_tail()
rounded_probability_mass_function = collections.defaultdict(lambda: 0)
infinity_mass = tail_pld.tail_probability_mass_function.get(math.inf, 0)
for privacy_loss in tail_pld.tail_probability_mass_function:
if privacy_loss != math.inf:
rounded_probability_mass_function[round_fn(
privacy_loss / value_discretization_interval
)] += tail_pld.tail_probability_mass_function[privacy_loss]
if additive_noise_privacy_loss.discrete_noise:
xs = list(
range(
math.ceil(tail_pld.lower_x_truncation) - 1,
math.floor(tail_pld.upper_x_truncation) + 1))
# Compute PMF for the x's. Note that a vectorized call to mu_upper_cdf can
# be much faster than many scalar calls.
cdf_values = additive_noise_privacy_loss.mu_upper_cdf(xs)
probability_mass = cdf_values[1:] - cdf_values[:-1]
for x, prob in zip(xs[1:], probability_mass):
rounded_probability_mass_function[round_fn(
additive_noise_privacy_loss.privacy_loss(x) /
value_discretization_interval)] += prob
else:
lower_x = tail_pld.lower_x_truncation
rounded_down_value = math.floor(
additive_noise_privacy_loss.privacy_loss(lower_x) /
value_discretization_interval)
upper_x_privacy_loss = additive_noise_privacy_loss.privacy_loss(
tail_pld.upper_x_truncation)
# Compute discretization intervals for PLD approximation.
xs, rounded_values = [lower_x], []
x = lower_x
while x < tail_pld.upper_x_truncation:
if (value_discretization_interval * rounded_down_value <=
upper_x_privacy_loss):
x = tail_pld.upper_x_truncation
else:
x = additive_noise_privacy_loss.inverse_privacy_loss(
value_discretization_interval * rounded_down_value)
xs.append(x)
rounded_values.append(round_fn(rounded_down_value + 0.5))
rounded_down_value -= 1
# Compute PLD for discretization intervals. Note that a vectorized call to
# mu_upper_cdf is much faster than many scalar calls.
cdf_values = additive_noise_privacy_loss.mu_upper_cdf(xs)
probability_mass = cdf_values[1:] - cdf_values[:-1]
# Each x in [lower_x, upper_x] results in privacy loss that lies in
# [value_discretization_interval * rounded_down_value,
# value_discretization_interval * (rounded_down_value + 1)]
for rounded_value, prob in zip(rounded_values, probability_mass):
rounded_probability_mass_function[rounded_value] += prob
return PrivacyLossDistribution(
dict(rounded_probability_mass_function),
value_discretization_interval,
infinity_mass,
pessimistic_estimate=pessimistic_estimate)
def create_from_cdf(
cdf: Callable[[float], float],
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
tail_mass_truncation: float = 1e-15) -> PrivacyLossDistribution:
"""Constructs the privacy loss distribution from its cumulative density function.
Args:
cdf: the cumulative density function of the privacy loss distribution.
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
tail_mass_truncation: an upper bound on the tails of the probability mass of
the PLD that might be truncated.
Returns:
The privacy loss distribution constructed as specified.
"""
rounded_probability_mass_function = {}
# Construct the distribution for value greater than or equal to zero.
rounded_value = 1 if pessimistic_estimate else 0
value = 0
while cdf(value) < 1 - tail_mass_truncation / 2:
rounded_probability_mass_function[rounded_value] = (
cdf(value + value_discretization_interval) - cdf(value))
rounded_value += 1
value += value_discretization_interval
# Construct the distribution for value less than zero.
rounded_value = 0 if pessimistic_estimate else -1
value = 0
while cdf(value) > tail_mass_truncation / 2:
rounded_probability_mass_function[rounded_value] = (
cdf(value) - cdf(value - value_discretization_interval))
rounded_value -= 1
value -= value_discretization_interval
return PrivacyLossDistribution(
rounded_probability_mass_function,
value_discretization_interval,
tail_mass_truncation if pessimistic_estimate else 0,
pessimistic_estimate=pessimistic_estimate)
def from_randomized_response(
noise_parameter: float,
num_buckets: int,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4
) -> PrivacyLossDistribution:
"""Constructs the privacy loss distribution of Randomized Response.
The Randomized Response over k buckets with noise parameter p takes in an
input which is one of the k buckets. With probability 1 - p, it simply
outputs the input bucket. Otherwise, with probability p, it outputs a bucket
drawn uniformly at random from the k buckets.
This function calculates the privacy loss distribution for the
aforementioned Randomized Response with a given number of buckets, and a
given noise parameter.
Specifically, suppose that the original input is x and it is changed to x'.
Recall that the privacy loss distribution of the Randomized Response
mechanism is generated as follows: first pick o according to R(x), where
R(x) denote the output distribution of the Randomized Response mechanism
on input x. Then, the privacy loss is ln(Pr[R(x) = o] / Pr[R(x') = o]).
There are three cases here:
- When o = x, ln(Pr[R(x) = o] / Pr[R(x') = o]) =
ln(Pr[R(x) = x] / Pr[R(x') = x]). Here Pr[R(x) = x] = 1 - p + p / k
and Pr[R(x') = x] = p / k.
- When o = x', ln(Pr[R(x) = o] / Pr[R(x') = o]) =
ln(Pr[R(x') = x'] / Pr[R(x) = x']), which is just the negation of the
previous privacy loss.
- When o != x, x', the privacy loss is zero.
Args:
noise_parameter: the probability that the Randomized Response outputs a
completely random bucket.
num_buckets: the total number of possible input values (which is equal to
the total number of possible output values).
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
Returns:
The privacy loss distribution constructed as specified.
"""
if noise_parameter <= 0 or noise_parameter >= 1:
raise ValueError(f'Noise parameter must be strictly between 0 and 1: '
f'{noise_parameter}')
if num_buckets <= 1:
raise ValueError(
f'Number of buckets must be strictly greater than 1: {num_buckets}')
round_fn = math.ceil if pessimistic_estimate else math.floor
rounded_probability_mass_function = collections.defaultdict(lambda: 0)
# Probability that the output is equal to the input, i.e., Pr[R(x) = x]
probability_output_equal_input = ((1 - noise_parameter) +
noise_parameter / num_buckets)
# Probability that the output is equal to a specific bucket that is not the
# input, i.e., Pr[R(x') = x] for x' != x.
probability_output_not_input = noise_parameter / num_buckets
# Add privacy loss for the case o = x
rounded_value = round_fn(
math.log(probability_output_equal_input / probability_output_not_input)
/ value_discretization_interval)
rounded_probability_mass_function[
rounded_value] += probability_output_equal_input
# Add privacy loss for the case o = x'
rounded_value = round_fn(
math.log(probability_output_not_input / probability_output_equal_input)
/ value_discretization_interval)
rounded_probability_mass_function[
rounded_value] += probability_output_not_input
# Add privacy loss for the case o != x, x'
rounded_probability_mass_function[0] += (
probability_output_not_input * (num_buckets - 2))
return PrivacyLossDistribution(
rounded_probability_mass_function,
value_discretization_interval,
0,
pessimistic_estimate=pessimistic_estimate)
def _pld_for_subsampled_mechanism(
single_pld: Callable[[privacy_loss_mechanism.AdjacencyType],
PrivacyLossDistribution],
sampling_prob: float = 1.0) -> PrivacyLossDistribution:
"""Computes the privacy loss distribution for subsampled mechanisms.
It is assumed that when sub-sampling probability is 1, the privacy loss
distributions corresponding to ADD and REMOVE adjacencies are identical.
Args:
single_pld: method for computing the privacy loss distributions with respect
to ADD and REMOVE adjacency types.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
A symmetric privacy loss distribution when sampling_prob = 1; An
asymmetric privacy loss distribution corresponding to ADD and REMOVE
adjacency types when sampling_prob < 1.
"""
if sampling_prob == 1.0:
return single_pld(privacy_loss_mechanism.AdjacencyType.REMOVE)
return PrivacyLossDistribution.from_add_remove_basic_plds(
single_pld(privacy_loss_mechanism.AdjacencyType.REMOVE)._basic_pld, # pylint:disable=protected-access
single_pld(privacy_loss_mechanism.AdjacencyType.ADD)._basic_pld) # pylint:disable=protected-access
def from_laplace_mechanism(
parameter: float,
sensitivity: float = 1,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
sampling_prob: float = 1.0) -> PrivacyLossDistribution:
"""Computes the privacy loss distribution of the Laplace mechanism.
Args:
parameter: the parameter of the Laplace distribution.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the Laplace mechanism with
given parameters.
"""
def single_laplace_pld(
adjacency_type: privacy_loss_mechanism.AdjacencyType
) -> PrivacyLossDistribution:
return create_from_additive_noise(
privacy_loss_mechanism.LaplacePrivacyLoss(
parameter,
sensitivity=sensitivity,
sampling_prob=sampling_prob,
adjacency_type=adjacency_type),
pessimistic_estimate=pessimistic_estimate,
value_discretization_interval=value_discretization_interval)
return _pld_for_subsampled_mechanism(single_laplace_pld, sampling_prob)
def from_gaussian_mechanism(
standard_deviation: float,
sensitivity: float = 1,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
log_mass_truncation_bound: float = -50,
sampling_prob: float = 1.0) -> PrivacyLossDistribution:
"""Creates the privacy loss distribution of the Gaussian mechanism.
Args:
standard_deviation: the standard_deviation of the Gaussian distribution.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
log_mass_truncation_bound: the ln of the probability mass that might be
discarded from the noise distribution. The larger this number, the more
error it may introduce in divergence calculations.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the Gaussian mechanism with
given parameters.
"""
def single_gaussian_pld(
adjacency_type: privacy_loss_mechanism.AdjacencyType
) -> PrivacyLossDistribution:
return create_from_additive_noise(
privacy_loss_mechanism.GaussianPrivacyLoss(
standard_deviation,
sensitivity=sensitivity,
pessimistic_estimate=pessimistic_estimate,
log_mass_truncation_bound=log_mass_truncation_bound,
sampling_prob=sampling_prob,
adjacency_type=adjacency_type),
pessimistic_estimate=pessimistic_estimate,
value_discretization_interval=value_discretization_interval)
return _pld_for_subsampled_mechanism(single_gaussian_pld, sampling_prob)
def from_discrete_laplace_mechanism(
parameter: float,
sensitivity: int = 1,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
sampling_prob: float = 1.0) -> PrivacyLossDistribution:
"""Computes the privacy loss distribution of the Discrete Laplace mechanism.
Args:
parameter: the parameter of the discrete Laplace distribution.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the Discrete Laplace
mechanism with given parameters.
"""
def single_discrete_laplace_pld(
adjacency_type: privacy_loss_mechanism.AdjacencyType
) -> PrivacyLossDistribution:
return create_from_additive_noise(
privacy_loss_mechanism.DiscreteLaplacePrivacyLoss(
parameter,
sensitivity=sensitivity,
sampling_prob=sampling_prob,
adjacency_type=adjacency_type),
pessimistic_estimate=pessimistic_estimate,
value_discretization_interval=value_discretization_interval)
return _pld_for_subsampled_mechanism(single_discrete_laplace_pld,
sampling_prob)
def from_discrete_gaussian_mechanism(
sigma: float,
sensitivity: int = 1,
truncation_bound: Optional[int] = None,
pessimistic_estimate: bool = True,
value_discretization_interval: float = 1e-4,
sampling_prob: float = 1.0) -> PrivacyLossDistribution:
"""Creates the privacy loss distribution of the discrete Gaussian mechanism.
Args:
sigma: the parameter of the discrete Gaussian distribution. Note that
unlike the (continuous) Gaussian distribution this is not equal to the
standard deviation of the noise.
sensitivity: the sensitivity of function f. (i.e. the maximum absolute
change in f when an input to a single user changes.)
truncation_bound: bound for truncating the noise, i.e. the noise will only
have a support in [-truncation_bound, truncation_bound]. When not
specified, truncation_bound will be chosen in such a way that the mass
of the noise outside of this range is at most 1e-30.
pessimistic_estimate: a value indicating whether the rounding is done in
such a way that the resulting epsilon-hockey stick divergence computation
gives an upper estimate to the real value.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
sampling_prob: sub-sampling probability, a value in (0,1].
Returns:
The privacy loss distribution corresponding to the discrete Gaussian
mechanism with given parameters.
"""
def single_discrete_gaussian_pld(
adjacency_type: privacy_loss_mechanism.AdjacencyType
) -> PrivacyLossDistribution:
return create_from_additive_noise(
privacy_loss_mechanism.DiscreteGaussianPrivacyLoss(
sigma,
sensitivity=sensitivity,
truncation_bound=truncation_bound,
sampling_prob=sampling_prob,
adjacency_type=adjacency_type),
pessimistic_estimate=pessimistic_estimate,
value_discretization_interval=value_discretization_interval)
return _pld_for_subsampled_mechanism(single_discrete_gaussian_pld,
sampling_prob)
def from_privacy_parameters(
privacy_parameters: common.DifferentialPrivacyParameters,
value_discretization_interval: float = 1e-4) -> PrivacyLossDistribution:
"""Constructs pessimistic PLD from epsilon and delta parameters.
When the mechanism is (epsilon, delta)-differentially private, the following
is a pessimistic estimate of its privacy loss distribution (see Section 3.5
of the supplementary material for more explanation):
- infinity with probability delta.
- epsilon with probability (1 - delta) / (1 + exp(-eps))
- -epsilon with probability (1 - delta) / (1 + exp(eps))
Args:
privacy_parameters: the privacy guarantee of the mechanism.
value_discretization_interval: the length of the dicretization interval for
the privacy loss distribution. The values will be rounded up/down to be
integer multiples of this number. Smaller value results in more accurate
estimates of the privacy loss, at the cost of increased run-time / memory
usage.
Returns:
The privacy loss distribution constructed as specified.
"""
delta = privacy_parameters.delta
epsilon = privacy_parameters.epsilon
rounded_probability_mass_function = collections.defaultdict(lambda: 0)
rounded_probability_mass_function[math.ceil(
epsilon /
value_discretization_interval)] = (1 - delta) / (1 + math.exp(-epsilon))
rounded_probability_mass_function[math.ceil(
-epsilon /
value_discretization_interval)] += (1 - delta) / (1 + math.exp(epsilon))
return PrivacyLossDistribution(rounded_probability_mass_function,
value_discretization_interval,
privacy_parameters.delta)