// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Package privacy contains methods for computing privacy encoding parameters and the error of the privacy encoding.
package privacy
import (
"fmt"
"math"
"math/cmplx"
"gonum.org/v1/gonum/dsp/fourier"
)
// ProbabilityMassFunction represents the probability mass function of a discrete probability distribution.
// TODO(https://fxbug.dev/296499681): Consider using only the shiftedDistribution type representation of ProbabilityMassFunction.
type ProbabilityMassFunction map[int]float64
// PrivacyLossDistribution represents a privacy loss distribution (PLD) defined by a probability
// mass function whose support is discretized into intervals of width given by the discretization parameter.
type PrivacyLossDistribution struct {
discretization float64
infiniteMass float64 // probability of getting a value Inf in PMF of this PLD.
pmf ProbabilityMassFunction
	// The estimation can be optimistic, in which case we may underestimate the probabilities of some
	// values in the PLD, or pessimistic, in which case we always overestimate the mass of the values in the PLD.
// For more details see Section 2.1 in https://github.com/google/differential-privacy/blob/main/common_docs/Privacy_Loss_Distributions.pdf.
pessimisticEstimation bool
}
// shiftedDistribution is an array representation of a ProbabilityMassFunction:
// array[i] stores the probability of the value i+shift.
type shiftedDistribution struct {
array []float64
shift int
}
func (pld *PrivacyLossDistribution) getPLDDiscretization() float64 {
return pld.discretization
}
func (pld *PrivacyLossDistribution) getPLDInfiniteMass() float64 {
return pld.infiniteMass
}
func (pld *PrivacyLossDistribution) getPLDPessimisticEstimate() bool {
return pld.pessimisticEstimation
}
func (pld *PrivacyLossDistribution) getPLDPmf() ProbabilityMassFunction {
return pld.pmf
}
// computeTruncationValueBounds, for an input array and truncation mass, computes indices of the
// array such that all values outside of these boundaries (with indices strictly smaller than
// minBound or strictly greater than maxBound) correspond to tail values of the distribution whose
// mass sums to at most truncatedMass/2 for each tail.
// This truncation is used when computing the convolution of a pair of distributions.
// Note that this function can return minBound > maxBound (in that case, all elements need to be
// truncated).
func computeTruncationValueBounds(distrArray []float64, truncatedMass float64) (int, int) {
sum := 0.0
var minBound int
for minBound = 0; minBound < len(distrArray); minBound++ {
if (sum + distrArray[minBound]) <= truncatedMass/2 {
sum += distrArray[minBound]
} else {
break
}
}
sum = 0.0
var maxBound int
for maxBound = len(distrArray) - 1; maxBound >= 0; maxBound-- {
if (sum + distrArray[maxBound]) <= truncatedMass/2 {
sum += distrArray[maxBound]
} else {
break
}
}
return minBound, maxBound
}
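// exampleComputeTruncationValueBounds is an illustrative sketch (not part of the original API).
// It shows how computeTruncationValueBounds trims both tails: with a truncation budget of 0.2,
// each tail may absorb at most 0.1 of mass, so the outer entries 0.02+0.03 on each side are
// truncated and only indices 2..3 are kept.
func exampleComputeTruncationValueBounds() {
	distr := []float64{0.02, 0.03, 0.4, 0.5, 0.03, 0.02}
	minBound, maxBound := computeTruncationValueBounds(distr, 0.2)
	fmt.Println(minBound, maxBound) // prints: 2 3
}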
// minMaxKeyValues for an input PMF finds the minimum and maximum values respectively in the support
// of the distribution. The function returns (0, 0) if the distribution is empty.
func minMaxKeyValues(pmf ProbabilityMassFunction) (int, int) {
if len(pmf) == 0 {
return 0, 0
}
max := math.MinInt
min := math.MaxInt
for key := range pmf {
if key > max {
max = key
}
if key < min {
min = key
}
}
return min, max
}
// rescaleFFTArray divides all values in the input array by the length of the array.
// This is required for post-processing FFT computations, since a forward FFT followed by an
// inverse FFT returns values multiplied by the size of the distribution (the number of elements in it).
func rescaleFFTArray(array []float64) []float64 {
scale := float64(len(array))
if scale == 0 || scale == 1 {
return array
}
result := make([]float64, len(array))
for i, value := range array {
result[i] = value / scale
}
return result
}
// convertMapToShiftedArray converts an input PMF to an array representation by shifting the integer
// keys so that the minimum key maps to index 0 in the array.
// The array representation of the distribution is necessary for applying the FFT.
// If the distribution only has positive keys, this shifting makes the array smaller and hence the
// computation faster.
func convertMapToShiftedArray(pmf ProbabilityMassFunction, size int) (shiftedDistribution, error) {
	if size < 0 {
		return shiftedDistribution{nil, 0}, fmt.Errorf("size parameter for array is negative, golang cannot create a slice of this size: %d, for distribution: %v", size, pmf)
	}
array := make([]float64, size)
shift, _ := minMaxKeyValues(pmf)
result := shiftedDistribution{array, shift}
for key := range pmf {
index := key - shift
if index >= size || index < 0 {
return shiftedDistribution{nil, 0.0}, fmt.Errorf("Incorrect initial size estimation for converting distribution map to array, expected size at least %d, got index %d", size, index)
}
result.array[index] = pmf[key]
}
return result, nil
}
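// exampleConvertMapToShiftedArray is an illustrative sketch (not part of the original API).
// The PMF {3: 0.2, 5: 0.8} has minimum key 3, so it is represented by the array [0.2, 0, 0.8]
// with shift 3: array[i] holds the probability of the key i+3.
func exampleConvertMapToShiftedArray() {
	pmf := ProbabilityMassFunction{3: 0.2, 5: 0.8}
	distr, err := convertMapToShiftedArray(pmf, 3)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(distr.array, distr.shift) // prints: [0.2 0 0.8] 3
}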
// convolveHeterogeneousDistributions uses the FFT to compute the convolution of two discrete distributions ((mu1 * mu2)(g) = \sum_{h in G} mu1(h)mu2(g-h)).
// The convolution is required for PLD computations: first we construct two PLDs; then, using the
// property that the PLD of a product distribution is the convolution of the individual PLDs, we
// compute the final PLD as the convolution of the two.
func convolveHeterogeneousDistributions(pmf1, pmf2 ProbabilityMassFunction, truncatedMass float64) (ProbabilityMassFunction, error) {
	// Compute the final array length so that it has enough space for all coefficients of the convolution.
minKey1, maxKey1 := minMaxKeyValues(pmf1)
minKey2, maxKey2 := minMaxKeyValues(pmf2)
minArrayLength := maxKey1 + maxKey2 + 1 - minKey1 - minKey2
	// The running time of FFT operations depends on the length of the array and
	// the size of the largest prime divisor of that length. We use the
	// next power of 2 to improve performance.
//
// TODO(https://fxbug.dev/296499492): Find something better similar to scipy.fft.next_fast_len
finalArrayLength := int(math.Pow(2, math.Ceil(math.Log2(float64(minArrayLength)))))
	// To apply the FFT we convert each map to an array, shifting all keys so the smallest key maps to index 0.
shiftedDistr1, err := convertMapToShiftedArray(pmf1, finalArrayLength)
if err != nil {
return nil, err
}
shiftedDistr2, err := convertMapToShiftedArray(pmf2, finalArrayLength)
if err != nil {
return nil, err
}
	fft := fourier.NewFFT(finalArrayLength)
coeff1 := fft.Coefficients(nil, shiftedDistr1.array)
coeff2 := fft.Coefficients(nil, shiftedDistr2.array)
	// Compute the convolution using the convolution theorem: pointwise multiplication in the Fourier domain.
	for i := range coeff1 {
		coeff1[i] *= coeff2[i]
	}
inverseFft := fft.Sequence(nil, coeff1)
rescaledResultArray := rescaleFFTArray(inverseFft)
// Compute distribution tails and then truncate corresponding values from the final distribution.
minNotTruncatedValue, maxNotTruncatedValue := computeTruncationValueBounds(rescaledResultArray, truncatedMass)
	// Convert the shifted discretized array back to a map keyed by the original (unshifted) values.
resultPmf := make(ProbabilityMassFunction)
for i := minNotTruncatedValue; i <= maxNotTruncatedValue; i++ {
if rescaledResultArray[i] > 0 {
resultPmf[i+shiftedDistr1.shift+shiftedDistr2.shift] = rescaledResultArray[i]
}
}
return resultPmf, nil
}
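// exampleConvolveHeterogeneousDistributions is an illustrative sketch (not part of the original API).
// Convolving the fair-coin PMF {0: 0.5, 1: 0.5} with itself yields approximately
// {0: 0.25, 1: 0.5, 2: 0.25}, up to floating-point error from the FFT round trip.
func exampleConvolveHeterogeneousDistributions() {
	coin := ProbabilityMassFunction{0: 0.5, 1: 0.5}
	result, err := convolveHeterogeneousDistributions(coin, coin, 1e-9)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(result) // prints approximately: map[0:0.25 1:0.5 2:0.25]
}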
// computeTruncationValueBoundsForHomogeneousConvolution, for an input array and truncation mass,
// computes indices of the array that correspond to tail values of the distribution obtained by
// convolving distrArray with itself numTimes times.
// The mass truncated from each of the two tails is at most truncatedMass/2.
// We use bound (5) from the supplementary material to compute these truncation bounds:
// https://github.com/google/differential-privacy/blob/main/common_docs/Privacy_Loss_Distributions.pdf
func computeTruncationValueBoundsForHomogeneousConvolution(distrArray []float64, numTimes uint64, truncatedMass float64) (int, int) {
	arraySize := len(distrArray)
upperBound := (arraySize - 1) * int(numTimes)
lowerBound := 0
if arraySize == 0 {
return 0, 0
}
if truncatedMass == 0 {
return lowerBound, upperBound
}
// TODO(https://fxbug.dev/195348085): Explain the choice of orders.
for order := -20; order <= 20; order++ {
if order == 0 {
continue
}
rescaledOrder := float64(order) / float64(arraySize)
		// Evaluate the moment-generating function at rescaledOrder, i.e. the expectation E[e^{rescaledOrder*X}].
momentValue := 0.0
// TODO(https://fxbug.dev/296499687): Is it possible to use LogSumExp (like scipy.special.logsumexp in python)?
for value, prob := range distrArray {
momentValue += math.Exp(float64(value)*rescaledOrder) * prob
}
// compute bound from (5) in the supplementary material
bound := (float64(numTimes)*math.Log(momentValue) + math.Log(2.0/truncatedMass)) / rescaledOrder
if order > 0 {
upperBound = int(math.Min(float64(upperBound), math.Ceil(bound)))
} else if order < 0 {
lowerBound = int(math.Max(float64(lowerBound), math.Floor(bound)))
}
}
return lowerBound, upperBound
}
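// exampleHomogeneousTruncationBounds is an illustrative sketch (not part of the original API).
// For the fair-coin array [0.5, 0.5] convolved with itself 30 times the full support is [0, 30];
// the returned window is contained in [0, 30] and leaves out at most 1e-6 of the convolution's
// total mass (at most 5e-7 per tail).
func exampleHomogeneousTruncationBounds() {
	lowerBound, upperBound := computeTruncationValueBoundsForHomogeneousConvolution([]float64{0.5, 0.5}, 30, 1e-6)
	fmt.Println(lowerBound, upperBound) // a sub-interval of [0, 30]
}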
// convolveHomogeneousDistributions uses the FFT to compute the convolution of the input PMF with itself numTimes times.
// Used to compute the PLD of mu^k, since it is the k-fold convolution of the PLD of mu.
func convolveHomogeneousDistributions(pmf ProbabilityMassFunction, numTimes uint64, truncatedMass float64) (ProbabilityMassFunction, error) {
	if numTimes == 0 {
		return nil, fmt.Errorf("incorrect numTimes for homogeneous convolution: must be at least 1, found %d", numTimes)
}
if numTimes == 1 {
return pmf, nil
}
	// To apply the FFT we convert the map to an array, shifting all keys so the smallest key maps to index 0.
	// This temporary shiftedDistribution is used only to compute the truncation bounds.
minKey, maxKey := minMaxKeyValues(pmf)
tempDistr, err := convertMapToShiftedArray(pmf, maxKey-minKey+1)
if err != nil {
return nil, err
}
	// The numTimes-fold convolution can produce a very large array, so, to avoid truncating in a
	// post-processing step, we precompute the truncation bounds and compute the FFT only for the
	// non-truncated part of the array.
minBound, maxBound := computeTruncationValueBoundsForHomogeneousConvolution(tempDistr.array, numTimes, truncatedMass)
if minBound > maxBound {
return nil, fmt.Errorf("Incorrect truncation bound for homogeneous convolution: minBound (%d) is greater than maxBound (%d)", minBound, maxBound)
}
	// The final array length defines how many Fourier coefficients we compute.
	// It should be large enough to hold the non-truncated part of the convolution and the initial PMF array.
minArrayLength := maxBound - minBound + 1
	// The running time of FFT operations depends on the length of the array and
	// the size of the largest prime divisor of that length. We use the
	// next power of 2 to improve performance.
//
// TODO(https://fxbug.dev/296499492): Find something better similar to scipy.fft.next_fast_len
finalArrayLength := int(math.Pow(2, math.Ceil(math.Log2(float64(minArrayLength)))))
	// Convert the map to an array of the final length, shifting all keys so the smallest key maps to index 0.
shiftedDistr, err := convertMapToShiftedArray(pmf, finalArrayLength)
if err != nil {
return nil, err
}
fft := fourier.NewFFT(len(shiftedDistr.array))
coeff := fft.Coefficients(nil, shiftedDistr.array)
for i := range coeff {
coeff[i] = cmplx.Pow(coeff[i], complex(float64(numTimes), 0))
}
inverseFft := fft.Sequence(nil, coeff)
// To normalize values after forward and inverse FFT we rescale all values by the length of the array
rescaledResultArray := rescaleFFTArray(inverseFft)
	// For the FFT computation we converted the map to a shifted discretized array; now, to return a map,
	// we reverse the transformation. Indices are taken modulo finalArrayLength because the FFT computes
	// a circular convolution.
resultPmf := make(ProbabilityMassFunction)
for i := minBound; i <= maxBound; i++ {
modI := i % finalArrayLength
if rescaledResultArray[modI] > 0 {
resultPmf[i+int(numTimes)*shiftedDistr.shift] = rescaledResultArray[modI]
}
}
return resultPmf, nil
}
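// exampleConvolveHomogeneousDistributions is an illustrative sketch (not part of the original API).
// Convolving the fair-coin PMF {0: 0.5, 1: 0.5} with itself 3 times yields the Binomial(3, 0.5)
// PMF {0: 0.125, 1: 0.375, 2: 0.375, 3: 0.125}, up to floating-point error.
func exampleConvolveHomogeneousDistributions() {
	coin := ProbabilityMassFunction{0: 0.5, 1: 0.5}
	result, err := convolveHomogeneousDistributions(coin, 3, 1e-9)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(result) // prints approximately: map[0:0.125 1:0.375 2:0.375 3:0.125]
}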
// generatePLD generates the privacy loss distribution (PLD) of two PMFs.
// It is computed using the formula PLD_{D1/D2}(y) = \sum_{o: y = ln(D1(o)/D2(o))} D1(o).
// For more details see Section 2 in https://github.com/google/differential-privacy/blob/main/common_docs/Privacy_Loss_Distributions.pdf.
// We refer to upperPmf as the distribution in the numerator and to lowerPmf as the distribution in the denominator.
func generatePLD(upperPmf, lowerPmf ProbabilityMassFunction, logTruncatedMass, discretization float64, pessimistic bool) *PrivacyLossDistribution {
	infMass := 0.0
	// For every output o whose probability under upperPmf is non-zero but is zero under lowerPmf,
	// log(upperPmf(o)/lowerPmf(o)) = inf.
	// Hence we add the mass upperPmf(o) to the infinite mass.
for value, prob := range upperPmf {
if prob > 0 && lowerPmf[value] == 0 {
infMass += upperPmf[value]
}
}
roundedPmf := make(ProbabilityMassFunction)
// Compute probability of non-infinite values in PLD.
for value := range lowerPmf {
if lowerPmf[value] != 0 && upperPmf[value] != 0 {
// Values in the support of PLD are of the form ln{upperPmf(o)/lowerPmf(o)} for output o in support of upperPmf.
logLowerMass := math.Log(lowerPmf[value])
logUpperMass := math.Log(upperPmf[value])
			// Check that this mass is large enough to matter for the current estimation.
			// If it is too small, it is added to the infinite mass under pessimistic estimation (see below).
if logUpperMass > logTruncatedMass {
scaledPrivacyLoss := (logUpperMass - logLowerMass) / float64(discretization)
// Discretizing floating point privacy loss values to integers.
if pessimistic {
roundedPmf[int(math.Ceil(scaledPrivacyLoss))] += upperPmf[value]
} else {
roundedPmf[int(math.Floor(scaledPrivacyLoss))] += upperPmf[value]
}
} else if pessimistic {
				// For a pessimistic estimate we want to account for any truncated mass. Here we treat it as
				// infinite mass by increasing infMass by the probability of the truncated outcome.
infMass += upperPmf[value]
}
}
}
pld := &PrivacyLossDistribution{
discretization: discretization,
infiniteMass: infMass,
pmf: roundedPmf,
pessimisticEstimation: pessimistic}
return pld
}
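// exampleGeneratePLD is an illustrative sketch (not part of the original API).
// For binary randomized response with flipping probability 0.25 the privacy loss takes the
// values +-ln(3). With discretization 1e-4 the resulting pessimistic PLD places mass 0.75 at
// the key ceil(ln(3)/1e-4) = 10987 and mass 0.25 at the key ceil(-ln(3)/1e-4) = -10986, with
// zero infinite mass. A logTruncatedMass of -Inf means no probability mass is truncated.
func exampleGeneratePLD() {
	upperPmf := ProbabilityMassFunction{0: 0.75, 1: 0.25}
	lowerPmf := ProbabilityMassFunction{0: 0.25, 1: 0.75}
	pld := generatePLD(upperPmf, lowerPmf, math.Inf(-1), 1e-4, true)
	fmt.Println(pld.infiniteMass, pld.pmf) // prints: 0 map[-10986:0.25 10987:0.75]
}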
// composeHomogeneousPLD computes the PLD of the numTimes-fold product of a distribution with itself, based on the PLD of that distribution.
func composeHomogeneousPLD(pld *PrivacyLossDistribution, numTimes uint64, truncatedMass float64) (*PrivacyLossDistribution, error) {
var pmf ProbabilityMassFunction
var err error
// Below is a numerically stable version of 1 - math.Pow(1-pld.infiniteMass, float64(numTimes))
newInfiniteMass := -math.Expm1(float64(numTimes) * math.Log1p(-pld.infiniteMass))
	// Truncation is currently supported only for the pessimistic bound computation.
if pld.pessimisticEstimation {
pmf, err = convolveHomogeneousDistributions(pld.pmf, numTimes, truncatedMass)
newInfiniteMass += truncatedMass
} else {
if truncatedMass != 0 {
return nil, fmt.Errorf("Currently truncation for optimistic estimation is not supported")
}
pmf, err = convolveHomogeneousDistributions(pld.pmf, numTimes, 0.0)
}
if err != nil {
return nil, err
}
resultPld := &PrivacyLossDistribution{
discretization: pld.discretization,
infiniteMass: newInfiniteMass,
pmf: pmf,
pessimisticEstimation: pld.pessimisticEstimation}
return resultPld, nil
}
// composeHeterogeneousPLD computes the PLD of the product of two distinct distributions, based on the PLDs of each of the distributions.
func composeHeterogeneousPLD(pld1, pld2 *PrivacyLossDistribution, truncatedMass float64) (*PrivacyLossDistribution, error) {
var pmf ProbabilityMassFunction
var err error
if pld1.discretization != pld2.discretization {
return nil, fmt.Errorf("Composing PLDs with distinct discretization is not supported")
}
if pld1.pessimisticEstimation != pld2.pessimisticEstimation {
return nil, fmt.Errorf("Composing PLDs with mixed estimation types (one is optimistic, other is pessimistic) is not supported")
}
	// To compute the new infinite mass we use the formula for the probability of the union of two independent events.
newInfiniteMass := pld1.infiniteMass + pld2.infiniteMass - pld1.infiniteMass*pld2.infiniteMass
if pld1.pessimisticEstimation {
// Truncated mass gets added to infiniteMass.
// TODO(https://fxbug.dev/296500421): Only add the mass that is actually truncated instead of `truncatedMass`.
newInfiniteMass += truncatedMass
}
pmf, err = convolveHeterogeneousDistributions(pld1.pmf, pld2.pmf, truncatedMass)
if err != nil {
return nil, err
}
resultPld := &PrivacyLossDistribution{
discretization: pld1.discretization,
infiniteMass: newInfiniteMass,
pmf: pmf,
pessimisticEstimation: pld1.pessimisticEstimation}
return resultPld, nil
}
// getDeltaForPrivacyLossDistribution computes a lower bound such that all deltas greater than
// this bound guarantee that the corresponding algorithm satisfies (epsilon, delta)-DP.
// These computations are based on Observation 2 in supplementary material
// https://github.com/google/differential-privacy/blob/main/common_docs/Privacy_Loss_Distributions.pdf
func (pld *PrivacyLossDistribution) getDeltaForPrivacyLossDistribution(epsilon float64) float64 {
divergence := pld.infiniteMass
for discPrivacyLoss, prob := range pld.pmf {
privacyLoss := float64(discPrivacyLoss) * pld.discretization
// By Observation 2 it suffices to consider only positive values of (1-e^{epsilon-privacyLoss})
if privacyLoss > epsilon && prob > 0 {
// Below is a numerically stable version of (1 - math.Exp(epsilon-privacyLoss)) * prob
divergence += -math.Expm1(epsilon-privacyLoss) * prob
}
}
return divergence
}
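// exampleComposeAndGetDelta is an illustrative end-to-end sketch (not part of the original API).
// It builds the PLD of binary randomized response with flipping probability 0.25, composes it
// with itself twice, and reads off a delta for a given epsilon. The composed privacy loss never
// exceeds 2*ln(3) plus discretization error (about 2.1974), so for epsilon = 2.2 the returned
// delta is just the infinite mass, which here equals the truncated mass 1e-9.
func exampleComposeAndGetDelta() {
	upperPmf := ProbabilityMassFunction{0: 0.75, 1: 0.25}
	lowerPmf := ProbabilityMassFunction{0: 0.25, 1: 0.75}
	singlePld := generatePLD(upperPmf, lowerPmf, math.Inf(-1), 1e-4, true)
	composedPld, err := composeHomogeneousPLD(singlePld, 2, 1e-9)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(composedPld.getDeltaForPrivacyLossDistribution(2.2)) // prints approximately: 1e-09
}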