| // |
| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| package dpagg |
| |
| import ( |
| "fmt" |
| "math" |
| |
| "github.com/google/differential-privacy/go/v2/checks" |
| "github.com/google/differential-privacy/go/v2/noise" |
| "github.com/google/differential-privacy/go/v2/rand" |
| ) |
| |
| // PreAggSelectPartition is used to compute an (ε,δ)-differentially private decision |
| // of whether to materialize a partition. |
| // |
| // Many differential privacy mechanisms work by performing an aggregation and |
| // adding noise. They achieve (ε_m,δ_m)-differential privacy under the |
| // assumption that partitions are chosen in advance. In other words, they assume that |
| // even if no data is associated with a partition, noise is added to the empty |
| // aggregation, and the noisy result is materialized. However, when only |
| // partitions containing data are materialized, such mechanisms fail to protect |
| // privacy for partitions containing data from a single privacy ID (e.g., |
| // user). To fix this, partitions with small numbers of privacy IDs must |
| // sometimes be dropped in order to maintain privacy. This process of partition |
| // selection incurs an additional (ε,δ) differential privacy budget resulting |
| // in a total differential privacy budget of (ε+ε_m,δ+δ_m) being used for the |
| // aggregation with partition selection. |
| // |
| // Depending on the l0sensitivity, the PreAggSelectPartition uses one of two |
| // differentially private partition selection algorithms. |
| // |
| // When l0sensitivity ≤ 3, the partition selection process is made (ε,δ) |
| // differentially private by applying the definition of differential privacy to |
| // the count of privacy IDs. Supposing l0Sensitivity bounds the number of partitions |
| // a privacy ID may contribute to, we define: |
| // pε := ε/l0Sensitivity |
| // pδ := δ/l0Sensitivity |
| // to be the per-partition differential privacy losses incurred by the partition |
| // selection process. Letting n denote the number of privacy IDs in a partition, |
| // the probability of selecting a partition is given by the following recurrence |
| // relation: |
| // keepPartitionProbability(n) = min( |
| // keepPartitionProbability(n-1) * exp(pε) + pδ, (1) |
| // 1 - exp(-pε) * (1-keepPartitionProbability(n-1)-pδ), (2) |
| // 1 (3) |
| // ) |
| // with base case keepPartitionProbability(0) = 0. This formula is optimal in terms of |
| // maximizing the likelihood of selecting a partition under (ε,δ)-differential |
| // privacy, with the caveat that the input values for pε and pδ are lower bound |
| // approximations. For efficiency, we use a closed-form solution to this |
| // recurrence relation. See [Differentially private partition selection paper] |
| // https://arxiv.org/pdf/2006.03684.pdf for details on the underlying mathematics. |
| // |
| // When l0sensitivity > 3, the partition selection process is made (ε,δ) |
| // differentially private by using the ThresholdedResult() of the Count primitive |
| // with Gaussian noise. Count computes a (ε,δ/2) differentially private count of |
| // the privacy IDs in a partition by adding Gaussian noise. Then, it computes |
| // a threshold T for which the probability that a (ε,δ/2) differentially private |
| // count of a single privacy ID can exceed T is δ/2. It keeps the partition iff |
| // differentially private count exceeds the threshold. |
| // |
| // The reason two different algorithms for deciding whether to keep a partition |
| // is used is because the first algorithm ("magic partition selection") is optimal |
| // when l0sensitivity ≤ 3 but is outperformed by Gaussian-based thresholding when |
| // l0sensitivity > 3. |
| // |
| // PreAggSelectPartition is a utility for maintaining the count of IDs in a single |
| // partition and then determining whether the partition should be |
| // materialized. Use Increment() to increment the count of IDs and ShouldKeepPartition() to decide |
| // if the partition should be materialized. |
| type PreAggSelectPartition struct { |
| // parameters |
| epsilon float64 |
| delta float64 |
| l0Sensitivity int64 |
| |
| // State variables |
| // idCount is the count of unique privacy IDs in the partition. |
| idCount int64 |
| state aggregationState |
| } |
| |
| func preAggSelectPartitionEquallyInitialized(s1, s2 *PreAggSelectPartition) bool { |
| return s1.epsilon == s2.epsilon && |
| s1.delta == s2.delta && |
| s1.l0Sensitivity == s2.l0Sensitivity && |
| s1.state == s2.state |
| } |
| |
| // PreAggSelectPartitionOptions is used to set the privacy parameters when |
| // constructing a PreAggSelectPartition. |
| type PreAggSelectPartitionOptions struct { |
| // Epsilon and Delta specify the (ε,δ)-differential privacy budget used for |
| // partition selection. Required. |
| Epsilon float64 |
| Delta float64 |
| // MaxPartitionsContributed is the number of distinct partitions a single |
| // privacy unit can contribute to. Defaults to 1. |
| MaxPartitionsContributed int64 |
| } |
| |
| // NewPreAggSelectPartition constructs a new PreAggSelectPartition from opt. |
| func NewPreAggSelectPartition(opt *PreAggSelectPartitionOptions) (*PreAggSelectPartition, error) { |
| s := PreAggSelectPartition{ |
| epsilon: opt.Epsilon, |
| delta: opt.Delta, |
| l0Sensitivity: opt.MaxPartitionsContributed, |
| } |
| // Override the 0-default, but do not override any explicitly set (i.e., negative) values |
| // for l0Sensitivity. |
| if s.l0Sensitivity == 0 { |
| s.l0Sensitivity = 1 |
| } |
| |
| if err := checks.CheckDeltaStrict(s.delta); err != nil { |
| return nil, fmt.Errorf("NewPreAggSelectPartition: %v", err) |
| } |
| // ε=0 is theoretically acceptable, but in practice it's probably an error, |
| // so we do not accept it as argument. |
| if err := checks.CheckEpsilonStrict(s.epsilon); err != nil { |
| return nil, fmt.Errorf("NewPreAggSelectPartition: %v", err) |
| } |
| if err := checks.CheckL0Sensitivity(s.l0Sensitivity); err != nil { |
| return nil, fmt.Errorf("NewPreAggSelectPartition: %v", err) |
| } |
| return &s, nil |
| } |
| |
| // Increment increments the ids count by one. |
| // The caller must ensure this methods called at most once per privacy ID. |
| func (s *PreAggSelectPartition) Increment() error { |
| if s.state != defaultState { |
| return fmt.Errorf("PreAggSelectPartition cannot be amended: %v", s.state.errorMessage()) |
| } |
| s.idCount++ |
| return nil |
| } |
| |
| // Merge merges s2 into s (i.e., add the idCount of s2 to s). This implicitly |
| // assumes that s and s2 act on distinct privacy IDs. s2 is consumed by this |
| // operation: s2 may not be used after it is merged into s. |
| // |
| // Preconditions: s and s2 must have the same privacy parameters. In addition, |
| // ShouldKeepPartition() may not be called yet for either s or s2. |
| func (s *PreAggSelectPartition) Merge(s2 *PreAggSelectPartition) error { |
| if err := checkMergePreAggSelectPartition(s, s2); err != nil { |
| return err |
| } |
| |
| s.idCount += s2.idCount |
| s2.state = merged |
| return nil |
| } |
| |
| func checkMergePreAggSelectPartition(s1, s2 *PreAggSelectPartition) error { |
| if s1.state != defaultState { |
| return fmt.Errorf("checkMergePreAggSelectPartition: s1 cannot be merged with another PreAggSelectPartition instance: %v", s1.state.errorMessage()) |
| } |
| if s2.state != defaultState { |
| return fmt.Errorf("checkMergePreAggSelectPartition: s2 cannot be merged with another PreAggSelectPartition instance: %v", s2.state.errorMessage()) |
| } |
| |
| if !preAggSelectPartitionEquallyInitialized(s1, s2) { |
| return fmt.Errorf("checkMergePreAggSelectPartition: s1 and s2 are not compatible") |
| } |
| |
| return nil |
| } |
| |
| // ShouldKeepPartition returns whether the partition should be materialized. |
| func (s *PreAggSelectPartition) ShouldKeepPartition() (bool, error) { |
| if s.state != defaultState { |
| return false, fmt.Errorf("PreAggSelectPartition's ShouldKeepPartition cannot be computed: %v", s.state.errorMessage()) |
| } |
| s.state = resultReturned |
| if s.l0Sensitivity > 3 { // Gaussian thresholding outperforms in this case. |
| c, err := NewCount(&CountOptions{ |
| Epsilon: s.epsilon, |
| Delta: s.delta / 2, |
| MaxPartitionsContributed: s.l0Sensitivity, |
| Noise: noise.Gaussian()}) |
| if err != nil { |
| return false, fmt.Errorf("couldn't initialize count for PreAggSelectPartition: %v", err) |
| } |
| err = c.IncrementBy(s.idCount) |
| if err != nil { |
| return false, fmt.Errorf("couldn't increment count for PreAggSelectPartition: %v", err) |
| } |
| result, err := c.ThresholdedResult(s.delta / 2) |
| if err != nil { |
| return false, fmt.Errorf("couldn't compute thresholded result for PreAggSelectPartition: %v", err) |
| } |
| return result != nil, nil |
| } |
| prob, err := keepPartitionProbability(s.idCount, s.l0Sensitivity, s.epsilon, s.delta) |
| if err != nil { |
| return false, fmt.Errorf("couldn't compute keepPartitionProbability for PreAggSelectPartition: %v", err) |
| } |
| return rand.Uniform() < prob, nil |
| } |
| |
| // sumExpPowers returns the evaluation of |
| // exp(minPower * ε) + exp((minPower+1) * ε) + ... + exp((numPowers+minPower-1) * ε) |
| // |
| // sumExpPowers requires ε >= 0. sumExpPowers may return +∞, but does not return |
| // NaN. |
| func sumExpPowers(epsilon float64, minPower, numPowers int64) (float64, error) { |
| if err := checks.CheckEpsilon(epsilon); err != nil { |
| return 0, fmt.Errorf("sumExpPowers: %v", err) |
| } |
| if numPowers <= 0 { |
| return 0, fmt.Errorf("sumExpPowers: numPowers (%d) must be > 0", numPowers) |
| } |
| |
| // expEpsilonPow returns exp(a * epsilon). |
| expEpsilonPow := func(a int64) float64 { |
| return math.Exp(float64(a) * epsilon) |
| } |
| |
| if math.IsInf(expEpsilonPow(minPower), 1) { |
| return math.Inf(1), nil |
| } |
| // In the case ε=0, sumExpPowers is simply numPowers. We use exp(-ε) = 1 to |
| // identify this case because our closed form solutions would otherwise |
| // result in division by 0 under finite precision arithmetic. |
| if math.Exp(-epsilon) == 1 { |
| return float64(numPowers), nil |
| } |
| |
| // For the general case, we use a closed form solution to a geometric |
| // series. See https://en.wikipedia.org/wiki/Geometric_series#Sum. |
| // |
| // The typical closed form formula is: |
| // exp(minPower*ε) * (exp(numPowers*ε) - 1) / (exp(ε) - 1) |
| // |
| // In our setting, it is OK to return +∞ but not OK to return NaN. We use the |
| // following mathematically equivalent formulas to avoid returning NaN when |
| // using our finite precision arithmetic: |
| // exp((minPower-1)*ε) * (exp(numPowers*ε) - 1) / (1 - exp(-ε)) (1) |
| // (exp((numPowers+minPower-1)*ε) - exp((minPower-1)*ε)) / (1 - exp(-ε)) (2) |
| // |
| // We use (1) when minPower >= 1. In that case, (exp(numPowers*ε) - 1) is |
| // the only potentially infinite term. The other two terms satisfy |
| // exp((minPower-1) * ε) >= 1 and 0 < (1 - exp(-ε)) <= 1. The |
| // multiplication and division operations involving these other two |
| // terms increase the result. Thus, the result is never NaN, and +∞ is |
| // returned only when necessary. |
| // |
| // We use (2) when minPower < 1. In that case, 0 <= exp((minPower-1)*ε) < 1, |
| // so the numerator |
| // (exp((numPowers+minPower-1)*ε) - exp((minPower-1)*ε)) |
| // is never NaN, and is only +∞ when necessary. The denominator in (2) |
| // satisfies 0 < (1-exp(-ε)) <= 1. The result of (2) is never NaN and |
| // only achieves +∞ when necessary overall. |
| if minPower >= 1 { |
| return expEpsilonPow(minPower-1) * (expEpsilonPow(numPowers) - 1) / (1 - expEpsilonPow(-1)), nil |
| } |
| return (expEpsilonPow(numPowers+minPower-1) - expEpsilonPow(minPower-1)) / (1 - expEpsilonPow(-1)), nil |
| } |
| |
| // keepPartitionProbability calculates the value of keepPartitionProbability |
| // from PreAggSelectPartition's godoc comment. |
| func keepPartitionProbability(idCount, l0Sensitivity int64, epsilon, delta float64) (float64, error) { |
| // Per-partition (ε,δ) privacy loss. |
| pEpsilon := epsilon / float64(l0Sensitivity) |
| pDelta := delta / float64(l0Sensitivity) |
| |
| if idCount == 0 { |
| return 0, nil |
| } |
| |
| // In keepPartitionProbability's recurrence formula (see Theorem 1 in the |
| // [Differentially private partition selection paper]), argument selection in |
| // the min operation has 3 distinct regions: min selects (1) on the lowest |
| // region of the domain, (2) on the second region of the domain, and (3) |
| // (i.e., the value 1) on the highest region of the domain. We denote by nCr |
| // the crossover proint in the domain from (1) to (2). |
| nCr := int64(1 + math.Floor((1/pEpsilon)*math.Log( |
| (1+math.Exp(-pEpsilon)*(2*pDelta-1))/(pDelta*(1+math.Exp(-pEpsilon)))))) |
| |
| if idCount <= nCr { |
| // Closed form solution of keepPartitionProbability(n) on [0, nCr]. |
| sum, err := sumExpPowers(pEpsilon, 0, idCount) |
| if err != nil { |
| return 0, err |
| } |
| return pDelta * sum, nil |
| } |
| |
| sum, err := sumExpPowers(pEpsilon, 0, nCr) |
| if err != nil { |
| return 0, err |
| } |
| selectPartitionPrNCr := pDelta * sum |
| // Compute form solution of keepPartitionProbability(n) on the domain (nCr, ∞). |
| m := idCount - nCr |
| sum, err = sumExpPowers(pEpsilon, -m, m) |
| if err != nil { |
| return 0, err |
| } |
| return math.Min( |
| 1+math.Exp(-float64(m)*pEpsilon)*(selectPartitionPrNCr-1)+sum*pDelta, |
| 1), nil |
| } |
| |
| // GetHardThreshold returns a threshold k, where if there are at least |
| // k privacy units in a partition, we are guaranteed to keep that partition. |
| // |
| // This is the conceptual equivalent of the post-aggregation threshold of the |
| // noise.Noise interface with the difference that here there is 0 probability |
| // of not keeping the partition if it has at least k privacy units, whereas |
| // with the post-aggregation threshold there is a non-zero probability |
| // (however small). |
| func (s *PreAggSelectPartition) GetHardThreshold() (int, error) { |
| for i := int64(1); ; i++ { |
| prob, err := keepPartitionProbability(i, s.l0Sensitivity, s.epsilon, s.delta) |
| if err != nil { |
| return 0, err |
| } |
| if prob >= 1 { // keepPartitionProbability converges to 1. |
| return int(i), nil |
| } |
| } |
| } |
| |
| // encodablePreAggSelectPartition can be encoded by the gob package. |
| type encodablePreAggSelectPartition struct { |
| Epsilon float64 |
| Delta float64 |
| L0Sensitivity int64 |
| IDCount int64 |
| State aggregationState |
| } |
| |
| // GobEncode encodes PreAggSelectPartition. |
| func (s *PreAggSelectPartition) GobEncode() ([]byte, error) { |
| if s.state != defaultState && s.state != serialized { |
| return nil, fmt.Errorf("PreAggSelectPartition object cannot be serialized: " + s.state.errorMessage()) |
| } |
| enc := encodablePreAggSelectPartition{ |
| Epsilon: s.epsilon, |
| Delta: s.delta, |
| L0Sensitivity: s.l0Sensitivity, |
| IDCount: s.idCount, |
| State: s.state, |
| } |
| s.state = serialized |
| return encode(enc) |
| } |
| |
| // GobDecode decodes PreAggSelectPartition. |
| func (s *PreAggSelectPartition) GobDecode(data []byte) error { |
| var enc encodablePreAggSelectPartition |
| err := decode(&enc, data) |
| if err != nil { |
| return fmt.Errorf("couldn't decode PreAggSelectPartition from bytes") |
| } |
| *s = PreAggSelectPartition{ |
| epsilon: enc.Epsilon, |
| delta: enc.Delta, |
| l0Sensitivity: enc.L0Sensitivity, |
| idCount: enc.IDCount, |
| state: enc.State, |
| } |
| return nil |
| } |