blob: 640275b7992d7c5f6b2d6394f1e4703afbc151d9 [file] [log] [blame]
//
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package com.google.privacy.differentialprivacy;
import static java.lang.Math.max;
import static java.lang.Math.min;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.privacy.differentialprivacy.proto.SummaryOuterClass.BoundedMeanSummary;
import com.google.privacy.differentialprivacy.proto.SummaryOuterClass.BoundedSumSummary;
import com.google.privacy.differentialprivacy.proto.SummaryOuterClass.CountSummary;
import com.google.protobuf.InvalidProtocolBufferException;
import java.util.Collection;
import javax.annotation.Nullable;
/**
* Calculates differentially private average for a collection of values.
*
* <p>The mean is computed by dividing a noisy sum of the entries by a noisy count of the entries.
* To improve utility, all entries are normalized by setting them to the difference between their
* actual value and the middle of the input range before summation. The original mean is recovered
* by adding the midpoint in a post processing step. This idea is taken from Algorithm 2.4 of
* "Differential Privacy: From Theory to Practice", by Ninghui Li, Min Lyu, Dong Su and Weining Yang
* (section 2.5.5, page 28). In contrast to Algorithm 2.4, we do not return the midpoint if the
* noisy count is less or equal to 1. Instead we set the noisy count to 1. Since this is a mere post
* processing step, the DP bounds are preserved. Moreover, for small numbers of entries, this
* approach will return results that are closer to the actual mean in expectation.
*
* <p>Ninghui Li, Min Lyu, Dong Su and Weining Yang also propose Algorithm 2.3 for computing private
* means, which according to them yields better accuracy. However, the proof of the Algorithm 2.3 is
* flawed and it is not actually DP.
*
* <p>Supports contributions from a single privacy unit to multiple partitions as well as multiple
* contributions from a single privacy unit to a given partition.
*
* <p>The user can provide a {@link Noise} instance which will be used to generate the noise. If no
* instance is specified, {@link LaplaceNoise} is applied.
*
* <p>Note: the class is not thread-safe.
*
* <p>For more implementation details, see {@link #computeResult()}.
*
* <p>For general details and key definitions, see <a href=
* "https://github.com/google/differential-privacy/blob/main/differential_privacy.md#key-definitions">
* this</a> introduction to Differential Privacy.
*/
public class BoundedMean {
private final BoundedMean.Params params;
private final BoundedSum normalizedSum;
private final Count count;
/**
* The midpoint between lower and upper bounds. It cannot be set by the user: it will be
* calculated based on the {@link Params#lower()} and {@link Params#upper()} values.
*/
private final double midpoint;
private AggregationState state = AggregationState.DEFAULT;
private BoundedMean(BoundedMean.Params params) {
this.params = params;
// Note: we don't calculate the midpoint as "(lower + upper) / 2" to avoid overflow.
midpoint = params.lower() * 0.5 + params.upper() * 0.5;
double maxDistFromMidpoint = Math.abs(params.upper() - midpoint);
// We split the budget in half to calculate count and noised normalized sum
double halfEpsilon = params.epsilon() * 0.5;
Double halfDelta = params.delta() == null ? null : params.delta() * 0.5;
// normalizedSum yields a differentially private sum of the position of the entries e_i relative
// to the midpoint m = (lower + upper) / 2 of the range of the bounded mean, i.e., Σ_i (e_i - m)
//
// count yields a differentially private count of the entries.
//
// Given a normalized sum s and count c (both without noise), the true mean can be computed
// as: mean =
// s / c + m =
// (Σ_i (e_i - m)) / c + m =
// (Σ_i (e_i - m)) / c + (Σ_i m) / c =
// (Σ_i e_i) / c
//
// the rest follows from the code.
normalizedSum =
BoundedSum.builder()
.noise(params.noise())
.epsilon(halfEpsilon)
// TODO: this can be optimized for Gaussian noise
.delta(halfDelta)
.maxPartitionsContributed(params.maxPartitionsContributed())
.maxContributionsPerPartition(params.maxContributionsPerPartition())
.lower(-maxDistFromMidpoint)
.upper(maxDistFromMidpoint)
.build();
// Noised count of the entities.
count =
Count.builder()
.noise(params.noise())
.epsilon(halfEpsilon)
// TODO: this can be optimized for Gaussian noise
.delta(halfDelta)
.maxPartitionsContributed(params.maxPartitionsContributed())
.maxContributionsPerPartition(params.maxContributionsPerPartition())
.build();
}
public static BoundedMean.Params.Builder builder() {
return BoundedMean.Params.Builder.newBuilder();
}
/**
* Clamps the input value and adds it to the mean.
*
* @throws IllegalStateException if this this instance of {@link BoundedMean} has already been
* queried or serialized.
*/
public void addEntry(double e) {
Preconditions.checkState(state.equals(AggregationState.DEFAULT), "Entry cannot be added.");
// NaN is ignored because introducing even a single NaN entry will result in a NaN mean
// regardless of other entries, which would break the indistinguishability
// property required for differential privacy.
if (Double.isNaN(e)) {
return;
}
// BoundedSum will also attempt to clamp the input value but we do it here for transparency.
normalizedSum.addEntry(clamp(e) - midpoint);
count.increment();
}
/**
* Clamps the input values and adds them to the mean.
*
* @throws IllegalStateException if this this instance of {@link BoundedMean} has already been
* queried or serialized.
*/
public void addEntries(Collection<Double> e) {
e.forEach(this::addEntry);
}
private double clamp(double value) {
return max(min(value, params.upper()), params.lower());
}
/**
* Calculates and returns differentially private average of elements added using {@link #addEntry}
* and {@link #addEntries}. The method can be called only once for a given collection of elements.
* All subsequent calls will result in throwing an exception.
*
* <p>Note that the returned value is not an unbiased estimate of the raw bounded mean.
*
* @throws IllegalStateException if this instance of {@link BoundedMean} has already been queried
* or serialized.
*/
public double computeResult() {
Preconditions.checkState(state.equals(AggregationState.DEFAULT), "DP mean cannot be computed.");
state = AggregationState.RESULT_RETURNED;
long noisedCount = max(1, count.computeResult());
double normalizedNoisedSum = normalizedSum.computeResult();
// Clamp the average before returning it to ensure it does not exceed the lower and upper
// bounds.
return clamp(normalizedNoisedSum / noisedCount + midpoint);
}
/**
* Computes a confidence interval that contains the raw bounded mean with a probability greater or
* equal to {@code 1 - alpha}. The interval is exclusively based on the noised bounded mean
* returned by {@link #computeResult}. Thus, no privacy budget is consumed by this operation.
*
* <p>Refer to <a
* href="https://github.com/google/differential-privacy/tree/main/common_docs/confidence_intervals.md">this</a> doc for
* more information.
*
* @throws IllegalStateException if this this instance of {@link BoundedMean} has not been queried
* yet.
*/
public ConfidenceInterval computeConfidenceInterval(double alpha) {
Preconditions.checkState(
state.equals(AggregationState.RESULT_RETURNED), "Confidence interval cannot be computed.");
// The confidence interval [low, up] of bounded mean is derived from confidence intervals
// [lowNum, upNum] and [lowDen, upDen] of the mean's numerator and denominator, such that
// Pr(low < raw < up) >= Pr(lowNum < rawNum < upNum & lowDen < rawDen < upDen).
//
// See {@link #computeConfidenceInterval(double alpha, double alphaNum)} for details of how to
// set [low, up] based on lowNum, upNum, lowDen and upDen.
//
// Because the confidence intervals of the numerator and denominator are independent, we can
// lower bound the confidence level of the mean in terms of alphaNum and alphaDen like this:
// Pr(low < raw < up) >= Pr(lowNum < rawNum < upNum & lowDen < rawDen < upDen)
// = Pr(lowNum < rawNum < upNum) * Pr(lowDen < rawDen < upDen)
// >= (1 - alphaNum) * (1 - alphaDen)
//
// This means that we can choose alphaNum and alphaDen arbitrarily as long as
// (1 - alphaNum) * (1 alphaDen) = 1 - alpha.
//
// This implementation uses a brute force search for alphaNum that minimizes the size of the
// confidence interval of bounded mean.
double minSize = Double.POSITIVE_INFINITY;
ConfidenceInterval tightestConfInt = null;
for (int i = 1; i < 1000; i++) {
double alphaNum = (i / 1000.0) * alpha;
ConfidenceInterval confInt = computeConfidenceInterval(alpha, alphaNum);
double size = confInt.upperBound() - confInt.lowerBound();
if (size < minSize) {
minSize = size;
tightestConfInt = confInt;
}
}
return tightestConfInt;
}
/**
* Like {@link #computeConfidenceInterval(double)} with the additional constraint that the
* confidence level of the mean's numerator is {@code 1 - alphaNum}.
*/
@VisibleForTesting
ConfidenceInterval computeConfidenceInterval(double alpha, double alphaNum) {
// Setting alphaDen such that (1 - alpha) = (1 - alphaNum) * (1 - alphaDen).
double alphaDen = (alpha - alphaNum) / (1 - alphaNum);
ConfidenceInterval confIntNum = normalizedSum.computeConfidenceInterval(alphaNum);
ConfidenceInterval confIntDen = count.computeConfidenceInterval(alphaDen);
// Ensuring that the lower and upper bounds of the denominator are consistent with how
// computeResult() processes the denominator.
confIntDen =
ConfidenceInterval.create(
max(1.0, confIntDen.lowerBound()), max(1.0, confIntDen.upperBound()));
double meanLowerBound;
double meanUpperBound;
if (confIntNum.lowerBound() >= 0.0) {
meanLowerBound = confIntNum.lowerBound() / confIntDen.upperBound();
} else {
meanLowerBound = confIntNum.lowerBound() / confIntDen.lowerBound();
}
if (confIntNum.upperBound() >= 0.0) {
meanUpperBound = confIntNum.upperBound() / confIntDen.lowerBound();
} else {
meanUpperBound = confIntNum.upperBound() / confIntDen.upperBound();
}
// Ensuring that the lower and upper bounds of the mean are consistent with how computeResult()
// processes the mean.
meanLowerBound = clamp(meanLowerBound + midpoint);
meanUpperBound = clamp(meanUpperBound + midpoint);
return ConfidenceInterval.create(meanLowerBound, meanUpperBound);
}
/**
* Returns a serializable summary of the current state of this {@link BoundedMean} instance and
* its parameters. The summary can be used to merge this instance with another instance of {@link
* BoundedMean}.
*
* <p>This method cannot be invoked if the mean has already been queried, i.e., {@link
* computeResult()} has been called. Moreover, after this instance of {@link BoundedMean} has been
* serialized once, further modification and queries are not possible anymore.
*
* @throws IllegalStateException if this instance of {@link BoundedMean} has already been queried.
*/
public byte[] getSerializableSummary() {
Preconditions.checkState(
state != AggregationState.RESULT_RETURNED, "Mean cannot be serialized.");
state = AggregationState.SERIALIZED;
CountSummary deserializedCount;
BoundedSumSummary deserializedNormalizedSum;
try {
deserializedCount = CountSummary.parseFrom(count.getSerializableSummary());
deserializedNormalizedSum =
BoundedSumSummary.parseFrom(normalizedSum.getSerializableSummary());
} catch (InvalidProtocolBufferException e) {
throw new IllegalStateException("Mean object cannot be serialized. Reason: " + e);
}
BoundedMeanSummary serializedMean =
BoundedMeanSummary.newBuilder()
.setCountSummary(deserializedCount)
.setSumSummary(deserializedNormalizedSum)
.build();
return serializedMean.toByteArray();
}
/**
* Merges the output of {@link #getSerializableSummary()} from a different instance of {@link
* BoundedMean} with this instance. Intended to be used in the context of distributed computation.
*
* @throws IllegalArgumentException if the parameters of the two instances (epsilon, delta,
* contribution bounds, etc.) do not match or if the passed serialized summary is invalid.
* @throws IllegalStateException if this this instance of {@link BoundedMean} has already been
* queried or serialized.
*/
public void mergeWith(byte[] otherBoundedMeanSummary) {
Preconditions.checkState(state.equals(AggregationState.DEFAULT), "Means cannot be merged.");
BoundedMeanSummary otherSummaryParsed;
try {
otherSummaryParsed = BoundedMeanSummary.parseFrom(otherBoundedMeanSummary);
} catch (InvalidProtocolBufferException pbe) {
throw new IllegalArgumentException(pbe);
}
this.normalizedSum.mergeWith(otherSummaryParsed.getSumSummary().toByteArray());
this.count.mergeWith(otherSummaryParsed.getCountSummary().toByteArray());
}
@AutoValue
public abstract static class Params {
abstract Noise noise();
abstract double epsilon();
@Nullable
abstract Double delta();
abstract int maxPartitionsContributed();
abstract int maxContributionsPerPartition();
abstract double lower();
abstract double upper();
@AutoValue.Builder
public abstract static class Builder {
private static BoundedMean.Params.Builder newBuilder() {
BoundedMean.Params.Builder builder = new AutoValue_BoundedMean_Params.Builder();
// Provides LaplaceNoise as a default noise generator.
builder.noise(new LaplaceNoise());
return builder;
}
/** Epsilon DP parameter. */
public abstract BoundedMean.Params.Builder epsilon(double value);
/**
* Delta DP parameter.
*
* <p>Note that Laplace noise does not use delta. Hence, delta should not be set when Laplace
* noise is used.
*/
public abstract BoundedMean.Params.Builder delta(@Nullable Double value);
/**
* Maximum number of partitions that a single privacy unit (e.g., an individual) is allowed to
* contribute to.
*/
public abstract BoundedMean.Params.Builder maxPartitionsContributed(int value);
/** Max contributions per partition from a single privacy unit (e.g., an individual). */
public abstract BoundedMean.Params.Builder maxContributionsPerPartition(int value);
/** Noise that will be used to make the mean differentially private. */
public abstract BoundedMean.Params.Builder noise(Noise value);
/**
* Lower bound for the entries added to the mean. Any entires smaller than this value will be
* set to this value.
*/
public abstract BoundedMean.Params.Builder lower(double value);
/**
* Upper bound for the entries added to the mean. Any entires greater than this value will be
* set to this value.
*/
public abstract BoundedMean.Params.Builder upper(double value);
abstract BoundedMean.Params autoBuild();
public BoundedMean build() {
BoundedMean.Params params = autoBuild();
// No need to check noise nullability: the noise is defaulted to Laplace noise.
DpPreconditions.checkEpsilon(params.epsilon());
DpPreconditions.checkNoiseDelta(params.delta(), params.noise());
DpPreconditions.checkMaxPartitionsContributed(params.maxPartitionsContributed());
DpPreconditions.checkMaxContributionsPerPartition(params.maxContributionsPerPartition());
DpPreconditions.checkBounds(params.lower(), params.upper());
DpPreconditions.checkBoundsNotEqual(params.lower(), params.upper());
return new BoundedMean(params);
}
}
}
}