blob: 944e237f3005f1c640fa6405913d5c3b5c14ffbe [file] [log] [blame]
//
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package com.google.privacy.differentialprivacy;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Math.abs;
import static java.lang.Math.ceil;
import static java.lang.Math.expm1;
import static java.lang.Math.log;
import static java.lang.Math.log1p;
import static java.lang.Math.max;
import static java.lang.Math.min;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.primitives.Longs;
import com.google.privacy.differentialprivacy.proto.SummaryOuterClass.ApproxBoundsSummary;
import com.google.protobuf.ExtensionRegistry;
import com.google.protobuf.InvalidProtocolBufferException;
import java.security.SecureRandom;
import java.util.Collection;
import java.util.Optional;
import java.util.Random;
/**
* Finds the approximate bounds of a set of numbers.
*
* <p>The algorithm uses two logarithmic histograms, one for positive values and one for negative
* values. Histogram bin counts are thresholded to remove bins with too few counts. The threshold is
* derived from the "failure probability", which is an upper bound on the probability of returning
* bounds that are too loose.
*
* <p>We choose the rightmost bin that exceeds the threshold count and return its greater boundary
* as the approximate upper bound. Similarly the leftmost bin that exceeds the threshold count is
* chosen and its smaller boundary is returned as the approximate lower bound. If the failure
* probability is too low, then it is possible that no bin exceeds the threshold, in which case we
* increase the failure probability and try again. When the failure probability exceeds a fixed
* value, we throw an exception rather than risk returning bounds that are primarily determined by
* the noise. If this happens, try providing a larger sample to ApproximateBounds.
*
* <p>For general details and key definitions, see <a href=
* "https://github.com/google/differential-privacy/blob/main/differential_privacy.md#key-definitions">
* the introduction to Differential Privacy</a>.
*/
public class ApproximateBounds {
@VisibleForTesting final Params params;
/**
* The failure probability is an upper bound on the probability of returning bounds that are too
* loose.
*
* <p>More precisely, it is the probability that we (incorrectly) return bounds for an empty
* dataset due to the noise added. It is used to determine the threshold for a bin to be
* considered non-empty.
*
* <p>Setting the failure probability too low will result in higher thresholds and, consequently,
* in dropping bins with too few entries or failing to find bounds at all. If this happens, we
* repeatedly retry with a larger failure probability until the failure probability reaches {@link
* ApproximateBounds.MAX_FAILURE_PROBABILITY}.
*/
private static final double INITIAL_FAILURE_PROBABILITY = 1e-9;
/** The factor by which the failure probability is increased when bounding fails. */
private static final double FAILURE_PROBABILITY_INCREMENT_FACTOR = 2;
/**
* The maximum acceptable failure probability.
*
* <p>When increasing the failure probability in order to try to find bounds for a small sample,
* if the failure probability increases above this value then we return an error: the bounds we
* would get would too often be influenced by noise rather than the sample.
*/
private static final double MAX_FAILURE_PROBABILITY = 1e-6;
private final long[] positiveBins;
private final long[] negativeBins;
private final double[] rightBinBoundaries;
private final Random random = new SecureRandom();
private AggregationState aggregationState = AggregationState.DEFAULT;
private ApproximateBounds(Params params) {
this.params = params;
positiveBins = new long[params.inputType().numPositiveBins];
negativeBins = new long[params.inputType().numNegativeBins];
// Cache bin boundaries for performance reasons.
rightBinBoundaries = params.inputType().generateRightBinBoundaries();
}
public static Params.Builder builder() {
return Params.Builder.newBuilder();
}
/** Adds a value to the approximate bounds calculation. */
public void addEntry(double value) {
checkState(
aggregationState.equals(AggregationState.DEFAULT), aggregationState.getErrorMessage());
// NaN is ignored because we cannot run logic on NaNs.
if (Double.isNaN(value)) {
return;
}
// If our histogram has no negative bins, clamp negative values to the 0th bin
if (value < 0 && !params.inputType().hasNegativeBins) {
positiveBins[0]++;
return;
}
// If our histogram has negative bins, then they are symmetric with the positive ones.
int binIndex = params.inputType().getPositiveBinNumber(abs(value));
if (value >= 0) {
positiveBins[binIndex]++;
} else {
negativeBins[binIndex]++;
}
}
/** Adds a set of value to the approximate bounds calculation. */
public void addEntries(Collection<Double> e) {
e.forEach(this::addEntry);
}
public Result computeResult() {
checkState(
aggregationState.equals(AggregationState.DEFAULT), aggregationState.getErrorMessage());
aggregationState = AggregationState.RESULT_RETURNED;
double[] noisyPositiveBins = addNoise(positiveBins);
double[] noisyNegativeBins = addNoise(negativeBins);
double currentFailureProbability = INITIAL_FAILURE_PROBABILITY;
while (currentFailureProbability <= MAX_FAILURE_PROBABILITY) {
double perBinFailureProbability =
computePerBinFailureProbability(
currentFailureProbability, params.inputType().totalBins());
double threshold =
LaplaceNoise.computeQuantile(
1 - perBinFailureProbability,
/* x= */ 0,
/* l1Sensitivity= */ params.maxContributions(),
params.epsilon());
Optional<Double> minimum = findMinimum(threshold, noisyNegativeBins, noisyPositiveBins);
Optional<Double> maximum = findMaximum(threshold, noisyNegativeBins, noisyPositiveBins);
if (minimum.isPresent() && maximum.isPresent()) {
return Result.create(minimum.get(), maximum.get());
}
currentFailureProbability = FAILURE_PROBABILITY_INCREMENT_FACTOR * currentFailureProbability;
}
throw new IllegalArgumentException(
"Bin count threshold was too large to find approximate bounds. Either run over a larger "
+ "dataset or decrease success_probability and try again.");
}
/**
* Converts an overall failure probability into the corresponding per-bin failure probability.
*
* <p>If we were dealing with a "success probability" (i.e. 1 - failure probability) and had N
* bins, then the per-bin success probability is the N^th root of the overall success probability.
* Translated into failure probabilities: perBinFP = 1 - pow(1 - FP, 1/N)
*
* <p>Since in general the failure probability is very small, we use expm1 and log1p to calculate
* this:
*
* <pre>{@code
* perBinFP = 1 - exp(log(pow(1 - FP, 1/N)))
* = 1 - exp(log(1 - FP) / N)
* = -expm1(log1p(-FP) / N)
* }</pre>
*/
@VisibleForTesting
static double computePerBinFailureProbability(double overallFailureProbability, int numBins) {
return -expm1(log1p(-overallFailureProbability) / numBins);
}
/**
* Finds the minimum by looking for bins with sufficient amount of entries in the noisy
* histograms.
*/
private Optional<Double> findMinimum(
double threshold, double[] noisyNegativeBins, double[] noisyPositiveBins) {
for (int i = noisyNegativeBins.length - 1; i >= 0; --i) {
if (noisyNegativeBins[i] >= threshold) {
return Optional.of(getNegativeLeftBinBoundary(i));
}
}
for (int i = 0; i < noisyPositiveBins.length; i++) {
if (noisyPositiveBins[i] >= threshold) {
return Optional.of(getPositiveLeftBinBoundary(i));
}
}
return Optional.empty();
}
/**
* Finds the maximum by looking for bins with sufficient amount of entries in the noisy
* histograms.
*/
private Optional<Double> findMaximum(
double threshold, double[] noisyNegativeBins, double[] noisyPositiveBins) {
for (int i = noisyPositiveBins.length - 1; i >= 0; --i) {
if (noisyPositiveBins[i] >= threshold) {
return Optional.of(getPositiveRightBinBoundary(i));
}
}
for (int i = 0; i < noisyNegativeBins.length; i++) {
if (noisyNegativeBins[i] >= threshold) {
return Optional.of(getNegativeRightBinBoundary(i));
}
}
return Optional.empty();
}
private double getNegativeLeftBinBoundary(int bin) {
return -1.0 * getPositiveRightBinBoundary(bin);
}
private double getNegativeRightBinBoundary(int bin) {
return -getPositiveLeftBinBoundary(bin);
}
private double getPositiveLeftBinBoundary(int bin) {
if (bin == 0) {
return 0;
}
return getPositiveRightBinBoundary(bin - 1);
}
private double getPositiveRightBinBoundary(int bin) {
return rightBinBoundaries[bin];
}
private double[] addNoise(long[] bins) {
double[] noisyBins = new double[bins.length];
for (int i = 0; i < bins.length; i++) {
// Since we never release the raw noised values from this class, we don't need to defend
// against attacks against the least significant bits of the noise (see "On Significance of
// the Least Significant Bits For Differential Privacy" by Ilya Mironov,
// https://www.microsoft.com/en-us/research/wp-content/uploads/2012/10/lsbs.pdf). Therefore we
// can use a faster randomness generation method.
noisyBins[i] =
LaplaceNoise.computeQuantile(
/* rank= */ random.nextDouble(),
/* x= */ (double) bins[i],
/* l1Sensitivity= */ params.maxContributions(),
params.epsilon());
}
return noisyBins;
}
/**
* Returns a serializable summary of the current state of this {@link ApproximateBounds} instance.
* The summary can be used to merge this instance with another instance of {@link
* ApproximateBounds}.
*
* <p>This method cannot be invoked if the result has already been computed using {@link
* computeResult()}. Moreover, after this instance of {@link ApproximateBounds} has been
* serialized once, further modification and queries are not possible anymore.
*
* @throws IllegalStateException if this instance of {@link ApproximateBounds} has already been
* queried.
*/
public byte[] getSerializableSummary() {
checkState(
aggregationState != AggregationState.RESULT_RETURNED, aggregationState.getErrorMessage());
aggregationState = AggregationState.SERIALIZED;
return ApproxBoundsSummary.newBuilder()
.addAllPosBinCount(Longs.asList(positiveBins))
.addAllNegBinCount(Longs.asList(negativeBins))
.build()
.toByteArray();
}
/**
* Merges the output of {@link #getSerializableSummary()} from a different instance of {@link
* ApproximateBounds} with this instance. Intended to be used in the context of distributed
* computation.
*
* @throws IllegalArgumentException if the passed serialized summary is invalid or has a different
* number of bins to this instance.
* @throws IllegalStateException if this instance of {@link ApproximateBounds} has already been
* queried or serialized.
*/
public void mergeWith(byte[] otherSummary) {
checkState(
aggregationState.equals(AggregationState.DEFAULT), aggregationState.getErrorMessage());
ApproxBoundsSummary otherSummaryParsed;
try {
otherSummaryParsed =
ApproxBoundsSummary.parseFrom(otherSummary, ExtensionRegistry.newInstance());
} catch (InvalidProtocolBufferException e) {
throw new IllegalArgumentException(e);
}
checkArgument(
otherSummaryParsed.getPosBinCountCount() == positiveBins.length,
"Both histograms must have the same number of positive bins (merge target has %s, "
+ "merge source has %s).",
positiveBins.length,
otherSummaryParsed.getPosBinCountCount());
checkArgument(
otherSummaryParsed.getNegBinCountCount() == negativeBins.length,
"Both histograms must have the same number of negative bins (merge target has %s, "
+ "merge source has %s).",
negativeBins.length,
otherSummaryParsed.getNegBinCountCount());
for (int i = 0; i < otherSummaryParsed.getPosBinCountCount(); i++) {
positiveBins[i] += otherSummaryParsed.getPosBinCount(i);
}
for (int i = 0; i < otherSummaryParsed.getNegBinCountCount(); i++) {
negativeBins[i] += otherSummaryParsed.getNegBinCount(i);
}
}
/**
* Parameters for calculation of differentially-private approximate bounds.
*
* <p>See the {@link Builder} for the explanation of each parameter.
*/
@AutoValue
public abstract static class Params {
abstract double epsilon();
abstract InputType inputType();
abstract Integer maxContributions();
public abstract Builder toBuilder();
@AutoValue.Builder
public abstract static class Builder {
// Approximate bounds supports only Laplace noise due to the way sensitivities
// are calculated in the internal logarithmic histogram. Therefore, the builder doesn't
// accept noise or delta parameters.
/** Epsilon to be used for the approximate bounds calculation. */
public abstract Builder epsilon(double value);
/**
* Type of the inputs to {@link ApproximateBounds}.
*
* <p>This parameter is used to optimize the bounding algorithm based on expected properties
* of the input values, e.g., the values have the range and granularity of a certain numerical
* type such as double, long, etc. For example, when the input type is {@code
* InputType.DOUBLE}, we include both positive and negative bins in the histogram, as well as
* bins between 0 and 1. Note that all values passed to {@link ApproximateBounds} are doubles;
* this parameter only specifies how the algorithm processes the values.
*/
public abstract Builder inputType(InputType inputType);
/** The maximum number of contributions each privacy unit can make to the dataset. */
public abstract Builder maxContributions(Integer value);
abstract Params autoBuild();
private static Builder newBuilder() {
Params.Builder builder = new AutoValue_ApproximateBounds_Params.Builder();
builder.inputType(InputType.DOUBLE);
return builder;
}
public ApproximateBounds build() {
Params params = autoBuild();
DpPreconditions.checkEpsilon(params.epsilon());
return new ApproximateBounds(params);
}
}
/**
* The type of input that this algorithm will be provided with.
*
* <p>This determines the size and number of bins, as well as whether we should include negative
* bins and bins between 0 and 1.
*
* <p>In general, bin i contains the number of inputs that lie in the range (scale * base^(i-1),
* scale * base^i], where scale and base are determined by the InputType. The exception is
* positive bin 0, which has boundaries [0, scale * base^0]. Negative bin 0 does not contain 0.
* Values outside of the range covered by bins are clipped to the closest bin boundary.
*
* <p>For example, if scale = 1, base = 2, and num_bins = 4 then the positive histogram bins are
* [0, 1], (1, 2], (2, 4], and (4, 8].
*/
public enum InputType {
// For doubles, we include both positive and negative bins, with successive bins doubling in
// size. These parameters result in a rightmost bin boundary equal to 2^1023. Since
// Double.MAX_VALUE is slightly less than 2^1024, values close to ±Double.MAX_VALUE will get
// clamped. Sample bin boundaries:
// ..., [-2, -1), [-1, -0.5), ..., (0.5, 1], (1, 2], ...
DOUBLE(2046, /*hasNegativeBins=*/ true, Double.MIN_NORMAL),
// For integers, the bins are the same as for doubles except we don't subdivide the bins
// [-1, 0) and [0, 1]. These parameters result in a rightmost bin boundary equal to 2^31.
// Note that this is greater than Integer.MAX_VALUE, so Integer.MAX_VALUE lies in the
// interior of the rightmost bin. Sample bin boundaries:
// ..., [-2, -1), [-1, 0), [0, 1], (1, 2], ...
INTEGER(32, /*hasNegativeBins=*/ true, 1),
// For positive integers, the bins are the same as for integers except we don't include
// negative bins. Sample bin boundaries:
// [0, 1], (1, 2], ...
POSITIVE_INTEGER(32, /*hasNegativeBins=*/ false, 1),
// In tests, we use a reduced number of bins for performance reasons and to keep assertions
// simple. We only use 10 bins, with successive bins doubling in size. Sample bin boundaries:
// [-16, -8), [-8, -4) ..., [0, 1], (1, 2], ..., (8, 16]
TEST(5, /*hasNegativeBins=*/ true, 1);
@VisibleForTesting public final int numPositiveBins;
@VisibleForTesting public final int numNegativeBins;
@VisibleForTesting public final boolean hasNegativeBins;
@VisibleForTesting public final double scale;
@VisibleForTesting public final double base = 2.0;
private InputType(int numPositiveBins, boolean hasNegativeBins, double scale) {
this.numPositiveBins = numPositiveBins;
this.numNegativeBins = hasNegativeBins ? numPositiveBins : 0;
this.hasNegativeBins = hasNegativeBins;
this.scale = scale;
}
int totalBins() {
return this.numPositiveBins + this.numNegativeBins;
}
double[] generateRightBinBoundaries() {
double[] rightBinBoundaries = new double[numPositiveBins];
double current = scale;
for (int i = 0; i < numPositiveBins; i++) {
rightBinBoundaries[i] = current;
// The above values for numPositiveBins are chosen so that they do not cause an overflow
// here.
current *= base;
}
return rightBinBoundaries;
}
/** Returns the bin number where the provided positive value should be placed. */
int getPositiveBinNumber(double value) {
if (value == 0) {
// Zero is special since log(0) is undefined
return 0;
}
if (value < 0) {
throw new IllegalArgumentException("Expected a positive input but got " + value);
}
// Calculate the most significant bit and clamp to a valid bin index.
// x satisfies scale * base^x = abs(value)
double x = (log(abs(value)) - log(scale)) / log(base);
int msb = (int) ceil(x);
return max(0, min(msb, numPositiveBins - 1));
}
}
}
/** A result of approximate bounds calculation. */
@AutoValue
public abstract static class Result {
/** Lower approximate bound. */
public abstract double lowerBound();
/** Upper approximate bound. */
public abstract double upperBound();
public static Result create(double lower, double upper) {
return new AutoValue_ApproximateBounds_Result(lower, upper);
}
}
}