python/dp_auditorium/dp_auditorium/testers/histogram_tester.py - third_party/github.com/google/differential-privacy - Git at Google

 # Copyright 2024 Google LLC.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Approximate DP tester from Gilbert and McMillan (2018).

 The tester in this file corresponds to algorithm 2 in
 https://arxiv.org/pdf/1806.06427.pdf. Parameter names follow naming in the
 original algorithm.
 """

 import collections
 import functools

 from etils.array_types import IntArray
 import numpy as np
 from typing_extensions import override

 from dp_auditorium import interfaces
 from dp_auditorium.configs import privacy_property
 from dp_auditorium.configs import property_tester_config
 from dp_auditorium.testers import property_tester_utils


 def _estimate_discrete_distribution(
     samples: IntArray,
     universe_size: int,
 ) -> np.ndarray:
   """Returns estimated probability mass function over universe using samples.

   This function estimates a probability mass function over a finite universe
   using samples from an unknown underlying distribution. It assumes the sample
   values are in the range [0, universe_size-1].

   Args:
     samples: array of samples from the underlying distribution.
     universe_size: size of the universe.

   Raises:
     ValueError: If the `samples` array is empty or if `samples` contains
     elements outside [0,`universe_size`-1].
   """
   if samples.size == 0:
     raise ValueError("Cannot estimate probability mass from empty array.")
   # Discrete mechanisms return values in {0,...,universe_size-1}. Since
   # we enforce type int for the `samples` array checking for the maximum and the
   # minimum enforces that the histogram is created in the right domain.
   if np.amax(samples) >= universe_size or np.amin(samples) < 0:
     raise ValueError(
         "The samples from the mechanism have a different range than"
         f" {{0,..., {universe_size-1}}}."
     )
   counts = collections.Counter(samples)
   return np.array([counts[i] for i in range(universe_size)]) / samples.shape[0]


 def _estimate_continuous_distribution(
     samples: np.ndarray, universe_size: int, min_value: float, max_value: float
 ) -> np.ndarray:
   """Estimates a discretized probability mass function from samples.

   Assuming a continuous random variable taking values over an interval
   `[min_value, max_value]`, this function estimates the continuous density
   function by discretizing the interval `[min_value, max_value]` and computing
   a normalized histogram from samples of the underlying distribution.

   Args:
     samples: array of samples from the underlying distribution distribution.
     universe_size: size of the discretized output space.
     min_value: minimum value of the random value.
     max_value: maximum value of the random value.

   Returns:
     Array where each entry determines the estimated probability of the
     variable taking values on the corresponding bin.

   Raises:
     ValueError: If the `samples` array is empty.
   """
   clipped_samples = np.clip(samples, min_value, max_value)
   if samples.size == 0:
     raise ValueError("Cannot estimate probability mass from empty array.")
   counts, _ = np.histogram(
       clipped_samples,
       bins=universe_size,
       range=(min_value, max_value),
   )
   return counts / samples.shape[0]


 class HistogramTester(interfaces.PropertyTester):
   """Privacy tester introduced in https://arxiv.org/pdf/1806.06427.pdf.

   Implements the approximate differential privacy (DP) tester corresponding to
   Algorithm 2 in https://arxiv.org/pdf/1806.06427.pdf. The algorithm is given
   samples from an (epsilon, delta)-approximate DP mechanism on neighboring
   datasets and uses them to estimate the parameter `delta`. The algorithm is
   only defined for finite output mechanisms, so the implementation below places
   continuous-valued samples into discrete buckets. The error bound for the
   estimate of `delta` is a slightly improved version of the bound from the
   original paper, as it does not assume that the number of samples is Poisson
   distributed, and it holds for any given probability, rather than only with
   probability 2/3.
   """

   def __init__(
       self, config: property_tester_config.HistogramPropertyTesterConfig
   ):
     """Initializes an instance of a Histogram property tester.

     Args:
       config: Configuration for Histogram tester.
     """
     property_tester_utils.validate_approximate_dp_property(
         config.approximate_dp
     )

     # Set function to estimate a probability mass function.
     if config.test_discrete_mechanism:
       self._estimate_distribution = functools.partial(
           _estimate_discrete_distribution,
           universe_size=config.histogram_size,
       )
     else:
       self._estimate_distribution = functools.partial(
           _estimate_continuous_distribution,
           universe_size=config.histogram_size,
           min_value=config.min_value,
           max_value=config.max_value,
       )
     self._epsilon = config.approximate_dp.epsilon
     self._delta = config.approximate_dp.delta
     self._use_original_tester = config.use_original_tester
     self._histogram_size = config.histogram_size
     self._approximate_dp = config.approximate_dp

   @property
   def privacy_property(self) -> privacy_property.PrivacyProperty:
     """The privacy guarantee that the tester is being used to test for."""
     return privacy_property.PrivacyProperty(approximate_dp=self._approximate_dp)

   def _get_error_tolerance(
       self,
       num_samples: float,
       probabilities1: np.ndarray,
       probabilities2: np.ndarray,
       failure_probability: float
   ) -> float:
     """Gets error tolerance for Histogram property tester."""
     if self._use_original_tester:
       term_1 = (
           2.0
           * (1.0 + np.exp(self._epsilon))
           * np.sqrt(self._histogram_size / num_samples)
       )
     else:
       term_1a = (
           2.0 / np.sqrt(num_samples)
           * sum(np.sqrt(probabilities1))
       )
       term_1b = (
           2.0 * np.exp(self._epsilon) / np.sqrt(num_samples)
           * sum(np.sqrt(probabilities2))
       )
       term_1 = term_1a + term_1b
     term_2 = (
         6.0
         * (1.0 + np.exp(self._epsilon))
         * np.sqrt(np.log(4.0 / failure_probability) / (2.0 * num_samples))
     )
     return term_1 + term_2

   @override
   def estimate_lower_bound(
       self,
       samples1: np.ndarray,
       samples2: np.ndarray,
       failure_probability: float,
   ) -> float:
     """Estimates delta in approximate DP guarantee."""
     num_samples = min(samples1.shape[0], samples2.shape[0])
     probabilities1 = self._estimate_distribution(samples1)
     probabilities2 = self._estimate_distribution(samples2)
     per_outcome_delta = probabilities1 - np.exp(self._epsilon) * probabilities2
     estimated_delta = np.sum(per_outcome_delta[per_outcome_delta > 0])
     error_tolerance = self._get_error_tolerance(
         num_samples, probabilities1, probabilities2, failure_probability
     )
     return estimated_delta - error_tolerance

   @override
   def reject_property(self, lower_bound: float) -> bool:
     return lower_bound >= self._delta
	# Copyright 2024 Google LLC.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Approximate DP tester from Gilbert and McMillan (2018).

	The tester in this file corresponds to algorithm 2 in
	https://arxiv.org/pdf/1806.06427.pdf. Parameter names follow naming in the
	original algorithm.
	"""

	import collections
	import functools

	from etils.array_types import IntArray
	import numpy as np
	from typing_extensions import override

	from dp_auditorium import interfaces
	from dp_auditorium.configs import privacy_property
	from dp_auditorium.configs import property_tester_config
	from dp_auditorium.testers import property_tester_utils


	def _estimate_discrete_distribution(
	samples: IntArray,
	universe_size: int,
	) -> np.ndarray:
	"""Returns estimated probability mass function over universe using samples.

	This function estimates a probability mass function over a finite universe
	using samples from an unknown underlying distribution. It assumes the sample
	values are in the range [0, universe_size-1].

	Args:
	samples: array of samples from the underlying distribution.
	universe_size: size of the universe.

	Raises:
	ValueError: If the `samples` array is empty or if `samples` contains
	elements outside [0,`universe_size`-1].
	"""
	if samples.size == 0:
	raise ValueError("Cannot estimate probability mass from empty array.")
	# Discrete mechanisms return values in {0,...,universe_size-1}. Since
	# we enforce type int for the `samples` array checking for the maximum and the
	# minimum enforces that the histogram is created in the right domain.
	if np.amax(samples) >= universe_size or np.amin(samples) < 0:
	raise ValueError(
	"The samples from the mechanism have a different range than"
	f" {{0,..., {universe_size-1}}}."
	)
	counts = collections.Counter(samples)
	return np.array([counts[i] for i in range(universe_size)]) / samples.shape[0]


	def _estimate_continuous_distribution(
	samples: np.ndarray, universe_size: int, min_value: float, max_value: float
	) -> np.ndarray:
	"""Estimates a discretized probability mass function from samples.

	Assuming a continuous random variable taking values over an interval
	`[min_value, max_value]`, this function estimates the continuous density
	function by discretizing the interval `[min_value, max_value]` and computing
	a normalized histogram from samples of the underlying distribution.

	Args:
	samples: array of samples from the underlying distribution distribution.
	universe_size: size of the discretized output space.
	min_value: minimum value of the random value.
	max_value: maximum value of the random value.

	Returns:
	Array where each entry determines the estimated probability of the
	variable taking values on the corresponding bin.

	Raises:
	ValueError: If the `samples` array is empty.
	"""
	clipped_samples = np.clip(samples, min_value, max_value)
	if samples.size == 0:
	raise ValueError("Cannot estimate probability mass from empty array.")
	counts, _ = np.histogram(
	clipped_samples,
	bins=universe_size,
	range=(min_value, max_value),
	)
	return counts / samples.shape[0]


	class HistogramTester(interfaces.PropertyTester):
	"""Privacy tester introduced in https://arxiv.org/pdf/1806.06427.pdf.

	Implements the approximate differential privacy (DP) tester corresponding to
	Algorithm 2 in https://arxiv.org/pdf/1806.06427.pdf. The algorithm is given
	samples from an (epsilon, delta)-approximate DP mechanism on neighboring
	datasets and uses them to estimate the parameter `delta`. The algorithm is
	only defined for finite output mechanisms, so the implementation below places
	continuous-valued samples into discrete buckets. The error bound for the
	estimate of `delta` is a slightly improved version of the bound from the
	original paper, as it does not assume that the number of samples is Poisson
	distributed, and it holds for any given probability, rather than only with
	probability 2/3.
	"""

	def __init__(
	self, config: property_tester_config.HistogramPropertyTesterConfig
	):
	"""Initializes an instance of a Histogram property tester.

	Args:
	config: Configuration for Histogram tester.
	"""
	property_tester_utils.validate_approximate_dp_property(
	config.approximate_dp
	)

	# Set function to estimate a probability mass function.
	if config.test_discrete_mechanism:
	self._estimate_distribution = functools.partial(
	_estimate_discrete_distribution,
	universe_size=config.histogram_size,
	)
	else:
	self._estimate_distribution = functools.partial(
	_estimate_continuous_distribution,
	universe_size=config.histogram_size,
	min_value=config.min_value,
	max_value=config.max_value,
	)
	self._epsilon = config.approximate_dp.epsilon
	self._delta = config.approximate_dp.delta
	self._use_original_tester = config.use_original_tester
	self._histogram_size = config.histogram_size
	self._approximate_dp = config.approximate_dp

	@property
	def privacy_property(self) -> privacy_property.PrivacyProperty:
	"""The privacy guarantee that the tester is being used to test for."""
	return privacy_property.PrivacyProperty(approximate_dp=self._approximate_dp)

	def _get_error_tolerance(
	self,
	num_samples: float,
	probabilities1: np.ndarray,
	probabilities2: np.ndarray,
	failure_probability: float
	) -> float:
	"""Gets error tolerance for Histogram property tester."""
	if self._use_original_tester:
	term_1 = (
	2.0
	* (1.0 + np.exp(self._epsilon))
	* np.sqrt(self._histogram_size / num_samples)
	)
	else:
	term_1a = (
	2.0 / np.sqrt(num_samples)
	* sum(np.sqrt(probabilities1))
	)
	term_1b = (
	2.0 * np.exp(self._epsilon) / np.sqrt(num_samples)
	* sum(np.sqrt(probabilities2))
	)
	term_1 = term_1a + term_1b
	term_2 = (
	6.0
	* (1.0 + np.exp(self._epsilon))
	* np.sqrt(np.log(4.0 / failure_probability) / (2.0 * num_samples))
	)
	return term_1 + term_2

	@override
	def estimate_lower_bound(
	self,
	samples1: np.ndarray,
	samples2: np.ndarray,
	failure_probability: float,
	) -> float:
	"""Estimates delta in approximate DP guarantee."""
	num_samples = min(samples1.shape[0], samples2.shape[0])
	probabilities1 = self._estimate_distribution(samples1)
	probabilities2 = self._estimate_distribution(samples2)
	per_outcome_delta = probabilities1 - np.exp(self._epsilon) * probabilities2
	estimated_delta = np.sum(per_outcome_delta[per_outcome_delta > 0])
	error_tolerance = self._get_error_tolerance(
	num_samples, probabilities1, probabilities2, failure_probability
	)
	return estimated_delta - error_tolerance

	@override
	def reject_property(self, lower_bound: float) -> bool:
	return lower_bound >= self._delta