learning/clustering/default_clustering_params.py - third_party/github.com/google/differential-privacy - Git at Google

 # Copyright 2021 Google LLC.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Default values for the clustering algorithm parameters."""

 from typing import Tuple
 import numpy as np

 from clustering import central_privacy_utils
 from clustering import clustering_params
 from clustering import privacy_calculator

 PrivateCount = int


 def default_tree_param(
     k: int, data: clustering_params.Data,
     pcalc: privacy_calculator.PrivacyCalculator,
     max_depth: int) -> Tuple[clustering_params.TreeParam, PrivateCount]:
   """Heuristic tree param based on the data and number of clusters.

   Args:
     k: Number of clusters to divide the data into.
     data: Data to find centers for.
     pcalc: Privacy parameters.
     max_depth: Max depth of the tree.

   Returns:
     (default TreeParam, private count). The private count is provided so that
     it doesn't need to be re-computed.
   """
   private_count = central_privacy_utils.get_private_count(
       data.num_points, pcalc.count_privacy_param)

   # We can consider the noise as distributed amongst the points that are being
   # summed. The noise has l2-norm roughly sqrt(dimension) * sum_sigma,
   # so if we distribute among 10 * sqrt(dimension) * sum_sigma / radius, each
   # point has noise roughly 0.1 * radius.
   num_points_in_node_for_low_noise = int(
       10 * np.sqrt(data.dim) *
       pcalc.average_privacy_param.gaussian_standard_deviation /
       pcalc.average_privacy_param.sensitivity)

   # We want to at least have the ability to consider a node per cluster, even
   # if the noise might be higher than we'd like.
   min_num_points_in_node = min(num_points_in_node_for_low_noise,
                                private_count // (2 * k))

   # min_num_points_in_node must always be at least 1. Note it's possible that
   # the private_count is negative, so we should ensure this max is done last.
   min_num_points_in_node = max(1, min_num_points_in_node)
   min_num_points_in_branching_node = 3 * min_num_points_in_node

   return (clustering_params.TreeParam(
       min_num_points_in_branching_node=min_num_points_in_branching_node,
       min_num_points_in_node=min_num_points_in_node,
       max_depth=max_depth), private_count)
	# Copyright 2021 Google LLC.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Default values for the clustering algorithm parameters."""

	from typing import Tuple
	import numpy as np

	from clustering import central_privacy_utils
	from clustering import clustering_params
	from clustering import privacy_calculator

	PrivateCount = int


	def default_tree_param(
	k: int, data: clustering_params.Data,
	pcalc: privacy_calculator.PrivacyCalculator,
	max_depth: int) -> Tuple[clustering_params.TreeParam, PrivateCount]:
	"""Heuristic tree param based on the data and number of clusters.

	Args:
	k: Number of clusters to divide the data into.
	data: Data to find centers for.
	pcalc: Privacy parameters.
	max_depth: Max depth of the tree.

	Returns:
	(default TreeParam, private count). The private count is provided so that
	it doesn't need to be re-computed.
	"""
	private_count = central_privacy_utils.get_private_count(
	data.num_points, pcalc.count_privacy_param)

	# We can consider the noise as distributed amongst the points that are being
	# summed. The noise has l2-norm roughly sqrt(dimension) * sum_sigma,
	# so if we distribute among 10 * sqrt(dimension) * sum_sigma / radius, each
	# point has noise roughly 0.1 * radius.
	num_points_in_node_for_low_noise = int(
	10 * np.sqrt(data.dim) *
	pcalc.average_privacy_param.gaussian_standard_deviation /
	pcalc.average_privacy_param.sensitivity)

	# We want to at least have the ability to consider a node per cluster, even
	# if the noise might be higher than we'd like.
	min_num_points_in_node = min(num_points_in_node_for_low_noise,
	private_count // (2 * k))

	# min_num_points_in_node must always be at least 1. Note it's possible that
	# the private_count is negative, so we should ensure this max is done last.
	min_num_points_in_node = max(1, min_num_points_in_node)
	min_num_points_in_branching_node = 3 * min_num_points_in_node

	return (clustering_params.TreeParam(
	min_num_points_in_branching_node=min_num_points_in_branching_node,
	min_num_points_in_node=min_num_points_in_node,
	max_depth=max_depth), private_count)