blob: 10528d830db5d85ce260316949b90695571d63e6 [file] [log] [blame]
# Copyright 2021 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Default values for the clustering algorithm parameters."""
from typing import Tuple
import numpy as np
from clustering import central_privacy_utils
from clustering import clustering_params
from clustering import privacy_calculator
PrivateCount = int
def default_tree_param(
k: int, data: clustering_params.Data,
pcalc: privacy_calculator.PrivacyCalculator,
max_depth: int) -> Tuple[clustering_params.TreeParam, PrivateCount]:
"""Heuristic tree param based on the data and number of clusters.
Args:
k: Number of clusters to divide the data into.
data: Data to find centers for.
pcalc: Privacy parameters.
max_depth: Max depth of the tree.
Returns:
(default TreeParam, private count). The private count is provided so that
it doesn't need to be re-computed.
"""
private_count = central_privacy_utils.get_private_count(
data.num_points, pcalc.count_privacy_param)
# We can consider the noise as distributed amongst the points that are being
# summed. The noise has l2-norm roughly sqrt(dimension) * sum_sigma,
# so if we distribute among 10 * sqrt(dimension) * sum_sigma / radius, each
# point has noise roughly 0.1 * radius.
num_points_in_node_for_low_noise = int(
10 * np.sqrt(data.dim) *
pcalc.average_privacy_param.gaussian_standard_deviation /
pcalc.average_privacy_param.sensitivity)
# We want to at least have the ability to consider a node per cluster, even
# if the noise might be higher than we'd like.
min_num_points_in_node = min(num_points_in_node_for_low_noise,
private_count // (2 * k))
# min_num_points_in_node must always be at least 1. Note it's possible that
# the private_count is negative, so we should ensure this max is done last.
min_num_points_in_node = max(1, min_num_points_in_node)
min_num_points_in_branching_node = 3 * min_num_points_in_node
return (clustering_params.TreeParam(
min_num_points_in_branching_node=min_num_points_in_branching_node,
min_num_points_in_node=min_num_points_in_node,
max_depth=max_depth), private_count)