blob: ca39f9062f8aaf809520f8da83af5f9d59dcccea [file] [log] [blame]
# Copyright 2021 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Basic Example for Using Private Clustering."""
from typing import Sequence
from absl import app
from absl import flags
import numpy as np
from clustering import clustering_algorithm
from clustering import clustering_params
from clustering.demo import data_generation
FLAGS = flags.FLAGS

# --- Shape of the synthetic dataset -----------------------------------------
_NUM_POINTS = flags.DEFINE_integer(
    name='num_points',
    default=100000,
    help='Number of points in synthetic dataset.',
)
_DIM = flags.DEFINE_integer(
    name='dim',
    default=100,
    help='Dimension of points in synthetic dataset.',
)
_NUM_CLUSTERS = flags.DEFINE_integer(
    name='num_clusters',
    default=64,
    help='Number of clusters in synthetic dataset.',
)
_CLUSTER_RATIO = flags.DEFINE_float(
    name='cluster_ratio',
    default=8.0,
    help=('Parameter controlling the ratio of distances between points in '
          'different vs. same cluster.'),
)
_RADIUS = flags.DEFINE_float(
    name='radius',
    default=1.0,
    help='Radius of ball in which all points in synthetic dataset lie.',
)

# --- Experiment 1: epsilon held fixed, k varies ------------------------------
_FIXED_EPS = flags.DEFINE_float(
    name='fixed_epsilon',
    default=1.0,
    help='Value of epsilon to use when experimenting with varying k.',
)
# List flags hold strings; main() converts the entries to numbers.
_K_TO_TRY = flags.DEFINE_list(
    name='k_to_try',
    default='2, 4, 8, 16, 32, 64',
    help='List of k values to use when experimenting with varying k.',
)

# --- Experiment 2: k held fixed, epsilon varies ------------------------------
_FIXED_K = flags.DEFINE_integer(
    name='fixed_k',
    default=64,
    help='Value of k when experimenting with varying epsilon.',
)
_EPS_TO_TRY = flags.DEFINE_list(
    name='epsilon_to_try',
    default='0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0, inf',
    help=('List of epsilon values to use when experimenting with varying '
          'epsilon.'),
)

# --- Privacy accounting mode -------------------------------------------------
_USE_MECHANISM_CALIBRATION = flags.DEFINE_bool(
    name='use_mechanism_calibration',
    default=False,
    help='Runs demo with mechanism calibration instead of budget split',
)
def main(argv: Sequence[str]) -> None:
    """Runs the private clustering demo on a synthetic dataset.

    Generates a synthetic dataset from the command-line flags, prints a
    summary of it, and then prints two evaluation tables: one with epsilon
    fixed and k varying, and one with k fixed and epsilon varying.

    Args:
      argv: Positional command-line arguments; more than one entry is
        rejected.

    Raises:
      app.UsageError: If extra positional arguments are given, or if
        --k_to_try / --epsilon_to_try contain an entry that cannot be
        converted to int / float respectively.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Convert the list flags once, before any expensive work, so that a
    # malformed entry is reported immediately rather than after the dataset
    # has been generated and the first experiment has already run.
    try:
        k_values = [int(k) for k in _K_TO_TRY.value]
        eps_values = [float(eps) for eps in _EPS_TO_TRY.value]
    except ValueError as err:
        raise app.UsageError(
            f'Invalid value in --k_to_try or --epsilon_to_try: {err}') from err

    data: clustering_params.Data = data_generation.generate_synthetic_dataset(
        _NUM_POINTS.value, _DIM.value, _NUM_CLUSTERS.value,
        _CLUSTER_RATIO.value, _RADIUS.value)

    print('==== Synthetic Dataset Information ====\n'
          f'Number of datapoints: {_NUM_POINTS.value}\n'
          f'Dimensions: {_DIM.value}\n'
          f'Number of clusters: {_NUM_CLUSTERS.value}\n'
          f'Radius: {_RADIUS.value}\n'
          f'Cluster centers drawn from: Uniform over ball of '
          f'radius={_RADIUS.value * (1 - 1 / float(_CLUSTER_RATIO.value)):.4}\n'
          f'Each cluster drawn from: '
          f'N(cluster_center, '
          f'{_RADIUS.value / _CLUSTER_RATIO.value / np.sqrt(_DIM.value):.4} * I) '
          f'clipped to ball of radius {_RADIUS.value}')

    # Header row shared by both evaluation tables.
    eval_head = ('| k | epsilon | clustering loss | dominant label accuracy '
                 ' | false match fraction | true non-match fraction |')

    def run_clustering(k: int, eps: float) -> None:
        """Clusters `data` into k clusters with privacy eps; prints one row.

        Args:
          k: Number of clusters passed to the clustering algorithm.
          eps: Epsilon of the differential privacy parameter; delta is fixed
            at 1e-6.
        """
        privacy_param = clustering_params.DifferentialPrivacyParam(
            epsilon=eps, delta=1e-6)
        # A fresh multiplier object is built per run when mechanism
        # calibration is requested; otherwise None is passed.
        multipliers = (
            clustering_params.PrivacyCalculatorMultiplier()
            if _USE_MECHANISM_CALIBRATION.value else None)
        clustering_result: clustering_algorithm.ClusteringResult = (
            clustering_algorithm.private_lsh_clustering(
                k, data, privacy_param, multipliers=multipliers))
        clustering_metrics: clustering_algorithm.ClusteringMetrics = (
            clustering_result.get_clustering_metrics())
        correct_pred = clustering_metrics.dominant_label_correct_count
        accuracy = clustering_metrics.dominant_label_accuracy
        false_match_frac = clustering_metrics.false_match_frac
        true_nonmatch_frac = clustering_metrics.true_nonmatch_frac
        print(
            f'| {k:>2} | {eps:>7} '
            f'| {clustering_result.loss:>15.8} '
            f'| {accuracy:>6.2} ({correct_pred:>6} out of {_NUM_POINTS.value:>6}) '
            f'| {false_match_frac:>20.4} '
            f'| {true_nonmatch_frac:>23.4} |')

    # Experiment 1: fixed epsilon, varying k.
    print(f'\n# Evaluation with epsilon = {_FIXED_EPS.value} and '
          f'varying k in {k_values}')
    print(eval_head)
    for k in k_values:
        run_clustering(k, _FIXED_EPS.value)

    # Experiment 2: fixed k, varying epsilon.
    print(f'\n# Evaluation with k = {_FIXED_K.value} and '
          f'varying epsilon in {eps_values}')
    print(eval_head)
    for epsilon in eps_values:
        run_clustering(_FIXED_K.value, epsilon)

    print('Note: all computations apart from cluster centers, such as loss, '
          'label accuracy, etc. above are not differentially private.')
# Script entry point: when executed directly (not imported), run main through
# absl's app.run.
if __name__ == '__main__':
    app.run(main)