| # Copyright 2021 Google LLC. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """Implement private clustering.""" |
| |
| import dataclasses |
| import typing |
| |
| from absl import logging |
| import numpy as np |
| import sklearn.cluster |
| |
| from clustering import clustering_params |
| from clustering import coreset_params |
| from clustering import default_clustering_params |
| from clustering import lsh_tree |
| from clustering import privacy_calculator |
| from clustering import private_outputs |
| |
| |
class ClusteringMetrics():
  """Computes quality metrics for a clustering against ground truth labels.

  Note: This class is relevant only for data with specified ground truth labels.

  1. Dominant Label Accuracy: For each cluster, as indicated by cluster
  labels, the accuracy is computed for the labeling of the points that assigns
  the most frequently occurring ground truth label to each cluster.

  2. True Non-matches: Fraction of pairs of points with the same ground truth
  label, which get assigned to different clusters.
  For a single ground truth label whose histogram over cluster labels is
  (n_1, ... , n_k), the number of pairs of points landing in different
  clusters is ((n_1 + ... + n_k)^2 - (n_1^2 + ... + n_k^2))/2; summing over
  ground truth labels gives the total count.

  3. False Matches: Fraction of pairs of points with different ground truth
  labels, which get assigned to the same cluster.
  Counted by the symmetric formula, exchanging the roles of cluster labels
  and ground truth labels.

  Attributes:
    cross_label_histogram: 2D histogram of (cluster label, true label) pairs.
    num_points: total number of points
    dominant_label_correct_count: number of labels correctly predicted
    dominant_label_accuracy: ratio of dominant_label_correct_count and
      num_points
    true_pairs: number of pairs of points with the same true label.
    true_nonmatch_count: number of pairs of points with same true label, but
      assigned to different clusters.
    true_nonmatch_frac: ratio of true_nonmatch_count and true_pairs
    false_pairs: number of pairs of points with different true labels.
    false_match_count: number of pairs of points with different true labels,
      but assigned to the same cluster.
    false_match_frac: ratio of false_match_count and false_pairs
  """
  cross_label_histogram: np.ndarray
  num_points: int
  dominant_label_correct_count: int
  dominant_label_accuracy: float
  true_pairs: int
  true_nonmatch_count: int
  true_nonmatch_frac: float
  false_pairs: int
  false_match_count: int
  false_match_frac: float

  def __init__(self, cross_label_histogram: np.ndarray):
    self.cross_label_histogram = cross_label_histogram
    self.num_points = np.sum(cross_label_histogram)
    # Sum of squared histogram entries, shared by both pair-count formulas.
    sum_sq_entries = np.sum(cross_label_histogram**2)
    total_pairs = self.num_points * (self.num_points - 1) / 2

    # Dominant Label Accuracy: per cluster (row), the points carrying the
    # cluster's most frequent true label are counted as correct.
    self.dominant_label_correct_count = np.max(
        cross_label_histogram, axis=1).sum()
    self.dominant_label_accuracy = (
        self.dominant_label_correct_count / self.num_points)

    # True Non-matches: pairs sharing a true label (column) but split
    # across clusters.
    per_true_label = cross_label_histogram.sum(axis=0)
    self.true_pairs = np.sum(per_true_label * (per_true_label - 1) / 2)
    self.true_nonmatch_count = (
        np.sum(per_true_label**2) - sum_sq_entries) / 2
    self.true_nonmatch_frac = self.true_nonmatch_count / self.true_pairs

    # False Matches: pairs in the same cluster (row) with differing true
    # labels.
    per_cluster = cross_label_histogram.sum(axis=1)
    self.false_pairs = total_pairs - self.true_pairs
    self.false_match_count = (
        np.sum(per_cluster**2) - sum_sq_entries) / 2
    self.false_match_frac = self.false_match_count / self.false_pairs
| |
| |
@dataclasses.dataclass(frozen=True)
class ClusteringResult():
  """Result of labelling the data using the centers.

  If neither labels nor loss is provided, both are computed from the data
  and centers in __post_init__.

  Attributes:
    data: Data that is being labelled.
    centers: Cluster centers.
    labels: Indices of the closest center for each datapoint.
    loss: The k-means objective with respect to the centers, i.e., sum of
      squared distances of the data to their closest center.
  """
  data: clustering_params.Data
  centers: clustering_params.Points
  labels: typing.Optional[np.ndarray] = None
  loss: typing.Optional[float] = None

  def __post_init__(self):
    """Fills in missing labels/loss and validates the result.

    Raises:
      ValueError: if exactly one of labels/loss was provided, if the number
        of labels does not match the number of points, if the center
        dimension does not match the data dimension, or if any label is not
        an integer in [0, number of clusters).
    """

    def closest_center(datapoint: np.ndarray):
      """Returns closest center to data point and the squared distance from it.

      Args:
        datapoint: 1D np.ndarray containing a single datapoint
      """
      squared_distances = np.sum((self.centers - datapoint)**2, axis=1)
      min_index = np.argmin(squared_distances)
      return (min_index, squared_distances[min_index])

    if self.labels is None and self.loss is None:
      result = [closest_center(datapoint) for datapoint in self.data.datapoints]
      # Frozen dataclass: fields must be assigned via object.__setattr__.
      object.__setattr__(self, "labels",
                         np.array([res[0] for res in result], dtype=int))
      object.__setattr__(self, "loss", sum([res[1] for res in result]))
    if self.labels is None or self.loss is None:
      raise ValueError("Only one of labels or loss was initialized; "
                       "either both should be initialized or none.")

    if self.data.num_points != len(self.labels):
      raise ValueError(f"number of labels ({self.labels.shape[0]}) is not "
                       f"equal to number of points ({self.data.num_points})")
    num_clusters, centers_dim = self.centers.shape
    if centers_dim != self.data.dim:
      raise ValueError(f"Dimension of cluster centers ({centers_dim}) is not "
                       f"equal to dimension of data points ({self.data.dim})")
    # Vectorized validity check: each label must be an integral value in
    # [0, num_clusters). The previous per-label membership test rebuilt
    # list(range(num_clusters)) for every label, which was
    # O(num_points * num_clusters).
    labels = np.asarray(self.labels)
    if not np.all((labels >= 0) & (labels < num_clusters)
                  & (np.mod(labels, 1) == 0)):
      raise ValueError("Labels in incorrect format. Each entry of label must "
                       "be an integer between 0 and number of clusters - 1")

  def cross_label_histogram(self) -> np.ndarray:
    """Returns 2D histogram of (cluster label, true label) pairs.

    Example:
      For cluster labels (self.labels) = [0, 0, 1, 1, 2, 2], and
      true labels (self.data.labels) = [0, 0, 0, 1, 1, 1]
      the 2D histogram is given as [[2, 0],
                                    [1, 1],
                                    [0, 2]]
      This is computed using np.histogram2d with bins
      [-0.5, 0.5, 1.5, 2.5] for cluster labels and
      [-0.5, 0.5, 1.5] for true labels.

    Raises:
      ValueError: if data does not have any specified true labels.
    """
    if self.data.labels is None:
      raise ValueError("Cross label histogram is undefined since data does not "
                       "have any specified labels")
    # Bins are centered on the integer label values, so each bin collects
    # exactly one label.
    bin_start = -0.5
    cluster_label_bins = np.arange(bin_start, np.max(self.labels) + 1, 1)
    true_label_bins = np.arange(bin_start, np.max(self.data.labels) + 1, 1)
    hist, _, _ = np.histogram2d(
        self.labels,
        self.data.labels,
        bins=(cluster_label_bins, true_label_bins))
    return hist.astype(int)

  def get_clustering_metrics(self) -> ClusteringMetrics:
    """Returns various clustering quality metrics, when data labels are given.

    Raises:
      ValueError: if data does not have any specified true labels.
    """
    return ClusteringMetrics(self.cross_label_histogram())
| |
| |
def private_lsh_clustering(
    k: int,
    data: clustering_params.Data,
    privacy_param: clustering_params.DifferentialPrivacyParam,
    privacy_budget_split: typing.Optional[
        clustering_params.PrivacyBudgetSplit] = None,
    tree_param: typing.Optional[clustering_params.TreeParam] = None,
    multipliers: typing.Optional[
        clustering_params.PrivacyCalculatorMultiplier] = None,
    short_description: str = "CoresetParam") -> ClusteringResult:
  """Clusters data into k clusters.

  Args:
    k: Number of clusters to divide the data into.
    data: Data to find centers for. Centering the data around the origin
      beforehand may provide performance improvements.
    privacy_param: Differential privacy parameters.
    privacy_budget_split: Optional privacy budget split between operations in
      the clustering algorithm for fine-tuning.
    tree_param: Optional tree parameters for generating the LSH net tree for
      fine-tuning.
    multipliers: Optional multipliers to determine ratio between noise
      parameters for different operations in the clustering algorithm for
      fine-tuning. When set, privacy_budget_split is ignored.
    short_description: Optional description to identify this parameter
      configuration.

  Returns:
    ClusteringResult with differentially private centers. The rest of
    ClusteringResult is nonprivate, and only provided for convenience.
  """
  # max_depth feeds into the private count calculation, so it cannot depend
  # on the count itself. The default of 20 was chosen experimentally over
  # multiple datasets.
  max_depth = 20 if tree_param is None else tree_param.max_depth

  # Build the privacy calculator. multipliers takes precedence over
  # privacy_budget_split when both are supplied.
  if multipliers is None:
    budget_split = (privacy_budget_split if privacy_budget_split is not None
                    else clustering_params.PrivacyBudgetSplit())
    pcalc = privacy_calculator.PrivacyCalculator.from_budget_split(
        privacy_param, budget_split, data.radius, max_depth)
  else:
    pcalc = privacy_calculator.PrivacyCalculator.from_mechanism_calibration(
        privacy_param, data.radius, max_depth, multipliers)

  logging.debug("Privacy calculator: %s", pcalc)
  pcalc.validate_accounting(privacy_param, max_depth)

  private_count = None
  if tree_param is None:
    # Saves the private count to re-use for the root node of the tree.
    tree_param, private_count = default_clustering_params.default_tree_param(
        k, data, pcalc, max_depth)
  coreset_param = coreset_params.CoresetParam(pcalc, tree_param,
                                              short_description, data.radius)
  logging.debug("coreset_param: %s", coreset_param)

  # To guarantee privacy, enforce the radius provided.
  clipped_data = clustering_params.Data(data.clip_by_radius(), data.radius,
                                        data.labels)

  coreset = get_private_coreset(clipped_data, coreset_param, private_count)

  # The coreset may contain fewer than k distinct points; cap k accordingly.
  num_clusters = min(k, len(coreset.datapoints))
  logging.debug(
      "Starting k-means++ computation on private coreset with k=%d. This may "
      "be less than the original if generated coreset data ended up with "
      "less than k unique points.", num_clusters)
  kmeans = sklearn.cluster.KMeans(
      n_clusters=num_clusters, init="k-means++").fit(
          coreset.datapoints, sample_weight=coreset.weights)
  # Calculate the result relative to the original data.
  # Note: the calculations besides the centers are nonprivate.
  return ClusteringResult(data, kmeans.cluster_centers_)
| |
| |
def get_private_coreset(
    data: clustering_params.Data,
    coreset_param: coreset_params.CoresetParam,
    private_count: typing.Optional[int],
) -> private_outputs.PrivateWeightedData:
  """Returns private coreset, when clustered it approximates data clustering.

  Args:
    data: Data to approximate with the coreset.
    coreset_param: Parameters for generating the coreset.
    private_count: Optional private count. If None, the private count will be
      computed.
  """
  logging.debug("Starting process to get private coreset.")
  root = lsh_tree.root_node(data, coreset_param, private_count)

  # The root node must report a private count of at least 1.
  root.private_count = max(1, root.private_count)

  # Each leaf of the LSH tree contributes one weighted coreset point: its
  # private average, weighted by its private count.
  leaves = lsh_tree.LshTree(root).leaves
  points = [leaf.get_private_average() for leaf in leaves]
  weights = [leaf.private_count for leaf in leaves]

  # To improve accuracy, we can clip the coreset points to the provided radius.
  clipped_points = data.clip_by_radius(np.array(points))

  logging.debug("Finished generating private coreset.")
  logging.debug("The coreset consists of %s points.", len(clipped_points))
  logging.debug("The coreset weights are: %s.", weights)
  return private_outputs.PrivateWeightedData(
      np.array(clipped_points), np.array(weights))