Add documentation and codelab for partition selection by truncated geometric thresholding.
PiperOrigin-RevId: 396013287
Change-Id: Ia2f5a77ab81ed5154d18a28013ecc054a23bdd45
GitOrigin-RevId: 3594567e28f98c56f7932aa7cd97eb9eab1cd863
diff --git a/docs/partition_selection.md b/docs/partition_selection.md
new file mode 100644
index 0000000..6d7ccf4
--- /dev/null
+++ b/docs/partition_selection.md
@@ -0,0 +1,29 @@
+# Partition Selection (Truncated Geometric Thresholding)
+
+Many data analysis operations can be expressed as a `GROUP BY` query on an
+unbounded set of partitions, followed by a per-partition aggregation. To make
+such a query differentially private, adding noise to each aggregation is not
+enough: we also need to make sure that the set of partitions released is itself
+differentially private.
+
+For the common case where each user contributes a record to exactly one
+partition, an optimal $$(\varepsilon,\delta)$$-differentially private mechanism
+for publishing or dropping partitions is described in
+https://arxiv.org/abs/2006.03684. In the paper, it is also shown that the
+optimal mechanism can be closely approximated by adding noise drawn from a
+truncated geometric distribution to the raw count of unique users in a
+partition, and then thresholding the noisy count. This library includes
+implementations of the mechanism in
+[C++](https://github.com/google/differential-privacy/blob/main/cc/algorithms/partition-selection.h#L192)
+and
+[Go](https://github.com/google/differential-privacy/blob/main/go/dpagg/select_partition.go#L87).
+A simple Python implementation and visualization is also provided in this
+[Google Colab notebook](https://colab.research.google.com/github/differential-privacy/blob/main/common_docs/partition_selection_playground.ipynb).
+
+In addition to the raw algorithm,
+[Privacy-On-Beam](https://github.com/google/differential-privacy/tree/main/privacy-on-beam)
+wraps partition selection as one component of an end-to-end differentially
+private data pipeline. Privacy-On-Beam also guarantees differential privacy in
+the case when users contribute to multiple partitions by splitting the privacy
+budget across these contributions. However, this is not guaranteed to be optimal
+in the sense of maximizing the number of partitions reported.
diff --git a/docs/partition_selection_playground.ipynb b/docs/partition_selection_playground.ipynb
new file mode 100644
index 0000000..4ef27d5
--- /dev/null
+++ b/docs/partition_selection_playground.ipynb
@@ -0,0 +1,165 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "Partition selection playground.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "authorship_tag": "ABX9TyOcI8emsgdxD+Wxlz3mQIc6",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "<a href=\"https://colab.research.google.com/github/differential-privacy/blob/main/common_docs/partition_selection_playground.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JLeHI_2ZxNfm"
+ },
+ "source": [
+ "### Copyright 2021 Google LLC.\n",
+ "### SPDX-License-Identifier: Apache-2.0\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1-UTSWYmfq6i"
+ },
+ "source": [
+ "# Optimal Partition Selection\n",
+ "This notebook presents Python code to enable exploration of the optimal partition selection algorithm presented in https://arxiv.org/pdf/2006.03684.pdf."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "QihnuIsSu41Q"
+ },
+ "source": [
+ "import numpy as np\n",
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "sns.set(rc={'figure.figsize':(15,8)})"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "XPX_D9I7vK6r"
+ },
+ "source": [
+ "# PreAggPartitionSelection implements optimal partition selection - instead\n",
+ "# of thresholding a noised value to determine whether or not a partition\n",
+ "# should be kept, this uses a formula derived from the\n",
+ "# original probablistic definition of differential privacy to generate the\n",
+ "# probability with which a partition should be kept. The math is shown in\n",
+ "# https://arxiv.org/pdf/2006.03684.pdf.\n",
+ "class PreaggPartitionSelection(object):\n",
+ "\n",
+ " def __init__(self, epsilon: float, delta: float):\n",
+ " self._epsilon = epsilon\n",
+ " self._delta = delta\n",
+ " self._crossover_1 = (\n",
+ " 1 +\n",
+ " np.floor(np.log1p(np.tanh(epsilon / 2) * (1 / delta - 1)) / epsilon))\n",
+ " self._p_crossover = (np.expm1(self._crossover_1 * self.epsilon) /\n",
+ " np.expm1(self.epsilon)) * self.delta\n",
+ " self._crossover_2 = self._crossover_1 + np.floor((1.0 / epsilon) * np.log1p(\n",
+ " (np.expm1(epsilon) / delta) * (1 - self._p_crossover)))\n",
+ "\n",
+ " @property\n",
+ " def epsilon(self):\n",
+ " return self._epsilon\n",
+ "\n",
+ " @property\n",
+ " def delta(self):\n",
+ " return self._delta\n",
+ "\n",
+ " @property\n",
+ " def first_crossover(self):\n",
+ " return self._crossover_1\n",
+ "\n",
+ " @property\n",
+ " def second_crossover(self):\n",
+ " return self._crossover_2\n",
+ "\n",
+ " def sample_keep(self, num_users: int):\n",
+ " \"\"\"Sample a Bernoulli RV whether or not the variable should be exposed.\"\"\"\n",
+ " return np.random.Bernoulli(self.probability_of_keep(num_users))\n",
+ "\n",
+ " # ProbabilityOfKeep returns the probability with which a partition with n\n",
+ " # users should be kept, Thm. 1 of https://arxiv.org/pdf/2006.03684.pdf\n",
+ " def probability_of_keep(self, n: int) -> float:\n",
+ " conds = np.array([n == 0, n <= self.first_crossover, n <= self.second_crossover, \n",
+ " n > self.first_crossover])\n",
+ " choicelist = np.array([0.0, ((np.expm1(n * self.epsilon) / np.expm1(self.epsilon)) *\n",
+ " self.delta),\n",
+ " self._p_crossover -\n",
+ " (1 - self._p_crossover +\n",
+ " (self.delta / np.expm1(self.epsilon))) * np.expm1(-(n - self.first_crossover) * self.epsilon),\n",
+ " 1.0])\n",
+ " return np.select(conds, choicelist)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LYDAMoNogVjN"
+ },
+ "source": [
+ "## Plot probabilities for varying values of $\\varepsilon$ and $\\delta$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "bv5V7CtwvNyg"
+ },
+ "source": [
+ "#@title Probabilities and diffs{ run: \"auto\" }\n",
+ "epsilon = 1.034 #@param { type: \"slider\", min: 0.01, max: 3 , step: 0.001}\n",
+ "log10_delta = -5.112 #@param { type: \"slider\", min: -15, max: 0 , step: 0.001}\n",
+ "\n",
+ "pp = PreaggPartitionSelection(epsilon, np.power(10, log10_delta))\n",
+ "fig, axes = plt.subplots(3, 1, figsize=(25, 15))\n",
+ "sns.lineplot(\n",
+ " ax=axes[0],\n",
+ " x=np.arange(pp.second_crossover + 2),\n",
+ " y=pp.probability_of_keep(np.arange(pp.second_crossover + 2)),ci=None)\n",
+ "sns.barplot(\n",
+ " ax=axes[1],\n",
+ " x=np.arange(pp.second_crossover + 1) + 1,\n",
+ " y=np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2))),ci=None)\n",
+ "sns.lineplot(\n",
+ " ax=axes[2],\n",
+ " x=np.arange(pp.second_crossover + 1) + 1,\n",
+ " y=np.log10(\n",
+ " np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2)))),ci=None)\n"
+ ],
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file