common_docs/partition_selection_playground.ipynb - third_party/github.com/google/differential-privacy - Git at Google

 {
   "nbformat": 4,
   "nbformat_minor": 0,
   "metadata": {
     "colab": {
       "name": "Partition selection playground.ipynb",
       "provenance": [],
       "collapsed_sections": [],
       "authorship_tag": "ABX9TyOcI8emsgdxD+Wxlz3mQIc6",
       "include_colab_link": true
     },
     "kernelspec": {
       "name": "python3",
       "display_name": "Python 3"
     },
     "language_info": {
       "name": "python"
     }
   },
   "cells": [
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "view-in-github",
         "colab_type": "text"
       },
       "source": [
         "<a href=\"https://colab.research.google.com/github/google/differential-privacy/blob/main/common_docs/partition_selection_playground.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "JLeHI_2ZxNfm"
       },
       "source": [
         "### Copyright 2021 Google LLC.\n",
         "### SPDX-License-Identifier: Apache-2.0\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "1-UTSWYmfq6i"
       },
       "source": [
         "# Optimal Partition Selection\n",
         "This notebook presents Python code to enable exploration of the optimal partition selection algorithm presented in https://arxiv.org/pdf/2006.03684.pdf."
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
         "id": "QihnuIsSu41Q"
       },
       "source": [
         "import numpy as np\n",
         "from matplotlib import pyplot as plt\n",
         "import seaborn as sns\n",
         "sns.set(rc={'figure.figsize':(15,8)})"
       ],
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {
         "id": "XPX_D9I7vK6r"
       },
       "source": [
         "# PreAggPartitionSelection implements optimal partition selection - instead\n",
         "# of thresholding a noised value to determine whether or not a partition\n",
         "# should be kept, this uses a formula derived from the\n",
         "# original probablistic definition of differential privacy to generate the\n",
         "# probability with which a partition should be kept. The math is shown in\n",
         "# https://arxiv.org/pdf/2006.03684.pdf.\n",
         "class PreaggPartitionSelection(object):\n",
         "\n",
         "  def __init__(self, epsilon: float, delta: float):\n",
         "    self._epsilon = epsilon\n",
         "    self._delta = delta\n",
         "    self._crossover_1 = (\n",
         "        1 +\n",
         "        np.floor(np.log1p(np.tanh(epsilon / 2) * (1 / delta - 1)) / epsilon))\n",
         "    self._p_crossover = (np.expm1(self._crossover_1 * self.epsilon) /\n",
         "                         np.expm1(self.epsilon)) * self.delta\n",
         "    self._crossover_2 = self._crossover_1 + np.floor((1.0 / epsilon) * np.log1p(\n",
         "        (np.expm1(epsilon) / delta) * (1 - self._p_crossover)))\n",
         "\n",
         "  @property\n",
         "  def epsilon(self):\n",
         "    return self._epsilon\n",
         "\n",
         "  @property\n",
         "  def delta(self):\n",
         "    return self._delta\n",
         "\n",
         "  @property\n",
         "  def first_crossover(self):\n",
         "    return self._crossover_1\n",
         "\n",
         "  @property\n",
         "  def second_crossover(self):\n",
         "    return self._crossover_2\n",
         "\n",
         "  def sample_keep(self, num_users: int):\n",
         "    \"\"\"Sample a Bernoulli RV whether or not the variable should be exposed.\"\"\"\n",
         "    return np.random.Bernoulli(self.probability_of_keep(num_users))\n",
         "\n",
         "  # ProbabilityOfKeep returns the probability with which a partition with n\n",
         "  # users should be kept, Thm. 1 of https://arxiv.org/pdf/2006.03684.pdf\n",
         "  def probability_of_keep(self, n: int) -> float:\n",
         "   conds =  np.array([n == 0, n <= self.first_crossover, n <= self.second_crossover, \n",
         "      n > self.first_crossover])\n",
         "   choicelist = np.array([0.0, ((np.expm1(n * self.epsilon) / np.expm1(self.epsilon)) *\n",
         "              self.delta),\n",
         "          self._p_crossover -\n",
         "          (1 - self._p_crossover +\n",
         "           (self.delta / np.expm1(self.epsilon))) * np.expm1(-(n - self.first_crossover) * self.epsilon),\n",
         "     1.0])\n",
         "   return np.select(conds, choicelist)"
       ],
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "LYDAMoNogVjN"
       },
       "source": [
         "## Plot probabilities for varying values of $\\varepsilon$ and $\\delta$"
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
         "id": "bv5V7CtwvNyg"
       },
       "source": [
         "#@title Probabilities and diffs{ run: \"auto\" }\n",
         "epsilon = 1.034  #@param { type: \"slider\", min: 0.01, max: 3 , step: 0.001}\n",
         "log10_delta = -5.112  #@param { type: \"slider\", min: -15, max: 0 , step: 0.001}\n",
         "\n",
         "pp = PreaggPartitionSelection(epsilon, np.power(10, log10_delta))\n",
         "fig, axes = plt.subplots(3, 1, figsize=(25, 15))\n",
         "sns.lineplot(\n",
         "    ax=axes[0],\n",
         "    x=np.arange(pp.second_crossover + 2),\n",
         "    y=pp.probability_of_keep(np.arange(pp.second_crossover + 2)),ci=None)\n",
         "sns.barplot(\n",
         "    ax=axes[1],\n",
         "    x=np.arange(pp.second_crossover + 1) + 1,\n",
         "    y=np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2))),ci=None)\n",
         "sns.lineplot(\n",
         "    ax=axes[2],\n",
         "    x=np.arange(pp.second_crossover + 1) + 1,\n",
         "    y=np.log10(\n",
         "        np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2)))),ci=None)\n"
       ],
       "execution_count": null,
       "outputs": []
     }
   ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "Partition selection playground.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyOcI8emsgdxD+Wxlz3mQIc6",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/github/google/differential-privacy/blob/main/common_docs/partition_selection_playground.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "JLeHI_2ZxNfm"
	},
	"source": [
	"### Copyright 2021 Google LLC.\n",
	"### SPDX-License-Identifier: Apache-2.0\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "1-UTSWYmfq6i"
	},
	"source": [
	"# Optimal Partition Selection\n",
	"This notebook presents Python code to enable exploration of the optimal partition selection algorithm presented in https://arxiv.org/pdf/2006.03684.pdf."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "QihnuIsSu41Q"
	},
	"source": [
	"import numpy as np\n",
	"from matplotlib import pyplot as plt\n",
	"import seaborn as sns\n",
	"sns.set(rc={'figure.figsize':(15,8)})"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "XPX_D9I7vK6r"
	},
	"source": [
	"# PreAggPartitionSelection implements optimal partition selection - instead\n",
	"# of thresholding a noised value to determine whether or not a partition\n",
	"# should be kept, this uses a formula derived from the\n",
	"# original probablistic definition of differential privacy to generate the\n",
	"# probability with which a partition should be kept. The math is shown in\n",
	"# https://arxiv.org/pdf/2006.03684.pdf.\n",
	"class PreaggPartitionSelection(object):\n",
	"\n",
	" def __init__(self, epsilon: float, delta: float):\n",
	" self._epsilon = epsilon\n",
	" self._delta = delta\n",
	" self._crossover_1 = (\n",
	" 1 +\n",
	" np.floor(np.log1p(np.tanh(epsilon / 2) * (1 / delta - 1)) / epsilon))\n",
	" self._p_crossover = (np.expm1(self._crossover_1 * self.epsilon) /\n",
	" np.expm1(self.epsilon)) * self.delta\n",
	" self._crossover_2 = self._crossover_1 + np.floor((1.0 / epsilon) * np.log1p(\n",
	" (np.expm1(epsilon) / delta) * (1 - self._p_crossover)))\n",
	"\n",
	" @property\n",
	" def epsilon(self):\n",
	" return self._epsilon\n",
	"\n",
	" @property\n",
	" def delta(self):\n",
	" return self._delta\n",
	"\n",
	" @property\n",
	" def first_crossover(self):\n",
	" return self._crossover_1\n",
	"\n",
	" @property\n",
	" def second_crossover(self):\n",
	" return self._crossover_2\n",
	"\n",
	" def sample_keep(self, num_users: int):\n",
	" \"\"\"Sample a Bernoulli RV whether or not the variable should be exposed.\"\"\"\n",
	" return np.random.Bernoulli(self.probability_of_keep(num_users))\n",
	"\n",
	" # ProbabilityOfKeep returns the probability with which a partition with n\n",
	" # users should be kept, Thm. 1 of https://arxiv.org/pdf/2006.03684.pdf\n",
	" def probability_of_keep(self, n: int) -> float:\n",
	" conds = np.array([n == 0, n <= self.first_crossover, n <= self.second_crossover, \n",
	" n > self.first_crossover])\n",
	" choicelist = np.array([0.0, ((np.expm1(n * self.epsilon) / np.expm1(self.epsilon)) *\n",
	" self.delta),\n",
	" self._p_crossover -\n",
	" (1 - self._p_crossover +\n",
	" (self.delta / np.expm1(self.epsilon))) * np.expm1(-(n - self.first_crossover) * self.epsilon),\n",
	" 1.0])\n",
	" return np.select(conds, choicelist)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "LYDAMoNogVjN"
	},
	"source": [
	"## Plot probabilities for varying values of $\\varepsilon$ and $\\delta$"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "bv5V7CtwvNyg"
	},
	"source": [
	"#@title Probabilities and diffs{ run: \"auto\" }\n",
	"epsilon = 1.034 #@param { type: \"slider\", min: 0.01, max: 3 , step: 0.001}\n",
	"log10_delta = -5.112 #@param { type: \"slider\", min: -15, max: 0 , step: 0.001}\n",
	"\n",
	"pp = PreaggPartitionSelection(epsilon, np.power(10, log10_delta))\n",
	"fig, axes = plt.subplots(3, 1, figsize=(25, 15))\n",
	"sns.lineplot(\n",
	" ax=axes[0],\n",
	" x=np.arange(pp.second_crossover + 2),\n",
	" y=pp.probability_of_keep(np.arange(pp.second_crossover + 2)),ci=None)\n",
	"sns.barplot(\n",
	" ax=axes[1],\n",
	" x=np.arange(pp.second_crossover + 1) + 1,\n",
	" y=np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2))),ci=None)\n",
	"sns.lineplot(\n",
	" ax=axes[2],\n",
	" x=np.arange(pp.second_crossover + 1) + 1,\n",
	" y=np.log10(\n",
	" np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2)))),ci=None)\n"
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}