blob: 3c94a09ef1179ea276a2f26ccbd409a2bf639b35 [file] [log] [blame]
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Partition selection playground.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyOcI8emsgdxD+Wxlz3mQIc6",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/google/differential-privacy/blob/main/common_docs/partition_selection_playground.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JLeHI_2ZxNfm"
},
"source": [
"### Copyright 2021 Google LLC.\n",
"### SPDX-License-Identifier: Apache-2.0\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1-UTSWYmfq6i"
},
"source": [
"# Optimal Partition Selection\n",
"This notebook presents Python code to enable exploration of the optimal partition selection algorithm presented in https://arxiv.org/pdf/2006.03684.pdf."
]
},
{
"cell_type": "code",
"metadata": {
"id": "QihnuIsSu41Q"
},
"source": [
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"import seaborn as sns\n",
"sns.set(rc={'figure.figsize':(15,8)})"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XPX_D9I7vK6r"
},
"source": [
"# PreAggPartitionSelection implements optimal partition selection - instead\n",
"# of thresholding a noised value to determine whether or not a partition\n",
"# should be kept, this uses a formula derived from the\n",
"# original probablistic definition of differential privacy to generate the\n",
"# probability with which a partition should be kept. The math is shown in\n",
"# https://arxiv.org/pdf/2006.03684.pdf.\n",
"class PreaggPartitionSelection(object):\n",
"\n",
" def __init__(self, epsilon: float, delta: float):\n",
" self._epsilon = epsilon\n",
" self._delta = delta\n",
" self._crossover_1 = (\n",
" 1 +\n",
" np.floor(np.log1p(np.tanh(epsilon / 2) * (1 / delta - 1)) / epsilon))\n",
" self._p_crossover = (np.expm1(self._crossover_1 * self.epsilon) /\n",
" np.expm1(self.epsilon)) * self.delta\n",
" self._crossover_2 = self._crossover_1 + np.floor((1.0 / epsilon) * np.log1p(\n",
" (np.expm1(epsilon) / delta) * (1 - self._p_crossover)))\n",
"\n",
" @property\n",
" def epsilon(self):\n",
" return self._epsilon\n",
"\n",
" @property\n",
" def delta(self):\n",
" return self._delta\n",
"\n",
" @property\n",
" def first_crossover(self):\n",
" return self._crossover_1\n",
"\n",
" @property\n",
" def second_crossover(self):\n",
" return self._crossover_2\n",
"\n",
" def sample_keep(self, num_users: int):\n",
" \"\"\"Sample a Bernoulli RV whether or not the variable should be exposed.\"\"\"\n",
" return np.random.Bernoulli(self.probability_of_keep(num_users))\n",
"\n",
" # ProbabilityOfKeep returns the probability with which a partition with n\n",
" # users should be kept, Thm. 1 of https://arxiv.org/pdf/2006.03684.pdf\n",
" def probability_of_keep(self, n: int) -> float:\n",
" conds = np.array([n == 0, n <= self.first_crossover, n <= self.second_crossover, \n",
" n > self.first_crossover])\n",
" choicelist = np.array([0.0, ((np.expm1(n * self.epsilon) / np.expm1(self.epsilon)) *\n",
" self.delta),\n",
" self._p_crossover -\n",
" (1 - self._p_crossover +\n",
" (self.delta / np.expm1(self.epsilon))) * np.expm1(-(n - self.first_crossover) * self.epsilon),\n",
" 1.0])\n",
" return np.select(conds, choicelist)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "LYDAMoNogVjN"
},
"source": [
"## Plot probabilities for varying values of $\\varepsilon$ and $\\delta$"
]
},
{
"cell_type": "code",
"metadata": {
"id": "bv5V7CtwvNyg"
},
"source": [
"#@title Probabilities and diffs{ run: \"auto\" }\n",
"epsilon = 1.034 #@param { type: \"slider\", min: 0.01, max: 3 , step: 0.001}\n",
"log10_delta = -5.112 #@param { type: \"slider\", min: -15, max: 0 , step: 0.001}\n",
"\n",
"pp = PreaggPartitionSelection(epsilon, np.power(10, log10_delta))\n",
"fig, axes = plt.subplots(3, 1, figsize=(25, 15))\n",
"sns.lineplot(\n",
" ax=axes[0],\n",
" x=np.arange(pp.second_crossover + 2),\n",
" y=pp.probability_of_keep(np.arange(pp.second_crossover + 2)),ci=None)\n",
"sns.barplot(\n",
" ax=axes[1],\n",
" x=np.arange(pp.second_crossover + 1) + 1,\n",
" y=np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2))),ci=None)\n",
"sns.lineplot(\n",
" ax=axes[2],\n",
" x=np.arange(pp.second_crossover + 1) + 1,\n",
" y=np.log10(\n",
" np.diff(pp.probability_of_keep(np.arange(pp.second_crossover + 2)))),ci=None)\n"
],
"execution_count": null,
"outputs": []
}
]
}