blob: 9d4e7692e98b414f1d65841a357d06253078349f [file] [log] [blame]
//
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Package examples contains different usage examples of the DP library and related utilities.
package examples
import (
"fmt"
"math"
"math/rand"
"time"
"github.com/google/differential-privacy/go/v2/dpagg"
"github.com/google/differential-privacy/go/v2/noise"
)
const (
// openingHour is the hour when visitors start entering the restaurant.
openingHour = 9
// closingHour is the hour when visitors stop entering the restaurant.
closingHour = 20
// Number of weekly visits for a visitor is limited to 3. All exceeding visits will be discarded.
maxThreeVisitsPerWeek = 3
// Number of weekly visits for a visitor is limited to 4. All exceeding visits will be discarded.
maxFourVisitsPerWeek = 4
// Minimum amount of money we expect a visitor to spend on a single visit.
minEurosSpent = 0
// Maximum amount of money we expect a visitor to spend on a single visit.
maxEurosSpent = 50
)
var (
ln3 = math.Log(3)
// 1 - Monday, 2 - Tuesday, 3 - Wednesday, 4 - Thursday, 5 - Friday, 6 - Saturday, 7 - Sunday
weekDays = []int64{1, 2, 3, 4, 5, 6, 7}
)
// Scenario is an interface for codelab scenarios.
type Scenario interface {
getNonPrivateResults(visits []Visit) map[int64]int64
getPrivateResults(visits []Visit) (map[int64]int64, error)
}
// RunScenario runs scenario sc on the input file and writes private and non-private results to the output files.
func RunScenario(sc Scenario, inputFile, nonPrivateResultsOutputFile, privateResultsOutputFile string) error {
visits, err := readVisitsFromCSV(inputFile)
if err != nil {
return err
}
nonPrResults := sc.getNonPrivateResults(visits)
prResults, err := sc.getPrivateResults(visits)
if err != nil {
return err
}
err = writeResultsToCSV(nonPrResults, nonPrivateResultsOutputFile)
if err != nil {
return err
}
return writeResultsToCSV(prResults, privateResultsOutputFile)
}
// CountVisitsPerHourScenario loads the input file, calculates non-anonymized and
// anonymized counts of visitors entering a restaurant every hour, and prints results
// to nonPrivateResultsOutputFile and privateResultsOutputFile.
// Uses dpagg.Count for calculating anonymized counts.
type CountVisitsPerHourScenario struct{}
// Calculates the raw count of the given visits per hour of day.
// Returns the map that maps an hour to a raw count of visits for the hours between
// OpeningHour and ClosingHour.
func (sc *CountVisitsPerHourScenario) getNonPrivateResults(dayVisits []Visit) map[int64]int64 {
counts := make(map[int64]int64)
for _, visit := range dayVisits {
h := visit.VisitTime.Hour()
counts[int64(h)]++
}
return counts
}
// Calculates the anonymized (i.e., "private") counts of the given visits per hour of day.
// Returns the map that maps an hour to an anonymized count of visits for the hours between
// OpeningHour and ClosingHour.
func (sc *CountVisitsPerHourScenario) getPrivateResults(dayVisits []Visit) (map[int64]int64, error) {
hourToDpCount := make(map[int64]*dpagg.Count)
var err error
for h := int64(openingHour); h <= closingHour; h++ {
// Construct dpagg.Count objects which will be used to calculate DP counts.
// One dpagg.Count is created for every work hour.
hourToDpCount[h], err = dpagg.NewCount(&dpagg.CountOptions{
Epsilon: ln3,
MaxPartitionsContributed: 1,
Noise: noise.Laplace(),
})
if err != nil {
return nil, fmt.Errorf("couldn't initialize count for hour %d: %w", h, err)
}
}
for _, visit := range dayVisits {
h := visit.VisitTime.Hour()
hourToDpCount[int64(h)].Increment()
}
privateCounts := make(map[int64]int64)
for h, dpCount := range hourToDpCount {
privateCounts[h], err = dpCount.Result()
if err != nil {
return nil, fmt.Errorf("couldn't compute dp count: %w", err)
}
}
return privateCounts, nil
}
// CountVisitsPerDayScenario loads the input file, calculates non-anonymized and
// anonymized counts of visits per week day, and prints results to
// nonPrivateResultsOutputFile and privateResultsOutputFile.
// Assumes that a visitor may enter the restaurant only once per day,
// but may enter multiple days per week.
// Uses dpagg.Count for calculating anonymized counts.
type CountVisitsPerDayScenario struct{}
// Calculates the raw count of the given number of visits per day.
// Returns the map that maps a day to a raw count of visits in this day.
func (sc *CountVisitsPerDayScenario) getNonPrivateResults(weekVisits []Visit) map[int64]int64 {
counts := make(map[int64]int64)
for _, visit := range weekVisits {
counts[int64(visit.Day)]++
}
return counts
}
// Calculates the anonymized (i.e., "private") count of the given visits per day.
// Returns the map that maps a day to an anonymized count of visits in this day.
func (sc *CountVisitsPerDayScenario) getPrivateResults(weekVisits []Visit) (map[int64]int64, error) {
dayToDpCount := make(map[int64]*dpagg.Count)
var err error
for _, day := range weekDays {
// Construct dpagg.Count objects which will be used to calculate the DP counts.
// One dpagg.Count is created for every work hour.
dayToDpCount[day], err = dpagg.NewCount(&dpagg.CountOptions{
Epsilon: ln3,
// The data was pre-processed so that
// each visitor may visit the restaurant up to maxThreeVisitsPerWeek times per week.
// Hence, each visitor may contribute to up to maxThreeVisitsPerWeek daily visit counts.
// Note: while the library accepts this limit as a configurable parameter,
// it doesn't pre-process the data to ensure this limit is respected.
// It is responsibility of the caller to ensure the data passed to the library
// is capped for getting the correct privacy guarantee.
// TODO: Clarify the note above.
MaxPartitionsContributed: maxThreeVisitsPerWeek,
Noise: noise.Laplace(),
})
if err != nil {
return nil, fmt.Errorf("couldn't initialize count for day %d: %w", day, err)
}
}
// Pre-process the data set by limiting the number of visits to maxThreeVisitsPerWeek
// per VisitorId.
boundedVisits := boundVisits(weekVisits, maxThreeVisitsPerWeek)
for _, visit := range boundedVisits {
dayToDpCount[int64(visit.Day)].Increment()
}
privateCounts := make(map[int64]int64)
for day, dpCount := range dayToDpCount {
privateCounts[day], err = dpCount.Result()
if err != nil {
return nil, fmt.Errorf("couldn't compute dp count: %w", err)
}
}
return privateCounts, nil
}
// SumRevenuePerDayScenario loads the input file, calculates non-anonymized and
// anonymized amount of money spent by the visitors per weekday, and prints results
// to nonPrivateResultsOutputFile and privateResultsOutputFile.
// Assumes that a visitor may enter the restaurant at most once per day,
// but may enter multiple days per week.
// Uses dpagg.BoundedSumInt64 for calculating anonymized sums.
type SumRevenuePerDayScenario struct {
Scenario
}
// Calculates the total raw revenue for each day of the week.
// Returns the map that maps a day to a raw revenue for this day.
func (sc *SumRevenuePerDayScenario) getNonPrivateResults(visits []Visit) map[int64]int64 {
sums := make(map[int64]int64)
for _, visit := range visits {
sums[int64(visit.Day)] += visit.EurosSpent
}
return sums
}
// Calculates the anonymized revenue for each day of the week.
// Returns the map that maps a day to an anonymized revenue for this day.
func (sc *SumRevenuePerDayScenario) getPrivateResults(visits []Visit) (map[int64]int64, error) {
dayToBoundedSum := make(map[int64]*dpagg.BoundedSumInt64)
var err error
for _, day := range weekDays {
// Construct dpagg.BoundedSumInt64 objects that will be used to calculate DP sums.
// One dpagg.BoundedSumInt64 is created for every day.
dayToBoundedSum[day], err = dpagg.NewBoundedSumInt64(&dpagg.BoundedSumInt64Options{
Epsilon: ln3,
// The data was pre-processed so that
// each visitor may visit the restaurant up to maxFourVisitsPerWeek times per week.
// Hence, each privacy unit may contribute to up to maxFourVisitsPerWeek daily counts.
// Note: while the library accepts this limit as a configurable parameter,
// it doesn't pre-process the data to ensure this limit is respected.
// It is responsibility of the caller to ensure the data passed to the library
// is capped for getting the correct privacy guarantee.
// TODO: Clarify the note above.
MaxPartitionsContributed: maxFourVisitsPerWeek,
// No need to pre-process the data: BoundedSumInt64 will clamp the input values.
Lower: minEurosSpent,
Upper: maxEurosSpent,
Noise: noise.Laplace(),
})
if err != nil {
return nil, fmt.Errorf("couldn't initialize sum for day %d: %w", day, err)
}
}
// Pre-process the data set by limiting the number of visits to maxFourVisitsPerWeek
// per VisitorID.
boundedVisits := boundVisits(visits, maxFourVisitsPerWeek)
d := make(map[int]int64)
for _, v := range boundedVisits {
d[v.Day]++
}
for _, visit := range boundedVisits {
dayToBoundedSum[int64(visit.Day)].Add(visit.EurosSpent)
}
privateSums := make(map[int64]int64)
for day, boundedSum := range dayToBoundedSum {
privateSums[day], err = boundedSum.Result()
if err != nil {
return nil, fmt.Errorf("couldn't compute dp sum: %w", err)
}
}
return privateSums, nil
}
// CountVisitsPerCertainDurationScenario loads the input file and calculates the non-anonymized
// and anonymized count of visits per certain duration during the week.
// Assumes that a visitor may enter the restaurant at most once per day,
// but may enter multiple times per week.
// Uses dpagg.BoundedSumInt64 for calculating anonymized counts because the visitor can
// contribute to the same duration multiple times and therefore dpagg.Count can't be used
// for this.
type CountVisitsPerCertainDurationScenario struct{}
// Calculates the total count of visits for each exixting visit duration during the week.
// Returns the map that maps a duration to a raw count of visits for this duration.
func (sc *CountVisitsPerCertainDurationScenario) getNonPrivateResults(weekVisits []Visit) map[int64]int64 {
counts := make(map[int64]int64)
for _, visit := range weekVisits {
counts[roundMinutes(visit.MinutesSpent, 10)]++
}
return counts
}
// Calculates the total anonymized count of visits for each existing visit duration during the week and
// eliminate the durations of time whereby too few visitors spent time at the restaurant.
// Returns the map that maps a duration to an anonymized count of visits for this duration.
func (sc *CountVisitsPerCertainDurationScenario) getPrivateResults(weekVisits []Visit) (map[int64]int64, error) {
// Pre-process the data set by limiting the number of visits to maxThreeVisitsPerWeek
// per VisitorID.
boundedVisits := boundVisits(weekVisits, maxThreeVisitsPerWeek)
durationToSelectPartition := make(map[int64]*dpagg.PreAggSelectPartition)
durationToBoundedSum := make(map[int64]*dpagg.BoundedSumInt64)
// Pre-aggregate data, since a visitor can contribute multiple times to a single partition.
visitorIDToDurationToVisitCount := make(map[int64]map[int64]int64)
for _, visit := range boundedVisits {
durationToVisitCount, ok := visitorIDToDurationToVisitCount[visit.VisitorID]
if !ok {
durationToVisitCount = make(map[int64]int64)
visitorIDToDurationToVisitCount[visit.VisitorID] = durationToVisitCount
}
duration := roundMinutes(visit.MinutesSpent, 10)
durationToVisitCount[duration]++
var err error
_, ok = durationToSelectPartition[duration]
if !ok {
// Recall that each possible visit duration may be represented by a
// partition. Here, we construct DP PreAggSelectPartition objects for each
// such duration. PreAggSelectPartition decides whether there are enough
// visitors in a partition to warrant keeping it, or if there were too few
// visitors who stayed for that duration, meaning the partition should be
// dropped.
//
// We use epsilon = log(3) / 2 in this example,
// because we must split epsilon between all the functions that apply differential privacy,
// which, in this case, is 2 functions: BoundedSumInt64 and PreAggSelectPartition.
durationToSelectPartition[duration], err = dpagg.NewPreAggSelectPartition(&dpagg.PreAggSelectPartitionOptions{
Epsilon: ln3 / 2,
Delta: 0.02,
MaxPartitionsContributed: maxThreeVisitsPerWeek,
})
if err != nil {
return nil, fmt.Errorf("couldn't initialize PreAggSelectPartition for duration %d: %w", duration, err)
}
// Construct dpagg.BoundedSumInt64 objects which will be used to calculate DP
// counts with multiple contributions from a single privacy unit (visitor).
// One dpagg.BoundedSumInt64 is created for every duration.
// We use epsilon = log(3) / 2 in this example,
// because we must split epsilon between all the functions that apply differential privacy,
// which, in this case, is 2 functions: BoundedSumInt64 and PreAggSelectPartition.
durationToBoundedSum[duration], err = dpagg.NewBoundedSumInt64(&dpagg.BoundedSumInt64Options{
Epsilon: ln3 / 2,
MaxPartitionsContributed: maxThreeVisitsPerWeek,
Lower: 0,
Upper: maxThreeVisitsPerWeek,
Noise: noise.Laplace(),
})
if err != nil {
return nil, fmt.Errorf("couldn't initialize sum for duration %d: %w", duration, err)
}
}
}
for _, visits := range visitorIDToDurationToVisitCount {
for duration, totalVisits := range visits {
durationToBoundedSum[duration].Add(totalVisits)
// Count distinct visitors for each duration.
durationToSelectPartition[duration].Increment()
}
}
privateSums := make(map[int64]int64)
for duration, boundedSum := range durationToBoundedSum {
// Pre-aggregation partition selection.
// If there are enough visitors within this duration,
// then it will appear in the result statistics table.
// Otherwise, the duration's partition is simply dropped and excluded from the result.
shouldKeepPartition, err := durationToSelectPartition[duration].ShouldKeepPartition()
if err != nil {
return nil, fmt.Errorf("couldn't compute shouldKeepPartition: %w", err)
}
if shouldKeepPartition {
privateSums[duration], err = boundedSum.Result()
if err != nil {
return nil, fmt.Errorf("couldn't compute dp sum: %w", err)
}
}
}
return privateSums, nil
}
// Round up to the next 10-minute mark based on a divider
// e.g divider = 10 then
// for minutes = 1 => 10
// for minutes = 10 => 10
// for minutes = 16 => 20
// for minutes = 20 => 20
// etc
func roundMinutes(minutes int64, divider int64) int64 {
quotient := minutes / divider
if minutes%divider > 0 {
quotient++
}
return quotient * divider
}
func boundVisits(initialVisits []Visit, maxVisitsPerWeek int64) []Visit {
boundedVisits := make([]Visit, 0)
// Shuffle all visits.
rand.Seed(time.Now().UnixNano())
rand.Shuffle(len(initialVisits), func(i, j int) { initialVisits[i], initialVisits[j] = initialVisits[j], initialVisits[i] })
// Go through the unordered collection of visits, but only add, at most,
// maxVisitsPerWeek visits per VisitorID to the final result, and
// discard all additional visits.
visitorIDToVisitCount := make(map[int64]int64)
for _, visit := range initialVisits {
val, ok := visitorIDToVisitCount[visit.VisitorID]
if !ok {
val = 1
} else {
val++
}
if val <= maxVisitsPerWeek {
visitorIDToVisitCount[visit.VisitorID] = val
boundedVisits = append(boundedVisits, visit)
}
}
return boundedVisits
}