| #!/usr/bin/env python |
| # Copyright 2016 The Fuchsia Authors |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """Runs all of the randomizers. This file also contains utilities common to |
| all randomizers. |
| """ |
| |
| import csv |
| import logging |
| import os |
| import sys |
| |
# Directory that contains this file.
THIS_DIR = os.path.dirname(__file__)
# Parent directory of THIS_DIR. It is placed first on sys.path so that the
# imports that follow in this file (third_party.*, algorithms.*, utils.*)
# are looked up relative to it.
ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.path.pardir))
sys.path.insert(0, ROOT_DIR)

# logging.getLogger() is called without a name, so this is the root logger.
_logger = logging.getLogger()
| |
| import third_party.rappor.client.python.rappor as rappor |
| |
| try: |
| import third_party.fastrand.fastrand as fastrand |
| except ImportError: |
| fastrand = None |
| |
| import algorithms.forculus.forculus as forculus |
| import city_randomizer |
| import help_query_randomizer |
| import hour_randomizer |
| import module_name_randomizer |
| import url_randomizer |
| import utils.data as data |
| import utils.file_util as file_util |
| import utils.public_key_crypto_helper as crypto_helper |
| |
# Should public key encryption be used for communication between the
# Randomizers and the Analyzers via the Shufflers?
# This is a module-level flag: runAllRandomizers() overwrites it with its
# |use_public_key_encryption| argument, and randomizeUsingRappor() and
# randomizeUsingForculus() read it to decide whether to encrypt their output.
_use_public_key_encryption=False
| |
def initializeFastrand():
    '''Chooses the randomness class that is handed to the RAPPOR encoders.

    Returns: fastrand.FastIrrRand if the optional fastrand extension was
      imported successfully, otherwise rappor.SecureIrrRand. The choice is
      logged either way.
    '''
    if not fastrand:
        # The optional import at the top of this file failed.
        _logger.warning('fastrand module not importable; see README for build '
                        'instructions. Falling back to simple randomness.')
        return rappor.SecureIrrRand

    # Fastrand module written in C++ speeds up random number generation.
    _logger.info('Using fastrand extension')
    # NOTE: This doesn't take 'rand'. It's seeded in C with srand().
    return fastrand.FastIrrRand
| |
def readRapporConfigParamsFromFile(config_file):
    '''Loads the RAPPOR parameters stored in a CSV config file.

    Args:
      config_file {string}: The simple name of the RAPPOR config file. It is
        opened relative to file_util.CONFIG_DIR.

    Returns: The object produced by rappor.Params.from_csv for that file.
    '''
    config_handle = file_util.openFileForReading(config_file,
                                                 file_util.CONFIG_DIR)
    with config_handle as cf:
        params = rappor.Params.from_csv(cf)
    return params
| |
def encodeWithBloomFilter(user_id, data, config_params, irr_rand):
    '''Encodes |data| with rappor.Encoder.encode, placing the user in a cohort
    derived from |user_id|.

    Args:
      user_id {Int}: A unique value for each input entry. It selects the
        cohort (user_id modulo config_params.num_cohorts) and, converted to a
        string, is given to the encoder as the per-client secret.

      data {string}: Data to be encoded.

      config_params: RAPPOR configuration values; must expose |num_cohorts|.

      irr_rand {function}: A function that complies with the signature of
        rappor.SecureIrrRand. It is called with |config_params|.

    Returns: A (cohort, encoded data) tuple.
    '''
    # The cohort is a pure function of the user id.
    cohort = user_id % config_params.num_cohorts

    # For the current prototype, clients only report one value, so RAPPOR
    # protection across multiple values of one client is not demonstrated.
    client_secret = str(user_id)
    encoder = rappor.Encoder(config_params, cohort, client_secret,
                             irr_rand(config_params))

    return cohort, encoder.encode(data)
| |
def encodeWithoutBloomFilter(user_id, data, config_params, irr_rand):
    '''Encodes |data| with basic RAPPOR (rappor.Encoder.encode_bits, no bloom
    filters), always using cohort 0.

    Args:
      user_id {Int}: A unique value for each input entry. Converted to a
        string, it is given to the encoder as the per-client secret.

      data: The value n to encode. It is encoded as the number 2^n, so it is
        presumably a small non-negative integer (e.g. an hour of the day).

      config_params: RAPPOR configuration values.

      irr_rand {function}: A function that complies with the signature of
        rappor.SecureIrrRand. It is called with |config_params|.

    Returns: A (cohort, encoded data) tuple in which cohort is always 0.
    '''
    # Simple, well-bounded data does not need cohorts: every user shares
    # cohort 0.
    single_cohort = 0
    encoder = rappor.Encoder(config_params, single_cohort, str(user_id),
                             irr_rand(config_params))

    # Represent the value |n| as a bit string that is all zeroes except for a
    # 1 in position n; in other words as the number 2^n.
    one_hot_bits = 2 ** data
    return single_cohort, encoder.encode_bits(one_hot_bits)
| |
def randomizeUsingRappor(entries, param_configs, output_file):
    '''A helper function that may be invoked by individual randomizers.
    It encodes the selected params of every entry using RAPPOR, with or
    without bloom filters, and writes one CSV row per entry to a file to be
    consumed by a shuffler.

    Args:
      entries: A list of input entries to be randomized. Each entry must
        provide a |user_id| attribute and support indexing by param index.

      param_configs: A list of tuples containing a param index into |Entry|
        tuple, a boolean value to specify whether that param supports cohort
        based analysis or not, and the name of the RAPPOR config_file for the
        specified param. For example:
          param_configs = [(1, True, 'rappor_module_name_config.csv')]
        implies that the randomization should be performed on module_names
        using bloom filters and RAPPOR configuration as specified in file:
        <rappor_module_name_config.csv>.

      output_file {string}: The simple name of the CSV file to be written in
        the 'r_to_s' directory.
    '''
    with file_util.openForRandomizerWriting(output_file) as out:
        csv_writer = csv.writer(out)

        # Map each param index to its RAPPOR params together with the format
        # spec (zero-padded binary, num_bloombits wide) used for its reports.
        configs_by_index = {}
        for spec in param_configs:
            params = readRapporConfigParamsFromFile(spec[2])
            configs_by_index[spec[0]] = (params,
                                         '0%ib' % params.num_bloombits)

        # Pick the randomness class (fastrand if available).
        irr_rand = initializeFastrand()

        encrypt = None
        if _use_public_key_encryption:
            encrypt = crypto_helper.CryptoHelper().encryptForSendingToAnalyzer

        for entry in entries:
            # Every param is encoded separately according to its own cohort
            # configuration. The row's cohort field defaults to 0.
            cohort_field = '%d' % 0
            reports = []
            for param_index, use_bloom_filter, _ in param_configs:
                params, report_format = configs_by_index[param_index]
                if use_bloom_filter:
                    cohort, report = encodeWithBloomFilter(
                        entry.user_id, entry[param_index], params, irr_rand)
                    # For now, always use the cohort generated from the bloom
                    # filter, if multiple params are involved.
                    # TODO(ukode): From an API perspective, it might be best
                    # to have (cohort, report) separate for each metric we
                    # report and just treat the cohort as redundant if it's
                    # boolean/basic RAPPOR. If we ever choose to use different
                    # # of cohorts for different metrics, this will be useful
                    # in the future.
                    cohort_field = '%d' % cohort
                else:
                    _, report = encodeWithoutBloomFilter(
                        entry.user_id, entry[param_index], params, irr_rand)
                reports.append(format(report, report_format))

            # Each row has the following syntax:
            #   {cohort, data1_rr, data2_rr, data3_rr, ...}
            # for example, city_rating_randomized output looks like:
            #   {cohort, city_name_rr, rating_rr}
            row = [cohort_field] + reports
            if _use_public_key_encryption:
                # Join the fields into one comma-separated string and encrypt
                # it. The tuple of ciphertext strings that comes back is
                # written as the row instead.
                row = encrypt(",".join(map(str, row)))
            csv_writer.writerow(row)
| |
def randomizeUsingForculus(entries, param_index, config_file, output_file):
    '''A helper function that may be invoked by individual randomizers.
    It feeds one param of every entry to a Forculus inserter, which writes
    to a file to be consumed by a shuffler.

    Args:
      entries: A list of input entries to be randomized.

      param_index {int}: An index into |Entry| tuple that identifies the
        parameter to be randomized.

      config_file {string}: The simple name of the Forculus config file used
        for randomizing the param specified by |param_index|. It is opened
        relative to file_util.CONFIG_DIR.

      output_file {string}: The simple name of the CSV file to be written in
        the 'r_to_s' directory.
    '''
    with file_util.openFileForReading(config_file, file_util.CONFIG_DIR) as cf:
        forculus_config = forculus.Config.from_csv(cf)

    # The extra encryption layer is only used when public key encryption has
    # been switched on; otherwise None is handed to the inserter.
    if _use_public_key_encryption:
        extra_encryption = (
            crypto_helper.CryptoHelper().encryptForSendingToAnalyzer)
    else:
        extra_encryption = None

    with file_util.openForRandomizerWriting(output_file) as out:
        inserter = forculus.ForculusInserter(forculus_config.threshold, out)
        for entry in entries:
            inserter.Insert(entry[param_index],
                            additional_encryption_func=extra_encryption)
| |
def runAllRandomizers(entries, use_public_key_encryption=False):
    '''Runs all of the randomizers on the given list of entries.

    This function does not return anything but it invokes all of the
    randomizers each of which will write a file
    into the "r_to_s" output directory containing the randomizer output.
    A progress message is printed before each randomizer runs.

    Args:
      entries {list of Entry}: The entries to be randomized.
      use_public_key_encryption {boolean}: Should public key encryption be
        used to encrypt communication between the Randomizers and the
        Analyzers via the shufflers?
    '''
    # The helpers randomizeUsingRappor() and randomizeUsingForculus() read
    # this module-level flag, so it is set before any randomizer runs.
    global _use_public_key_encryption
    _use_public_key_encryption = use_public_key_encryption

    # NOTE: print is always called with a single parenthesized string. That
    # form prints the same text whether print is a statement (Python 2) or a
    # function (Python 3).

    # Run the help query randomizer
    print("Running the help-query randomizer...")
    hq_randomizer = help_query_randomizer.HelpQueryRandomizer()
    hq_randomizer.randomize(entries)

    # Run the city randomizer
    print("Running the city randomizer...")
    c_randomizer = city_randomizer.CityRandomizer()
    c_randomizer.randomize(entries)

    # Run the module name randomizer
    print("Running the module name randomizer...")
    mn_randomizer = module_name_randomizer.ModuleNameRandomizer()
    mn_randomizer.randomize(entries)

    # Run the same module name randomizer instance in another mode
    print("Running the module name randomizer for differentially "
          "private release...")
    mn_randomizer.randomize(entries, for_private_release=True)

    # Run the hour randomizer
    print("Running the hour of day randomizer...")
    hr_randomizer = hour_randomizer.HourRandomizer()
    hr_randomizer.randomize(entries)

    # Run the url randomizer
    print("Running the url randomizer...")
    u_randomizer = url_randomizer.UrlRandomizer()
    u_randomizer.randomize(entries)
| |
def readAndRandomize(use_public_key_encryption=False):
    '''Loads the generated input entries and passes them through every
    Randomizer.

    Args:
      use_public_key_encryption {boolean}: Should public key encryption be
        used to encrypt communication between the Randomizers and the
        Analyzers via the shufflers?
    '''
    input_file_name = file_util.GENERATED_INPUT_DATA_FILE_NAME
    generated_entries = data.readEntries(input_file_name)
    runAllRandomizers(generated_entries,
                      use_public_key_encryption=use_public_key_encryption)
| |
def main():
    '''Script entry point: runs all randomizers on the generated input data
    with public key encryption turned off.
    '''
    readAndRandomize(use_public_key_encryption=False)


if __name__ == '__main__':
    main()