#!/usr/bin/env python
# Copyright 2016 The Fuchsia Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" This script is the first step in the Cobalt prototype. It
generates synthetic data, writes this data to a file called
input_data.csv, then runs the straight counting pipeline on the
data which emits several csv files to the out directory.
"""
import base64
import collections
import csv
import hashlib
import os
import random
import sys
THIS_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.path.pardir))
sys.path.insert(0, ROOT_DIR)
import utils.file_util as file_util
import utils.data as data
# For the names of the modules in our fake data we will use names from a
# list of "girl's names" found on the internet.
from baby_names import BABY_NAMES
# We will use cities from a list of cities found on the internet.
from us_cities import US_CITIES
# We will use proper nouns from a sample list that includes a few special
# characters such as capital letters, single quotes and a period.
from help_query_primary_nouns import HELP_QUERY_PRIMARY_NOUNS
# We will use verbs from a predefined list of commonly used verbs.
from help_query_verbs import HELP_QUERY_VERBS
# We will use a predefined set of secondary nouns commonly used in speech.
from help_query_secondary_nouns import HELP_QUERY_SECONDARY_NOUNS
# We will use a predefined set of most-visited urls to cover the low-entropy
# url use case.
from most_visited_urls import MOST_VISITED_URLS
# Total number of users
# TODO(rudominer) Move to a config file.
NUM_USERS = 1000
# A pair consisting of a usage count and a total rating.
class UsageAndRating:
def __init__(self, rating):
self.num_uses = 1
self.total_rating = rating
def powerRandomInt(max_val):
"""Returns a random integer from the interval [0, max_val],
using a power-law distribution.
The underlying probability distribution is given by:
P(X >= n) = (c/(n+c))^4, for n>=0 an integer, and where we use c=20.
But if X > max_val is generated then max_val is returned.
Assuming max_val is sufficiently large the distribution should look
approximately like the following. We display all values of n for
which P(n) >= 1%
P(0) = 0.177
P(1) = 0.139
P(2) = 0.111
P(3) = 0.089
P(4) = 0.072
P(5) = 0.059
P(6) = 0.049
P(7) = 0.040
P(8) = 0.034
P(9) = 0.028
P(10) = 0.024
P(11) = 0.020
P(12) = 0.017
P(13) = 0.015
P(14) = 0.013
P(15) = 0.011
P(16) = 0.009
  The mean is approximately 6 and the variance is approximately 4.4.
Args:
    max_val {number} A positive number. All returned values will be less than
      or equal to this.
Returns:
{int} A random integer in the range [0, max_val].
"""
x = int(20*random.paretovariate(4) - 20)
# Ensure the value is in the range [0, max_val]
return max(0, min(x, max_val))
def normalRandomInt(max_val, spread, skew=0):
"""Returns a random integer from a normal distribution whose parameters may be
tweaked by setting max_val, spread and skew. The value is clipped to
the range [0, max_val].
Args:
    max_val {number} A positive number. All returned values will be less than
      or equal to this.
spread {float} Should be a value between 0 and 1. The standard deviation of
the normal distribution will be set to this value times max_val.
    skew {float} Should be a value between -1 and 1. The mean of the normal
distribution will be set to max_val * 0.5 * (1 + skew).
Returns:
{int} A random integer in the range [0, max_val].
"""
mu = max_val * 0.5 * (1.0 + skew)
sigma = max_val*spread
x = int(random.normalvariate(mu, sigma))
# Ensure the value is in the range [0, max_val]
return max(0, min(x, max_val))
def generateRandomHelpQuery():
"""Generates a random help query string of the form <noun verb noun> from a
predefined set of words for noun and verb category.
Returns:
{string} A random help query string containing three words separated with a
space.
"""
help_query = ""
  # The |spread| arguments used below were arrived at by experimentation.
# The goal was to get the distribution to be as spread out as possible
# while still having the property that one could see the histogram drop below
# 20 at about n = 45. This means that when capturing the top 50 queries we
# will capture all of the queries that occur at least 20 times. If you
# examine the file popular_help_queries.csv you should see 50 entries
  # and all but the bottom few should have an occurrence count >= 20.
index = normalRandomInt(len(HELP_QUERY_PRIMARY_NOUNS)-1, 0.038)
help_query += HELP_QUERY_PRIMARY_NOUNS[index] + " "
index = normalRandomInt(len(HELP_QUERY_VERBS) - 1, 0.04)
help_query += HELP_QUERY_VERBS[index] + " "
index = normalRandomInt(len(HELP_QUERY_SECONDARY_NOUNS)-1, 0.035)
help_query += HELP_QUERY_SECONDARY_NOUNS[index]
return help_query
def generateHighEntropyUrls():
"""Generates a list of urls that are long and hard to guess such as Google doc
urls. These urls inherently have high entropy guaranteed by the high
randomness in the url string. Also, these urls are assumed to appear
infrequently in the real-world scenarios such as Google doc urls or spam urls
sent in hangout messages.
Returns:
{list of string} A list of 5000 randomly generated urls.
"""
url_base_path = "https://docs.google.com/document/d/"
high_entropy_urls = []
for i in xrange(5000):
    hash_object = hashlib.sha256(str(i))
    high_entropy_urls.append(
        url_base_path + base64.b64encode(hash_object.hexdigest()))
return high_entropy_urls
def generateRandomEntries(num_entries):
"""Generates a random list of Entries.
Args:
num_entries {int} The number of random entries to generate.
Returns:
{list of Entry} A list of random entries of length |num_entries|.
"""
# Generate a list of high-entropy urls.
high_entropy_urls = generateHighEntropyUrls()
  # Set aside 5 of the urls as spammy urls; these will appear 2% of the time
  # in the final output.
spammy_high_entropy_urls = []
for i in xrange(5):
spammy_high_entropy_urls.append(high_entropy_urls.pop())
entries = []
for i in xrange(num_entries):
city_index = powerRandomInt(len(US_CITIES)-1)
city = US_CITIES[city_index]
name_index = powerRandomInt(len(BABY_NAMES)-1)
name = BABY_NAMES[name_index]
    hour = int(random.triangular(0, 23))
# The |rating_skew| and |rating_spread| parameters below are functions
# of |city_index| that were arrived at by experimentation. The goal
# was to have the rating and city be statistically dependent and for
# the conditional distribution of the rating to have the property that
# the mean decreases and the variance increases as the
# city index increases (and therefore as the popularity of the city
# decreases) in such a way that the geo-chart tends to show larger
    # greener circles and smaller reddish circles. We have arranged for the skew
# to be a number in the range [-0.9, 0.9] and the spread to be a number
# in the range [0.3, 1]. The slopes of the linear functions have no
# great explanation--they just seem to give rates of increase
# and decrease that look good.
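    # For example, city_index = 0 gives skew = 0.9 and spread = 0.3 (so the
    # rating has mean 9.5 and sigma 3), while city_index = 20 gives
    # skew = -0.9 and spread = 1.0 (mean 0.5, sigma 10).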
rating_skew = max(-0.9, 0.9 - 0.111 * city_index)
rating_spread = min(1, 0.3 + 0.07 * city_index)
rating = normalRandomInt(10, rating_spread, rating_skew)
    user_id = random.randint(1, NUM_USERS)
# Generate free-form help queries from a list of primary nouns, verbs and
# secondary nouns.
help_query = generateRandomHelpQuery()
# Generate either a low_entropy or a high_entropy url using the following
# random selection:
# - 60% of the time a most-visited URL is used.
# - 38% of the time a uniformly random high-entropy URL is used.
    # - 2% of the time one of the 5 spammy high-entropy URLs is used.
# Generates the next random floating point number in the range [0.0, 1.0).
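    # Concretely: r in (0.4, 1.0) selects a most-visited url (60%), r in
    # (0.02, 0.4] selects a uniformly random high-entropy url (38%), and
    # r in [0.0, 0.02] selects one of the spammy urls (2%).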
r = random.random()
if r > 0.4:
url_index = powerRandomInt(len(MOST_VISITED_URLS)-1)
url = MOST_VISITED_URLS[url_index]
elif (0.02 < r <= 0.4):
url_index = random.randint(0, len(high_entropy_urls)-1)
url = high_entropy_urls[url_index]
else:
url_index = random.randint(0, len(spammy_high_entropy_urls)-1)
url = spammy_high_entropy_urls[url_index]
    entries.append(
        data.Entry(user_id, name, city, hour, rating, help_query, url))
return entries
class Accumulator:
"""Accumulates the randomly produced entries and aggregates stats about them.
"""
  def __init__(self):
    # A map from city name to UsageAndRating.
    self.usage_and_rating_by_city = {}
    # A counter used to count occurrences of each module seen.
    self.usage_by_module = collections.Counter()
    # A list of 24 singleton lists of usage counts, one per hour of the day.
    self.usage_by_hour = [[0] for i in xrange(24)]
    # A counter used to count occurrences of each help query.
    self.popular_help_query = collections.Counter()
    # A counter used to count occurrences of each url.
    self.popular_url = collections.Counter()
  def addEntry(self, entry):
    self.usage_by_module[entry.name] += 1
    self.usage_by_hour[entry.hour][0] += 1
    self.popular_help_query[entry.help_query] += 1
    self.popular_url[entry.url] += 1
    if entry.city in self.usage_and_rating_by_city:
      self.usage_and_rating_by_city[entry.city].num_uses += 1
      self.usage_and_rating_by_city[entry.city].total_rating += entry.rating
    else:
      self.usage_and_rating_by_city[entry.city] = UsageAndRating(entry.rating)
def main():
# Generate the synthetic input data.
# TODO(rudominer) Move '10000' to a config file.
print "Generating 10,000 random entries..."
entries = generateRandomEntries(10000)
# Write the synthetic input data to a file for consumption by the
# Cobalt prototype.
data.writeEntries(entries, file_util.GENERATED_INPUT_DATA_FILE_NAME)
  # Start the straight-counting pipeline. We don't bother reading back the
  # input file that we just wrote; we use the data that is already in memory
  # in |entries|.
print "Running the straight-counting pipeline..."
accumulator = Accumulator()
for entry in entries:
accumulator.addEntry(entry)
with file_util.openForWriting(file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as f:
writer = csv.writer(f)
writer.writerows(accumulator.usage_by_hour)
with file_util.openForWriting(file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as f:
writer = csv.writer(f)
for name in accumulator.usage_by_module:
writer.writerow([name, accumulator.usage_by_module[name]])
with file_util.openForWriting(
file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME) as f:
writer = csv.writer(f)
writer.writerows(accumulator.popular_help_query.most_common(50))
with file_util.openForWriting(
file_util.POPULAR_URLS_CSV_FILE_NAME) as f:
writer = csv.writer(f)
writer.writerows(accumulator.popular_url.most_common(50))
with file_util.openForWriting(file_util.USAGE_BY_CITY_CSV_FILE_NAME) as f:
writer = csv.writer(f)
for city in accumulator.usage_and_rating_by_city:
num_uses = accumulator.usage_and_rating_by_city[city].num_uses
avg_rating = (accumulator.usage_and_rating_by_city[city].total_rating /
float(num_uses))
writer.writerow([city, num_uses, avg_rating])
if __name__ == '__main__':
main()