blob: 1406a0b17d963f9e1042a0c449acdc5249c4e65c [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2016 The Fuchsia Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" This script reads several csv files containing data to be visualized.
It then uses the Google Data Visualization API to generate a JavaScript
file that contains definitions of DataTables holding the data.
This is used by visualization.html to generate visualizations.
"""
import csv
import os
import sys
THIS_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.path.pardir))
sys.path.insert(0, ROOT_DIR)
import utils.file_util as file_util
import third_party.google_visualization.gviz_api as gviz_api
from randomizers.randomizer import readRapporConfigParamsFromFile
# The javascript variables to write. Note "_sc" refers to the data from
# the "straight-counting pipeline" "_pr_" refers to the version of
# a metric that uses differentially-private release.
USAGE_BY_MODULE_JS_VAR_NAME = 'usage_by_module_data'
USAGE_BY_MODULE_PR_JS_VAR_NAME = 'usage_by_module_pr_data'
USAGE_BY_MODULE_SC_JS_VAR_NAME = 'usage_by_module_data_sc'
USAGE_BY_MODULE_PARAMS_JS_VAR_NAME = 'usage_by_module_params'
USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME = 'usage_by_module_pr_params'
USAGE_BY_CITY_SC_JS_VAR_NAME = 'usage_by_city_data_sc'
USAGE_BY_CITY_JS_VAR_NAME = 'usage_by_city_data'
USAGE_BY_HOUR_SC_JS_VAR_NAME = 'usage_by_hour_data_sc'
USAGE_BY_HOUR_JS_VAR_NAME = 'usage_by_hour_data'
USAGE_BY_HOUR_PARAMS_JS_VAR_NAME = 'usage_by_hour_params'
POPULAR_URLS_JS_VAR_NAME = 'popular_urls_data'
POPULAR_URLS_SC_JS_VAR_NAME = 'popular_urls_data_sc'
POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME = \
'popular_urls_histogram_data_sc'
POPULAR_HELP_QUERIES_JS_VAR_NAME = 'popular_help_queries_data'
POPULAR_HELP_QUERIES_SC_JS_VAR_NAME = 'popular_help_queries_data_sc'
POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME = \
'popular_help_queries_histogram_data_sc'
# The output JavaScript file to be created.
OUTPUT_JS_FILE_NAME = 'data.js'
def buildDataTableJs(data=None, var_name=None, description=None,
columns_order=None, order_by=()):
"""Builds a JavaScript string defining a DataTable containing the given data.
Args:
data: {dictionary}: The data with which to populate the DataTable.
var_name {string}: The name of the JavaScript variable to write.
description {dictionary}: Passed to the constructor of gviz_api.DataTable()
columns_order {tuple of string}: The names of the table columns in the
order they should be written. Optional.
order_by {tuple of string}: Optional. Specify something like ('foo', 'des')
to sort the rows by the 'foo' column in descending order.
Returns:
{string} of the form |var_name|=<json>, where <json> is a json string
defining a data table.
"""
# Load data into a gviz_api.DataTable
data_table = gviz_api.DataTable(description)
data_table.LoadData(data)
json = data_table.ToJSon(columns_order=columns_order,order_by=order_by)
return "%s=%s;" % (var_name, json)
def buildUsageByModuleJsFromRapporOutput(sc_values, rappor_out_file, jsvar,
params_jsvar, config_file):
""" A helper function for buildUsageByModuleJs().
This function will be
invoked twice: once for the metric without differentially-private release
and once for the metric with differentially-private release.
Args:
sc_values: {dictionary} A dictionary of actual values from the straigh-
counting pipeline.
rappor_out_file: {string} The path to the file containing RAPPOR output
data.
jsvar: {string} The name of the Javascript variable to be used for the
data table.
params_jsvar: {string} The name of the Javascript variable to be used for
the RAPPOR parameters.
config_file: {string} The path of the file containing RAPPOR config data.
Returns:
{tuple of two strings} (usage_data, rappor_parameters).
See buildUsageByModuleJs() for details.
"""
# We skip row zero because it is the header row. We are going to visualize
# the data as an interval chart and so we want to compute the high and
# low 95% confidence interval values which we may do using the "std_error"
# column, column 2.
with file_util.openForReading(rappor_out_file) as csvfile:
reader = csv.reader(csvfile)
data = [{"module" : row[0], "estimate": float(row[1]),
"actual" : sc_values.get(row[0], 0),
"low" : float(row[1]) - 1.96 * float(row[2]),
"high": float(row[1]) + 1.96 * float(row[2])}
for row in reader if reader.line_num > 1]
usage_data_js = buildDataTableJs(
data=data,
var_name=jsvar,
description={"module": ("string", "Module"),
"estimate": ("number", "Estimate"),
"actual": ("number", "Actual"),
# The role: 'interval' property is what tells the Google
# Visualization API to draw an interval chart.
"low": ("number", "Low", {'role': 'interval'}),
"high": ("number", "High", {'role': 'interval'})},
columns_order=("module", "estimate", "actual", "low", "high"),
order_by=("estimate", "desc"))
# RAPPOR parameters
rappor_params_js = "{} = {};".format(params_jsvar,
readRapporConfigParamsFromFile(config_file).to_json())
return (usage_data_js, rappor_params_js)
def buildUsageByModuleJs():
""" Builds several strings defining variables used for visualization.
Reads some CSV files containing the usage-by-module data for the straight-
counting pipeline and the Cobalt prototype pipeline respectively and uses them
to build three JavaScript strings defining DataTables containing the data
and two JavaScript strings defining RAPPOR parameters.
Returns:
{tuple of five strings} (sc_string, cobalt_string, cobalt_with_pr_string,
rappor_parameters, rappor_with_pr_parameters). The "_pr_" variables
refer to the version of the RAPPOR metric that uses very weak RAPPOR
parameters but then adds Laplace noise at the end to affect differentially
private release. So "_pr_" for "private release". The first three
strings are of the form <var_name>=<json>, where |json| is a json
string defining a data table. The |var_name|s are respectively
USAGE_BY_MODULE_SC_JS_VAR_NAME, USAGE_BY_MODULE_JS_VAR_NAME, and
USAGE_BY_MODULE_PR_JS_VAR_NAME.
rappor_parameters is a json string containing values for k, h, m, p, q, f.
"""
# straight-counting:
# Read the data from the csv file and put it into a dictionary.
with file_util.openForReading(
file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as csvfile:
reader = csv.reader(csvfile)
# |data| will be used to generate the visualiation data for the
# straight-counting pipeline
data = []
# |values| will be used below to include the actual values along with
# the RAPPOR estimates in the visualization of the Cobalt pipeline.
values = {}
for row in reader:
data.append({"module" : row[0], "count": int(row[1])})
values[row[0]] = int(row[1])
usage_by_module_sc_js = buildDataTableJs(
data=data,
var_name=USAGE_BY_MODULE_SC_JS_VAR_NAME,
description={"module": ("string", "Module"),
"count": ("number", "Count")},
columns_order=("module", "count"),
order_by=("count", "desc"))
# cobalt:
# Here the CSV file is the output of the RAPPOR analyzer.
# We read it and put the data into a dictionary.
usage_by_module_cobalt_js, rappor_params_js = \
buildUsageByModuleJsFromRapporOutput(values,
file_util.MODULE_NAME_ANALYZER_OUTPUT_FILE_NAME,
USAGE_BY_MODULE_JS_VAR_NAME,
USAGE_BY_MODULE_PARAMS_JS_VAR_NAME,
file_util.RAPPOR_MODULE_NAME_CONFIG)
# cobalt with differentially-private release:
usage_by_module_cobalt_with_pr_js, rappor_with_pr_params_js = \
buildUsageByModuleJsFromRapporOutput(values,
file_util.MODULE_NAME_PR_ANALYZER_OUTPUT_FILE_NAME,
USAGE_BY_MODULE_PR_JS_VAR_NAME,
USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME,
file_util.RAPPOR_MODULE_NAME_PR_CONFIG)
return (usage_by_module_sc_js, usage_by_module_cobalt_js,
usage_by_module_cobalt_with_pr_js,
rappor_params_js, rappor_with_pr_params_js)
def buildUsageAndRatingByCityJs():
"""Builds several strings defining variables used for visualization.
Reads a CSV file containing the usage-by-city data and uses it
to build a JavaScript string defining a DataTable containing the data.
Returns:
{string} of the form <var_name>=<json>, where |var_name| is
USAGE_BY_CITY_SC_JS_VAR_NAME and |json| is a json string defining
a data table.
"""
# straight-counting:
# Read the data from the csv file and put it into a dictionary.
with file_util.openForReading(
file_util.USAGE_BY_CITY_CSV_FILE_NAME) as csvfile:
reader = csv.reader(csvfile)
# |data| will be used to generate the visualiation data for the
# straight-counting pipeline
data = []
# |values| will be used below to include the actual values along with
# the RAPPOR estimates in the visualization of the Cobalt pipeline.
values = []
for row in reader:
data.append({"city" : row[0], "usage": int(row[1]),
"rating": float(row[2])})
if int(row[1]) > 50:
values.append({"city" : row[0], "usage": int(row[1]),
"type" : "actual",
"radius_95" : 0,
"rating": float(row[2])})
usage_and_rating_by_city_sc_js = buildDataTableJs(
data=data,
var_name=USAGE_BY_CITY_SC_JS_VAR_NAME,
description = {"city": ("string", "City"),
"rating": ("number", "Rating"),
"usage": ("number", "Usage")},
columns_order=("city", "rating", "usage"),
order_by=("usage", "desc"))
# cobalt:
# Here the CSV file is the output of the RAPPOR analyzer.
# We read it and put the data into a dictionary.
# We skip row zero because it is the header row.
with file_util.openForReading(
file_util.CITY_RATINGS_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
reader = csv.reader(csvfile)
data = [{"city" : row[0], "usage": float(row[1]),
"type" : "estimate",
"radius_95" : 1.96 * float(row[2]),
"rating": float(row[7])}
for row in reader if reader.line_num > 1]
data.extend(values)
usage_and_rating_by_city_cobalt_js = buildDataTableJs(
data=data,
var_name=USAGE_BY_CITY_JS_VAR_NAME,
description={"city": ("string", "City"),
"usage": ("number", "Usage"),
"rating": ("number", "Rating"),
"type" : ("string", "Estimate or Actual"),
"radius_95": ("number", "95% conf. intlv. radius")},
columns_order=("city", "usage", "rating", "type", "radius_95"),
order_by=("estimate", "desc"))
return (usage_and_rating_by_city_sc_js, usage_and_rating_by_city_cobalt_js)
def buildUsageByHourJs():
"""Builds several strings defining variables used for visualization.
Reads two CSV files containing the usage-by-hour data for the
straight-counting pipeline and the Cobalt prototype pipeline respectively and
uses them to build two JavaScript strings defining DataTables containing the
data and one string describing basic RAPPOR parameters.
Returns:
{tuple of two strings} (sc_string, cobalt_string, params_string). The first
two strings are of the form <var_name>=<json>, where |json| is a json string
defining a data table. The |var_name|s are respectively
USAGE_BY_HOUR_SC_JS_VAR_NAME and USAGE_BY_HOUR_JS_VAR_NAME.
params_string is a json string containing RAPPOR parameters.
"""
# straight-counting:
# Read the data from the csv file and put it into a dictionary.
with file_util.openForReading(
file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as csvfile:
reader = csv.reader(csvfile)
# |data| will be used to generate the visualiation data for the
# straight-counting pipeline
data = []
# |values| will be used below to include the actual values along with
# the RAPPOR estimates in the visualization of the Cobalt pipeline.
values = []
hour = 0
for row in reader:
data.append({"hour" : hour, "usage": int(row[0])})
values.append(int(row[0]))
hour += 1
usage_by_hour_sc_js = buildDataTableJs(
data=data,
var_name=USAGE_BY_HOUR_SC_JS_VAR_NAME,
description = {"hour": ("number", "Hour of Day"),
"usage": ("number", "Usage")},
columns_order=("hour", "usage"))
# cobalt:
# Here the CSV file is the output of the RAPPOR analyzer.
# We read it and put the data into a dictionary.
# We skip row zero because it is the header row. We are going to visualize
# the data as an interval chart and so we want to compute the high and
# low 95% confidence interval values wich we may do using the "std_error"
# column, column 2.
with file_util.openForReading(
file_util.HOUR_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
reader = csv.reader(csvfile)
data = [{"hour" : int(row[0]), "estimate": max(float(row[1]), 0),
"actual": values[int(row[0])],
"low" : max(float(row[1]) - 1.96 * float(row[2]), 0),
"high": float(row[1]) + 1.96 * float(row[2])}
for row in reader if reader.line_num > 1]
usage_by_hour_cobalt_js = buildDataTableJs(
data=data,
var_name=USAGE_BY_HOUR_JS_VAR_NAME,
description={"hour": ("number", "Hour"),
"estimate": ("number", "Estimate"),
"actual": ("number", "Actual"),
# The role: 'interval' property is what tells the Google
# Visualization API to draw an interval chart.
"low": ("number", "Low", {'role': 'interval'}),
"high": ("number", "High", {'role': 'interval'})},
columns_order=("hour", "estimate", "low", "high", "actual"),
order_by=("hour", "asc"))
# RAPPOR parameters
rappor_params_js = "{} = {};".format(
USAGE_BY_HOUR_PARAMS_JS_VAR_NAME,
readRapporConfigParamsFromFile(file_util.RAPPOR_HOUR_CONFIG).to_json())
return (usage_by_hour_sc_js, usage_by_hour_cobalt_js, rappor_params_js)
def buildItemAndCountJs(filename, varname1, varname2, item_column,
item_description):
"""Builds several strings defining variables used for visualization.
Reads a CSV file containing two columns, an item column and a
count column, and and uses the data to build two JavaScript strings defining
DataTables containing the data. The two DataTables will be the same except
for the order of the columns: The first DataTable will have the count
column first and the second DataTable will have the item column first.
Args:
filename: {string} The full path of the CSV file to read.
varname1: {string} The name of the first javascript variable to generate.
varname2: {string} The name of the second javascript variable to generate.
If this is None then the second returned string will
also be None.
item_column: {string} The name of the item column to use in the generated
JS.
item_description: The description string to use in the generated JS.
Returns:
{tuple of two string} of the form <varname>=<json>, where <json> is a json
string defining a data table. In the first returned string <varname> will
be |varname1| and the "count" column will come first in the DataTable.
In the second returned string <varname> will be |varname1| and |item_column|
will come first in the DataTable.
"""
with file_util.openForReading(filename) as csvfile:
reader = csv.reader(csvfile)
data = [{item_column : row[0], "count": int(row[1])} for row in reader]
count_first_string = buildDataTableJs(
data=data,
var_name=varname1,
description={item_column: ("string", item_description),
"count": ("number", "Count")},
columns_order=("count", item_column),
order_by=(("count", "desc"), item_column))
item_first_string = None
if varname2 is not None:
item_first_string = buildDataTableJs(
data=data,
var_name=varname2,
description={item_column: ("string", item_description),
"count": ("number", "Count")},
columns_order=(item_column, "count"),
order_by=(("count", "desc"), item_column))
return (count_first_string, item_first_string)
def buildPopularUrlsJs():
"""Builds several strings defining variables used for visualization.
Reads two CSV files containing the popular URL data for the straight-
counting pipeline and the Cobalt prototype pipeline respectively and uses them
to build three JavaScript strings defining DataTables containing the data.
Returns:
{tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
Each of the three strings is of the form <var_name>=<json>, where |json|
is a json string defining a data table. The |var_name|s are respectively
POPULAR_URLS_SC_JS_VAR_NAME,
POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
POPULAR_URLS_JS_VAR_NAME.
"""
# straight-counting
popular_urls_sc_js, popular_urls_histogram_sc_js = buildItemAndCountJs(
file_util.POPULAR_URLS_CSV_FILE_NAME, POPULAR_URLS_SC_JS_VAR_NAME,
POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
"url", "URL")
# Cobalt.
popular_urls_js, _ = buildItemAndCountJs(
file_util.URL_ANALYZER_OUTPUT_FILE_NAME, POPULAR_URLS_JS_VAR_NAME, None,
"url", "URL")
return (popular_urls_sc_js, popular_urls_histogram_sc_js, popular_urls_js)
def buildPopularHelpQueriesJs():
"""Builds several strings defining variables used for visualization.
Reads two CSV files containing the popular help-qury data for the straight-
counting pipeline and the Cobalt prototype pipeline respectively and uses them
to build three JavaScript strings defining DataTables containing the data.
Returns:
{tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
Each of the three strings is of the form <var_name>=<json>, where |json| is
a json string defining a data table. The |var_name|s are respectively
POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
and POPULAR_HELP_QUERIES_JS_VAR_NAME.
"""
# straight-counting, table visualization
popular_help_queries_sc_js, popular_help_queries_histogram_sc_js = \
buildItemAndCountJs(file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME,
POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
"help_query ", "Help Query")
# Cobalt.
popular_help_queries_js, _ = buildItemAndCountJs(
file_util.HELP_QUERY_ANALYZER_OUTPUT_FILE_NAME,
POPULAR_HELP_QUERIES_JS_VAR_NAME, None,
"help_query", "Help Query")
return (popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
popular_help_queries_js)
def main():
print "Generating visualization..."
# Read the input file and build the JavaScript strings to write.
usage_by_module_sc_js, usage_by_module_js, usage_by_module_with_pr_js, \
usage_by_module_params_js, usage_by_module_with_pr_params_js = \
buildUsageByModuleJs()
usage_by_city_js, usage_by_city_sc_js = buildUsageAndRatingByCityJs()
usage_by_hour_sc_js, usage_by_hour_js, usage_by_hour_params_js = \
buildUsageByHourJs()
(popular_urls_sc_js, popular_urls_histogram_sc_js,
popular_urls_js) = buildPopularUrlsJs()
(popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
popular_help_queries_js) = buildPopularHelpQueriesJs()
# Write the output file.
with file_util.openForWriting(OUTPUT_JS_FILE_NAME) as f:
f.write("// This js file is generated by the script "
"generate_data_js.py\n\n")
f.write("%s\n\n" % usage_by_module_sc_js)
f.write("%s\n\n" % usage_by_module_js)
f.write("%s\n\n" % usage_by_module_with_pr_js)
f.write("%s\n\n" % usage_by_module_params_js)
f.write("%s\n\n" % usage_by_module_with_pr_params_js)
f.write("%s\n\n" % usage_by_city_sc_js)
f.write("%s\n\n" % usage_by_city_js)
f.write("%s\n\n" % usage_by_hour_sc_js)
f.write("%s\n\n" % usage_by_hour_js)
f.write("%s\n\n" % usage_by_hour_params_js)
f.write("%s\n\n" % popular_urls_sc_js)
f.write("%s\n\n" % popular_urls_histogram_sc_js)
f.write("%s\n\n" % popular_urls_js)
f.write("%s\n\n" % popular_help_queries_sc_js)
f.write("%s\n\n" % popular_help_queries_histogram_sc_js)
f.write("%s\n\n" % popular_help_queries_js)
f.write("")
print "View this file in your browser:"
print "file://%s" % file_util.VISUALIZATION_FILE
if __name__ == '__main__':
main()