prototype/visualization/generate_data_js.py - cobalt - Git at Google

 #!/usr/bin/env python
 # Copyright 2016 The Fuchsia Authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """ This script reads several csv files containing data to be visualized.
     It then uses the Google Data Visualization API to generate a JavaScript
     file that contains definitions of DataTables holding the data.
     This is used by visualization.html to generate visualizations.
 """

 import csv
 import os
 import sys

 THIS_DIR = os.path.dirname(__file__)
 ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.path.pardir))
 sys.path.insert(0, ROOT_DIR)

 import utils.file_util as file_util
 import third_party.google_visualization.gviz_api as gviz_api

 from randomizers.randomizer import readRapporConfigParamsFromFile

 # The javascript variables to write. Note "_sc" refers to the data from
 # the "straight-counting pipeline" "_pr_" refers to the version of
 # a metric that uses differentially-private release.
 USAGE_BY_MODULE_JS_VAR_NAME = 'usage_by_module_data'
 USAGE_BY_MODULE_PR_JS_VAR_NAME = 'usage_by_module_pr_data'
 USAGE_BY_MODULE_SC_JS_VAR_NAME = 'usage_by_module_data_sc'
 USAGE_BY_MODULE_PARAMS_JS_VAR_NAME = 'usage_by_module_params'
 USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME = 'usage_by_module_pr_params'

 USAGE_BY_CITY_SC_JS_VAR_NAME = 'usage_by_city_data_sc'
 USAGE_BY_CITY_JS_VAR_NAME = 'usage_by_city_data'

 USAGE_BY_HOUR_SC_JS_VAR_NAME = 'usage_by_hour_data_sc'
 USAGE_BY_HOUR_JS_VAR_NAME = 'usage_by_hour_data'
 USAGE_BY_HOUR_PARAMS_JS_VAR_NAME = 'usage_by_hour_params'

 POPULAR_URLS_JS_VAR_NAME = 'popular_urls_data'
 POPULAR_URLS_SC_JS_VAR_NAME = 'popular_urls_data_sc'
 POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME = \
     'popular_urls_histogram_data_sc'

 POPULAR_HELP_QUERIES_JS_VAR_NAME = 'popular_help_queries_data'
 POPULAR_HELP_QUERIES_SC_JS_VAR_NAME = 'popular_help_queries_data_sc'
 POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME = \
     'popular_help_queries_histogram_data_sc'

 # The output JavaScript file to be created.
 OUTPUT_JS_FILE_NAME = 'data.js'

 def buildDataTableJs(data=None, var_name=None, description=None,
     columns_order=None, order_by=()):
   """Builds a JavaScript string defining a DataTable containing the given data.

   Args:
     data: {dictionary}:  The data with which to populate the DataTable.
     var_name {string}: The name of the JavaScript variable to write.
     description {dictionary}: Passed to the constructor of gviz_api.DataTable()
     columns_order {tuple of string}: The names of the table columns in the
       order they should be written. Optional.
     order_by {tuple of string}: Optional. Specify something like ('foo', 'des')
       to sort the rows by the 'foo' column in descending order.

   Returns:
     {string} of the form |var_name|=<json>, where <json> is a json string
     defining a data table.
   """
   # Load data into a gviz_api.DataTable
   data_table = gviz_api.DataTable(description)
   data_table.LoadData(data)
   json = data_table.ToJSon(columns_order=columns_order,order_by=order_by)

   return "%s=%s;" % (var_name, json)

 def buildUsageByModuleJsFromRapporOutput(sc_values, rappor_out_file, jsvar,
                                          params_jsvar, config_file):
   """ A helper function for buildUsageByModuleJs().

   This function will be
   invoked twice: once for the metric without differentially-private release
   and once for the metric with differentially-private release.

   Args:
     sc_values: {dictionary} A dictionary of actual values from the straigh-
     counting pipeline.

     rappor_out_file: {string} The path to the file containing RAPPOR output
     data.

     jsvar: {string} The name of the Javascript variable to be used for the
     data table.

     params_jsvar: {string} The name of the Javascript variable to be used for
     the RAPPOR parameters.

     config_file: {string} The path of the file containing RAPPOR config data.

   Returns:
     {tuple of two strings} (usage_data, rappor_parameters).
     See buildUsageByModuleJs() for details.
   """
   # We skip row zero because it is the header row. We are going to visualize
   # the data as an interval chart and so we want to compute the high and
   # low 95% confidence interval values which we may do using the "std_error"
   # column, column 2.
   with file_util.openForReading(rappor_out_file) as csvfile:
     reader = csv.reader(csvfile)
     data = [{"module" : row[0], "estimate": float(row[1]),
              "actual" : sc_values.get(row[0], 0),
              "low" : float(row[1]) - 1.96 * float(row[2]),
              "high": float(row[1]) + 1.96 * float(row[2])}
         for row in reader if reader.line_num > 1]
   usage_data_js = buildDataTableJs(
       data=data,
       var_name=jsvar,
       description={"module": ("string", "Module"),
                    "estimate": ("number", "Estimate"),
                    "actual": ("number", "Actual"),
                    # The role: 'interval' property is what tells the Google
                    # Visualization API to draw an interval chart.
                    "low": ("number", "Low", {'role': 'interval'}),
                    "high": ("number", "High", {'role': 'interval'})},
       columns_order=("module", "estimate", "actual", "low", "high"),
       order_by=("estimate", "desc"))

   # RAPPOR parameters
   rappor_params_js = "{} = {};".format(params_jsvar,
       readRapporConfigParamsFromFile(config_file).to_json())

   return (usage_data_js, rappor_params_js)


 def buildUsageByModuleJs():
   """ Builds several strings defining variables used for visualization.

   Reads some CSV files containing the usage-by-module data for the straight-
   counting pipeline and the Cobalt prototype pipeline respectively and uses them
   to build three JavaScript strings defining DataTables containing the data
   and two JavaScript strings defining RAPPOR parameters.

   Returns:
     {tuple of five strings} (sc_string, cobalt_string, cobalt_with_pr_string,
     rappor_parameters, rappor_with_pr_parameters). The "_pr_" variables
     refer to the version of the RAPPOR metric that uses very weak RAPPOR
     parameters but then adds Laplace noise at the end to affect differentially
     private release. So "_pr_" for "private release". The first three
     strings are of the form <var_name>=<json>, where |json| is a json
     string defining a data table. The |var_name|s are respectively
     USAGE_BY_MODULE_SC_JS_VAR_NAME, USAGE_BY_MODULE_JS_VAR_NAME, and
     USAGE_BY_MODULE_PR_JS_VAR_NAME.
     rappor_parameters is a json string containing values for k, h, m, p, q, f.
   """
   # straight-counting:
   # Read the data from the csv file and put it into a dictionary.
   with file_util.openForReading(
       file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as csvfile:
     reader = csv.reader(csvfile)
     # |data| will be used to generate the visualiation data for the
     # straight-counting pipeline
     data = []
     # |values| will be used below to include the actual values along with
     # the RAPPOR estimates in the visualization of the Cobalt pipeline.
     values = {}
     for row in reader:
       data.append({"module" : row[0], "count": int(row[1])})
       values[row[0]] = int(row[1])
   usage_by_module_sc_js = buildDataTableJs(
       data=data,
       var_name=USAGE_BY_MODULE_SC_JS_VAR_NAME,
       description={"module": ("string", "Module"),
                    "count": ("number", "Count")},
       columns_order=("module", "count"),
       order_by=("count", "desc"))

   # cobalt:
   # Here the CSV file is the output of the RAPPOR analyzer.
   # We read it and put the data into a dictionary.
   usage_by_module_cobalt_js, rappor_params_js = \
       buildUsageByModuleJsFromRapporOutput(values,
           file_util.MODULE_NAME_ANALYZER_OUTPUT_FILE_NAME,
           USAGE_BY_MODULE_JS_VAR_NAME,
           USAGE_BY_MODULE_PARAMS_JS_VAR_NAME,
           file_util.RAPPOR_MODULE_NAME_CONFIG)

   # cobalt with differentially-private release:
   usage_by_module_cobalt_with_pr_js, rappor_with_pr_params_js = \
       buildUsageByModuleJsFromRapporOutput(values,
           file_util.MODULE_NAME_PR_ANALYZER_OUTPUT_FILE_NAME,
           USAGE_BY_MODULE_PR_JS_VAR_NAME,
           USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME,
           file_util.RAPPOR_MODULE_NAME_PR_CONFIG)

   return (usage_by_module_sc_js, usage_by_module_cobalt_js,
           usage_by_module_cobalt_with_pr_js,
           rappor_params_js, rappor_with_pr_params_js)


 def buildUsageAndRatingByCityJs():
   """Builds several strings defining variables used for visualization.

   Reads a CSV file containing the usage-by-city data and uses it
   to build a JavaScript string defining a DataTable containing the data.

   Returns:
     {string} of the form <var_name>=<json>, where |var_name| is
     USAGE_BY_CITY_SC_JS_VAR_NAME and |json| is a json string defining
     a data table.
   """
   # straight-counting:
   # Read the data from the csv file and put it into a dictionary.
   with file_util.openForReading(
       file_util.USAGE_BY_CITY_CSV_FILE_NAME) as csvfile:
     reader = csv.reader(csvfile)
     # |data| will be used to generate the visualiation data for the
     # straight-counting pipeline
     data = []
     # |values| will be used below to include the actual values along with
     # the RAPPOR estimates in the visualization of the Cobalt pipeline.
     values = []
     for row in reader:
       data.append({"city" : row[0], "usage": int(row[1]),
                    "rating": float(row[2])})
       if int(row[1]) > 50:
         values.append({"city" : row[0], "usage": int(row[1]),
                        "type" : "actual",
                        "radius_95" : 0,
                        "rating": float(row[2])})
   usage_and_rating_by_city_sc_js = buildDataTableJs(
       data=data,
       var_name=USAGE_BY_CITY_SC_JS_VAR_NAME,
       description = {"city": ("string", "City"),
                      "rating": ("number", "Rating"),
                      "usage": ("number", "Usage")},
       columns_order=("city", "rating", "usage"),
       order_by=("usage", "desc"))

   # cobalt:
   # Here the CSV file is the output of the RAPPOR analyzer.
   # We read it and put the data into a dictionary.
   # We skip row zero because it is the header row.
   with file_util.openForReading(
       file_util.CITY_RATINGS_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
     reader = csv.reader(csvfile)
     data = [{"city" : row[0], "usage": float(row[1]),
              "type" : "estimate",
              "radius_95" : 1.96 * float(row[2]),
              "rating": float(row[7])}
         for row in reader if reader.line_num > 1]
     data.extend(values)
   usage_and_rating_by_city_cobalt_js = buildDataTableJs(
       data=data,
       var_name=USAGE_BY_CITY_JS_VAR_NAME,
       description={"city": ("string", "City"),
                    "usage": ("number", "Usage"),
                    "rating": ("number", "Rating"),
                    "type" : ("string", "Estimate or Actual"),
                    "radius_95": ("number", "95% conf. intlv. radius")},
       columns_order=("city", "usage", "rating", "type", "radius_95"),
       order_by=("estimate", "desc"))

   return (usage_and_rating_by_city_sc_js, usage_and_rating_by_city_cobalt_js)

 def buildUsageByHourJs():
   """Builds several strings defining variables used for visualization.

   Reads two CSV files containing the usage-by-hour data for the
   straight-counting pipeline and the Cobalt prototype pipeline respectively and
   uses them to build two JavaScript strings defining DataTables containing the
   data and one string describing basic RAPPOR parameters.

   Returns:
     {tuple of two strings} (sc_string, cobalt_string, params_string). The first
     two strings are of the form <var_name>=<json>, where |json| is a json string
     defining a data table. The |var_name|s are respectively
     USAGE_BY_HOUR_SC_JS_VAR_NAME and USAGE_BY_HOUR_JS_VAR_NAME.
     params_string is a json string containing RAPPOR parameters.
   """
   # straight-counting:
   # Read the data from the csv file and put it into a dictionary.
   with file_util.openForReading(
       file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as csvfile:
     reader = csv.reader(csvfile)
     # |data| will be used to generate the visualiation data for the
     # straight-counting pipeline
     data = []
     # |values| will be used below to include the actual values along with
     # the RAPPOR estimates in the visualization of the Cobalt pipeline.
     values = []
     hour = 0
     for row in reader:
       data.append({"hour" : hour, "usage": int(row[0])})
       values.append(int(row[0]))
       hour += 1
   usage_by_hour_sc_js = buildDataTableJs(
       data=data,
       var_name=USAGE_BY_HOUR_SC_JS_VAR_NAME,
       description = {"hour": ("number", "Hour of Day"),
                      "usage": ("number", "Usage")},
       columns_order=("hour", "usage"))

   # cobalt:
   # Here the CSV file is the output of the RAPPOR analyzer.
   # We read it and put the data into a dictionary.
   # We skip row zero because it is the header row. We are going to visualize
   # the data as an interval chart and so we want to compute the high and
   # low 95% confidence interval values wich we may do using the "std_error"
   # column, column 2.
   with file_util.openForReading(
       file_util.HOUR_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
     reader = csv.reader(csvfile)
     data = [{"hour" : int(row[0]), "estimate": max(float(row[1]), 0),
              "actual": values[int(row[0])],
              "low" : max(float(row[1])  - 1.96 * float(row[2]), 0),
              "high": float(row[1]) + 1.96 * float(row[2])}
         for row in reader if reader.line_num > 1]
   usage_by_hour_cobalt_js = buildDataTableJs(
       data=data,
       var_name=USAGE_BY_HOUR_JS_VAR_NAME,
       description={"hour": ("number", "Hour"),
                    "estimate": ("number", "Estimate"),
                    "actual": ("number", "Actual"),
                    # The role: 'interval' property is what tells the Google
                    # Visualization API to draw an interval chart.
                    "low": ("number", "Low", {'role': 'interval'}),
                    "high": ("number", "High", {'role': 'interval'})},
       columns_order=("hour", "estimate", "low", "high", "actual"),
       order_by=("hour", "asc"))

   # RAPPOR parameters
   rappor_params_js = "{} = {};".format(
       USAGE_BY_HOUR_PARAMS_JS_VAR_NAME,
       readRapporConfigParamsFromFile(file_util.RAPPOR_HOUR_CONFIG).to_json())

   return (usage_by_hour_sc_js, usage_by_hour_cobalt_js, rappor_params_js)

 def buildItemAndCountJs(filename, varname1, varname2, item_column,
                         item_description):
   """Builds several strings defining variables used for visualization.

   Reads a CSV file containing two columns, an item column and a
   count column, and and uses the data to build two JavaScript strings defining
   DataTables containing the data. The two DataTables will be the same except
   for the order of the columns: The first DataTable will have the count
   column first and the second DataTable will have the item column first.

   Args:
     filename: {string} The full path of the CSV file to read.
     varname1: {string} The name of the first javascript variable to generate.
     varname2: {string} The name of the second javascript variable to generate.
                        If this is None then the second returned string will
                        also be None.
     item_column: {string} The name of the item column to use in the generated
                           JS.
     item_description: The description string to use in the generated JS.

   Returns:
     {tuple of two string} of the form <varname>=<json>, where <json> is a json
     string defining a data table. In the first returned string <varname> will
     be |varname1| and the "count" column will come first in the DataTable.
     In the second returned string <varname> will be |varname1| and |item_column|
     will come first in the DataTable.
   """

   with file_util.openForReading(filename) as csvfile:
     reader = csv.reader(csvfile)
     data = [{item_column : row[0], "count": int(row[1])} for row in reader]
   count_first_string = buildDataTableJs(
       data=data,
       var_name=varname1,
       description={item_column: ("string", item_description),
                    "count": ("number", "Count")},
       columns_order=("count", item_column),
       order_by=(("count", "desc"), item_column))
   item_first_string = None
   if varname2 is not None:
     item_first_string = buildDataTableJs(
         data=data,
         var_name=varname2,
         description={item_column: ("string", item_description),
                     "count": ("number", "Count")},
         columns_order=(item_column, "count"),
         order_by=(("count", "desc"), item_column))
   return (count_first_string, item_first_string)

 def buildPopularUrlsJs():
   """Builds several strings defining variables used for visualization.

   Reads two CSV files containing the popular URL data for the straight-
   counting pipeline and the Cobalt prototype pipeline respectively and uses them
   to build three JavaScript strings defining DataTables containing the data.

  Returns:
     {tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
     Each of the three strings is of the form <var_name>=<json>, where |json|
     is a json string defining a data table. The |var_name|s are respectively
     POPULAR_URLS_SC_JS_VAR_NAME,
     POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
     POPULAR_URLS_JS_VAR_NAME.
   """
   # straight-counting
   popular_urls_sc_js, popular_urls_histogram_sc_js = buildItemAndCountJs(
       file_util.POPULAR_URLS_CSV_FILE_NAME, POPULAR_URLS_SC_JS_VAR_NAME,
       POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
       "url", "URL")

   # Cobalt.
   popular_urls_js, _ = buildItemAndCountJs(
     file_util.URL_ANALYZER_OUTPUT_FILE_NAME, POPULAR_URLS_JS_VAR_NAME, None,
     "url", "URL")

   return (popular_urls_sc_js, popular_urls_histogram_sc_js, popular_urls_js)

 def buildPopularHelpQueriesJs():
   """Builds several strings defining variables used for visualization.

   Reads two CSV files containing the popular help-qury data for the straight-
   counting pipeline and the Cobalt prototype pipeline respectively and uses them
   to build three JavaScript strings defining DataTables containing the data.

  Returns:
     {tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
     Each of the three strings is of the form <var_name>=<json>, where |json| is
     a json string defining a data table. The |var_name|s are respectively
     POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
     POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
     and POPULAR_HELP_QUERIES_JS_VAR_NAME.
   """
   # straight-counting, table visualization
   popular_help_queries_sc_js, popular_help_queries_histogram_sc_js = \
       buildItemAndCountJs(file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME,
                           POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
                           POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
                           "help_query ", "Help Query")

   # Cobalt.
   popular_help_queries_js, _ = buildItemAndCountJs(
     file_util.HELP_QUERY_ANALYZER_OUTPUT_FILE_NAME,
     POPULAR_HELP_QUERIES_JS_VAR_NAME, None,
     "help_query", "Help Query")

   return (popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
       popular_help_queries_js)

 def main():
   print "Generating visualization..."

   # Read the input file and build the JavaScript strings to write.
   usage_by_module_sc_js, usage_by_module_js, usage_by_module_with_pr_js, \
       usage_by_module_params_js, usage_by_module_with_pr_params_js = \
           buildUsageByModuleJs()
   usage_by_city_js, usage_by_city_sc_js = buildUsageAndRatingByCityJs()
   usage_by_hour_sc_js, usage_by_hour_js, usage_by_hour_params_js = \
       buildUsageByHourJs()
   (popular_urls_sc_js, popular_urls_histogram_sc_js,
    popular_urls_js) = buildPopularUrlsJs()
   (popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
       popular_help_queries_js) = buildPopularHelpQueriesJs()

   # Write the output file.
   with file_util.openForWriting(OUTPUT_JS_FILE_NAME) as f:
     f.write("// This js file is generated by the script "
             "generate_data_js.py\n\n")
     f.write("%s\n\n" % usage_by_module_sc_js)
     f.write("%s\n\n" % usage_by_module_js)
     f.write("%s\n\n" % usage_by_module_with_pr_js)
     f.write("%s\n\n" % usage_by_module_params_js)
     f.write("%s\n\n" % usage_by_module_with_pr_params_js)

     f.write("%s\n\n" % usage_by_city_sc_js)
     f.write("%s\n\n" % usage_by_city_js)

     f.write("%s\n\n" % usage_by_hour_sc_js)
     f.write("%s\n\n" % usage_by_hour_js)
     f.write("%s\n\n" % usage_by_hour_params_js)

     f.write("%s\n\n" % popular_urls_sc_js)
     f.write("%s\n\n" % popular_urls_histogram_sc_js)
     f.write("%s\n\n" % popular_urls_js)

     f.write("%s\n\n" % popular_help_queries_sc_js)
     f.write("%s\n\n" % popular_help_queries_histogram_sc_js)
     f.write("%s\n\n" % popular_help_queries_js)
     f.write("")

   print "View this file in your browser:"
   print "file://%s" % file_util.VISUALIZATION_FILE

 if __name__ == '__main__':
   main()
	#!/usr/bin/env python
	# Copyright 2016 The Fuchsia Authors
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	""" This script reads several csv files containing data to be visualized.
	It then uses the Google Data Visualization API to generate a JavaScript
	file that contains definitions of DataTables holding the data.
	This is used by visualization.html to generate visualizations.
	"""

	import csv
	import os
	import sys

	THIS_DIR = os.path.dirname(__file__)
	ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.path.pardir))
	sys.path.insert(0, ROOT_DIR)

	import utils.file_util as file_util
	import third_party.google_visualization.gviz_api as gviz_api

	from randomizers.randomizer import readRapporConfigParamsFromFile

	# The javascript variables to write. Note "_sc" refers to the data from
	# the "straight-counting pipeline" "_pr_" refers to the version of
	# a metric that uses differentially-private release.
	USAGE_BY_MODULE_JS_VAR_NAME = 'usage_by_module_data'
	USAGE_BY_MODULE_PR_JS_VAR_NAME = 'usage_by_module_pr_data'
	USAGE_BY_MODULE_SC_JS_VAR_NAME = 'usage_by_module_data_sc'
	USAGE_BY_MODULE_PARAMS_JS_VAR_NAME = 'usage_by_module_params'
	USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME = 'usage_by_module_pr_params'

	USAGE_BY_CITY_SC_JS_VAR_NAME = 'usage_by_city_data_sc'
	USAGE_BY_CITY_JS_VAR_NAME = 'usage_by_city_data'

	USAGE_BY_HOUR_SC_JS_VAR_NAME = 'usage_by_hour_data_sc'
	USAGE_BY_HOUR_JS_VAR_NAME = 'usage_by_hour_data'
	USAGE_BY_HOUR_PARAMS_JS_VAR_NAME = 'usage_by_hour_params'

	POPULAR_URLS_JS_VAR_NAME = 'popular_urls_data'
	POPULAR_URLS_SC_JS_VAR_NAME = 'popular_urls_data_sc'
	POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME = \
	'popular_urls_histogram_data_sc'

	POPULAR_HELP_QUERIES_JS_VAR_NAME = 'popular_help_queries_data'
	POPULAR_HELP_QUERIES_SC_JS_VAR_NAME = 'popular_help_queries_data_sc'
	POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME = \
	'popular_help_queries_histogram_data_sc'

	# The output JavaScript file to be created.
	OUTPUT_JS_FILE_NAME = 'data.js'

	def buildDataTableJs(data=None, var_name=None, description=None,
	columns_order=None, order_by=()):
	"""Builds a JavaScript string defining a DataTable containing the given data.

	Args:
	data: {dictionary}: The data with which to populate the DataTable.
	var_name {string}: The name of the JavaScript variable to write.
	description {dictionary}: Passed to the constructor of gviz_api.DataTable()
	columns_order {tuple of string}: The names of the table columns in the
	order they should be written. Optional.
	order_by {tuple of string}: Optional. Specify something like ('foo', 'des')
	to sort the rows by the 'foo' column in descending order.

	Returns:
	{string} of the form \|var_name\|=<json>, where <json> is a json string
	defining a data table.
	"""
	# Load data into a gviz_api.DataTable
	data_table = gviz_api.DataTable(description)
	data_table.LoadData(data)
	json = data_table.ToJSon(columns_order=columns_order,order_by=order_by)

	return "%s=%s;" % (var_name, json)

	def buildUsageByModuleJsFromRapporOutput(sc_values, rappor_out_file, jsvar,
	params_jsvar, config_file):
	""" A helper function for buildUsageByModuleJs().

	This function will be
	invoked twice: once for the metric without differentially-private release
	and once for the metric with differentially-private release.

	Args:
	sc_values: {dictionary} A dictionary of actual values from the straigh-
	counting pipeline.

	rappor_out_file: {string} The path to the file containing RAPPOR output
	data.

	jsvar: {string} The name of the Javascript variable to be used for the
	data table.

	params_jsvar: {string} The name of the Javascript variable to be used for
	the RAPPOR parameters.

	config_file: {string} The path of the file containing RAPPOR config data.

	Returns:
	{tuple of two strings} (usage_data, rappor_parameters).
	See buildUsageByModuleJs() for details.
	"""
	# We skip row zero because it is the header row. We are going to visualize
	# the data as an interval chart and so we want to compute the high and
	# low 95% confidence interval values which we may do using the "std_error"
	# column, column 2.
	with file_util.openForReading(rappor_out_file) as csvfile:
	reader = csv.reader(csvfile)
	data = [{"module" : row[0], "estimate": float(row[1]),
	"actual" : sc_values.get(row[0], 0),
	"low" : float(row[1]) - 1.96 * float(row[2]),
	"high": float(row[1]) + 1.96 * float(row[2])}
	for row in reader if reader.line_num > 1]
	usage_data_js = buildDataTableJs(
	data=data,
	var_name=jsvar,
	description={"module": ("string", "Module"),
	"estimate": ("number", "Estimate"),
	"actual": ("number", "Actual"),
	# The role: 'interval' property is what tells the Google
	# Visualization API to draw an interval chart.
	"low": ("number", "Low", {'role': 'interval'}),
	"high": ("number", "High", {'role': 'interval'})},
	columns_order=("module", "estimate", "actual", "low", "high"),
	order_by=("estimate", "desc"))

	# RAPPOR parameters
	rappor_params_js = "{} = {};".format(params_jsvar,
	readRapporConfigParamsFromFile(config_file).to_json())

	return (usage_data_js, rappor_params_js)


	def buildUsageByModuleJs():
	""" Builds several strings defining variables used for visualization.

	Reads some CSV files containing the usage-by-module data for the straight-
	counting pipeline and the Cobalt prototype pipeline respectively and uses them
	to build three JavaScript strings defining DataTables containing the data
	and two JavaScript strings defining RAPPOR parameters.

	Returns:
	{tuple of five strings} (sc_string, cobalt_string, cobalt_with_pr_string,
	rappor_parameters, rappor_with_pr_parameters). The "_pr_" variables
	refer to the version of the RAPPOR metric that uses very weak RAPPOR
	parameters but then adds Laplace noise at the end to affect differentially
	private release. So "_pr_" for "private release". The first three
	strings are of the form <var_name>=<json>, where \|json\| is a json
	string defining a data table. The \|var_name\|s are respectively
	USAGE_BY_MODULE_SC_JS_VAR_NAME, USAGE_BY_MODULE_JS_VAR_NAME, and
	USAGE_BY_MODULE_PR_JS_VAR_NAME.
	rappor_parameters is a json string containing values for k, h, m, p, q, f.
	"""
	# straight-counting:
	# Read the data from the csv file and put it into a dictionary.
	with file_util.openForReading(
	file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as csvfile:
	reader = csv.reader(csvfile)
	# \|data\| will be used to generate the visualiation data for the
	# straight-counting pipeline
	data = []
	# \|values\| will be used below to include the actual values along with
	# the RAPPOR estimates in the visualization of the Cobalt pipeline.
	values = {}
	for row in reader:
	data.append({"module" : row[0], "count": int(row[1])})
	values[row[0]] = int(row[1])
	usage_by_module_sc_js = buildDataTableJs(
	data=data,
	var_name=USAGE_BY_MODULE_SC_JS_VAR_NAME,
	description={"module": ("string", "Module"),
	"count": ("number", "Count")},
	columns_order=("module", "count"),
	order_by=("count", "desc"))

	# cobalt:
	# Here the CSV file is the output of the RAPPOR analyzer.
	# We read it and put the data into a dictionary.
	usage_by_module_cobalt_js, rappor_params_js = \
	buildUsageByModuleJsFromRapporOutput(values,
	file_util.MODULE_NAME_ANALYZER_OUTPUT_FILE_NAME,
	USAGE_BY_MODULE_JS_VAR_NAME,
	USAGE_BY_MODULE_PARAMS_JS_VAR_NAME,
	file_util.RAPPOR_MODULE_NAME_CONFIG)

	# cobalt with differentially-private release:
	usage_by_module_cobalt_with_pr_js, rappor_with_pr_params_js = \
	buildUsageByModuleJsFromRapporOutput(values,
	file_util.MODULE_NAME_PR_ANALYZER_OUTPUT_FILE_NAME,
	USAGE_BY_MODULE_PR_JS_VAR_NAME,
	USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME,
	file_util.RAPPOR_MODULE_NAME_PR_CONFIG)

	return (usage_by_module_sc_js, usage_by_module_cobalt_js,
	usage_by_module_cobalt_with_pr_js,
	rappor_params_js, rappor_with_pr_params_js)


	def buildUsageAndRatingByCityJs():
	"""Builds several strings defining variables used for visualization.

	Reads a CSV file containing the usage-by-city data and uses it
	to build a JavaScript string defining a DataTable containing the data.

	Returns:
	{string} of the form <var_name>=<json>, where \|var_name\| is
	USAGE_BY_CITY_SC_JS_VAR_NAME and \|json\| is a json string defining
	a data table.
	"""
	# straight-counting:
	# Read the data from the csv file and put it into a dictionary.
	with file_util.openForReading(
	file_util.USAGE_BY_CITY_CSV_FILE_NAME) as csvfile:
	reader = csv.reader(csvfile)
	# \|data\| will be used to generate the visualiation data for the
	# straight-counting pipeline
	data = []
	# \|values\| will be used below to include the actual values along with
	# the RAPPOR estimates in the visualization of the Cobalt pipeline.
	values = []
	for row in reader:
	data.append({"city" : row[0], "usage": int(row[1]),
	"rating": float(row[2])})
	if int(row[1]) > 50:
	values.append({"city" : row[0], "usage": int(row[1]),
	"type" : "actual",
	"radius_95" : 0,
	"rating": float(row[2])})
	usage_and_rating_by_city_sc_js = buildDataTableJs(
	data=data,
	var_name=USAGE_BY_CITY_SC_JS_VAR_NAME,
	description = {"city": ("string", "City"),
	"rating": ("number", "Rating"),
	"usage": ("number", "Usage")},
	columns_order=("city", "rating", "usage"),
	order_by=("usage", "desc"))

	# cobalt:
	# Here the CSV file is the output of the RAPPOR analyzer.
	# We read it and put the data into a dictionary.
	# We skip row zero because it is the header row.
	with file_util.openForReading(
	file_util.CITY_RATINGS_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
	reader = csv.reader(csvfile)
	data = [{"city" : row[0], "usage": float(row[1]),
	"type" : "estimate",
	"radius_95" : 1.96 * float(row[2]),
	"rating": float(row[7])}
	for row in reader if reader.line_num > 1]
	data.extend(values)
	usage_and_rating_by_city_cobalt_js = buildDataTableJs(
	data=data,
	var_name=USAGE_BY_CITY_JS_VAR_NAME,
	description={"city": ("string", "City"),
	"usage": ("number", "Usage"),
	"rating": ("number", "Rating"),
	"type" : ("string", "Estimate or Actual"),
	"radius_95": ("number", "95% conf. intlv. radius")},
	columns_order=("city", "usage", "rating", "type", "radius_95"),
	order_by=("estimate", "desc"))

	return (usage_and_rating_by_city_sc_js, usage_and_rating_by_city_cobalt_js)

	def buildUsageByHourJs():
	"""Builds several strings defining variables used for visualization.

	Reads two CSV files containing the usage-by-hour data for the
	straight-counting pipeline and the Cobalt prototype pipeline respectively and
	uses them to build two JavaScript strings defining DataTables containing the
	data and one string describing basic RAPPOR parameters.

	Returns:
	{tuple of two strings} (sc_string, cobalt_string, params_string). The first
	two strings are of the form <var_name>=<json>, where \|json\| is a json string
	defining a data table. The \|var_name\|s are respectively
	USAGE_BY_HOUR_SC_JS_VAR_NAME and USAGE_BY_HOUR_JS_VAR_NAME.
	params_string is a json string containing RAPPOR parameters.
	"""
	# straight-counting:
	# Read the data from the csv file and put it into a dictionary.
	with file_util.openForReading(
	file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as csvfile:
	reader = csv.reader(csvfile)
	# \|data\| will be used to generate the visualiation data for the
	# straight-counting pipeline
	data = []
	# \|values\| will be used below to include the actual values along with
	# the RAPPOR estimates in the visualization of the Cobalt pipeline.
	values = []
	hour = 0
	for row in reader:
	data.append({"hour" : hour, "usage": int(row[0])})
	values.append(int(row[0]))
	hour += 1
	usage_by_hour_sc_js = buildDataTableJs(
	data=data,
	var_name=USAGE_BY_HOUR_SC_JS_VAR_NAME,
	description = {"hour": ("number", "Hour of Day"),
	"usage": ("number", "Usage")},
	columns_order=("hour", "usage"))

	# cobalt:
	# Here the CSV file is the output of the RAPPOR analyzer.
	# We read it and put the data into a dictionary.
	# We skip row zero because it is the header row. We are going to visualize
	# the data as an interval chart and so we want to compute the high and
	# low 95% confidence interval values wich we may do using the "std_error"
	# column, column 2.
	with file_util.openForReading(
	file_util.HOUR_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
	reader = csv.reader(csvfile)
	data = [{"hour" : int(row[0]), "estimate": max(float(row[1]), 0),
	"actual": values[int(row[0])],
	"low" : max(float(row[1]) - 1.96 * float(row[2]), 0),
	"high": float(row[1]) + 1.96 * float(row[2])}
	for row in reader if reader.line_num > 1]
	usage_by_hour_cobalt_js = buildDataTableJs(
	data=data,
	var_name=USAGE_BY_HOUR_JS_VAR_NAME,
	description={"hour": ("number", "Hour"),
	"estimate": ("number", "Estimate"),
	"actual": ("number", "Actual"),
	# The role: 'interval' property is what tells the Google
	# Visualization API to draw an interval chart.
	"low": ("number", "Low", {'role': 'interval'}),
	"high": ("number", "High", {'role': 'interval'})},
	columns_order=("hour", "estimate", "low", "high", "actual"),
	order_by=("hour", "asc"))

	# RAPPOR parameters
	rappor_params_js = "{} = {};".format(
	USAGE_BY_HOUR_PARAMS_JS_VAR_NAME,
	readRapporConfigParamsFromFile(file_util.RAPPOR_HOUR_CONFIG).to_json())

	return (usage_by_hour_sc_js, usage_by_hour_cobalt_js, rappor_params_js)

	def buildItemAndCountJs(filename, varname1, varname2, item_column,
	item_description):
	"""Builds several strings defining variables used for visualization.

	Reads a CSV file containing two columns, an item column and a
	count column, and and uses the data to build two JavaScript strings defining
	DataTables containing the data. The two DataTables will be the same except
	for the order of the columns: The first DataTable will have the count
	column first and the second DataTable will have the item column first.

	Args:
	filename: {string} The full path of the CSV file to read.
	varname1: {string} The name of the first javascript variable to generate.
	varname2: {string} The name of the second javascript variable to generate.
	If this is None then the second returned string will
	also be None.
	item_column: {string} The name of the item column to use in the generated
	JS.
	item_description: The description string to use in the generated JS.

	Returns:
	{tuple of two string} of the form <varname>=<json>, where <json> is a json
	string defining a data table. In the first returned string <varname> will
	be \|varname1\| and the "count" column will come first in the DataTable.
	In the second returned string <varname> will be \|varname1\| and \|item_column\|
	will come first in the DataTable.
	"""

	with file_util.openForReading(filename) as csvfile:
	reader = csv.reader(csvfile)
	data = [{item_column : row[0], "count": int(row[1])} for row in reader]
	count_first_string = buildDataTableJs(
	data=data,
	var_name=varname1,
	description={item_column: ("string", item_description),
	"count": ("number", "Count")},
	columns_order=("count", item_column),
	order_by=(("count", "desc"), item_column))
	item_first_string = None
	if varname2 is not None:
	item_first_string = buildDataTableJs(
	data=data,
	var_name=varname2,
	description={item_column: ("string", item_description),
	"count": ("number", "Count")},
	columns_order=(item_column, "count"),
	order_by=(("count", "desc"), item_column))
	return (count_first_string, item_first_string)

	def buildPopularUrlsJs():
	"""Builds several strings defining variables used for visualization.

	Reads two CSV files containing the popular URL data for the straight-
	counting pipeline and the Cobalt prototype pipeline respectively and uses them
	to build three JavaScript strings defining DataTables containing the data.

	Returns:
	{tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
	Each of the three strings is of the form <var_name>=<json>, where \|json\|
	is a json string defining a data table. The \|var_name\|s are respectively
	POPULAR_URLS_SC_JS_VAR_NAME,
	POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
	POPULAR_URLS_JS_VAR_NAME.
	"""
	# straight-counting
	popular_urls_sc_js, popular_urls_histogram_sc_js = buildItemAndCountJs(
	file_util.POPULAR_URLS_CSV_FILE_NAME, POPULAR_URLS_SC_JS_VAR_NAME,
	POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
	"url", "URL")

	# Cobalt.
	popular_urls_js, _ = buildItemAndCountJs(
	file_util.URL_ANALYZER_OUTPUT_FILE_NAME, POPULAR_URLS_JS_VAR_NAME, None,
	"url", "URL")

	return (popular_urls_sc_js, popular_urls_histogram_sc_js, popular_urls_js)

	def buildPopularHelpQueriesJs():
	"""Builds several strings defining variables used for visualization.

	Reads two CSV files containing the popular help-qury data for the straight-
	counting pipeline and the Cobalt prototype pipeline respectively and uses them
	to build three JavaScript strings defining DataTables containing the data.

	Returns:
	{tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
	Each of the three strings is of the form <var_name>=<json>, where \|json\| is
	a json string defining a data table. The \|var_name\|s are respectively
	POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
	POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
	and POPULAR_HELP_QUERIES_JS_VAR_NAME.
	"""
	# straight-counting, table visualization
	popular_help_queries_sc_js, popular_help_queries_histogram_sc_js = \
	buildItemAndCountJs(file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME,
	POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
	POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
	"help_query ", "Help Query")

	# Cobalt.
	popular_help_queries_js, _ = buildItemAndCountJs(
	file_util.HELP_QUERY_ANALYZER_OUTPUT_FILE_NAME,
	POPULAR_HELP_QUERIES_JS_VAR_NAME, None,
	"help_query", "Help Query")

	return (popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
	popular_help_queries_js)

	def main():
	print "Generating visualization..."

	# Read the input file and build the JavaScript strings to write.
	usage_by_module_sc_js, usage_by_module_js, usage_by_module_with_pr_js, \
	usage_by_module_params_js, usage_by_module_with_pr_params_js = \
	buildUsageByModuleJs()
	usage_by_city_js, usage_by_city_sc_js = buildUsageAndRatingByCityJs()
	usage_by_hour_sc_js, usage_by_hour_js, usage_by_hour_params_js = \
	buildUsageByHourJs()
	(popular_urls_sc_js, popular_urls_histogram_sc_js,
	popular_urls_js) = buildPopularUrlsJs()
	(popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
	popular_help_queries_js) = buildPopularHelpQueriesJs()

	# Write the output file.
	with file_util.openForWriting(OUTPUT_JS_FILE_NAME) as f:
	f.write("// This js file is generated by the script "
	"generate_data_js.py\n\n")
	f.write("%s\n\n" % usage_by_module_sc_js)
	f.write("%s\n\n" % usage_by_module_js)
	f.write("%s\n\n" % usage_by_module_with_pr_js)
	f.write("%s\n\n" % usage_by_module_params_js)
	f.write("%s\n\n" % usage_by_module_with_pr_params_js)

	f.write("%s\n\n" % usage_by_city_sc_js)
	f.write("%s\n\n" % usage_by_city_js)

	f.write("%s\n\n" % usage_by_hour_sc_js)
	f.write("%s\n\n" % usage_by_hour_js)
	f.write("%s\n\n" % usage_by_hour_params_js)

	f.write("%s\n\n" % popular_urls_sc_js)
	f.write("%s\n\n" % popular_urls_histogram_sc_js)
	f.write("%s\n\n" % popular_urls_js)

	f.write("%s\n\n" % popular_help_queries_sc_js)
	f.write("%s\n\n" % popular_help_queries_histogram_sc_js)
	f.write("%s\n\n" % popular_help_queries_js)
	f.write("")

	print "View this file in your browser:"
	print "file://%s" % file_util.VISUALIZATION_FILE

	if __name__ == '__main__':
	main()