#!/usr/bin/env python
# Copyright 2016 The Fuchsia Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" This script reads several csv files containing data to be visualized.
    It then uses the Google Data Visualization API to generate a JavaScript
    file that contains definitions of DataTables holding the data.
    This is used by visualization.html to generate visualizations.
"""

import csv
import os
import sys

THIS_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.abspath(os.path.join(THIS_DIR, os.path.pardir))
sys.path.insert(0, ROOT_DIR)

import utils.file_util as file_util
import third_party.google_visualization.gviz_api as gviz_api

from randomizers.randomizer import readRapporConfigParamsFromFile

# The javascript variables to write. Note "_sc" refers to the data from
# the "straight-counting pipeline" "_pr_" refers to the version of
# a metric that uses differentially-private release.
USAGE_BY_MODULE_JS_VAR_NAME = 'usage_by_module_data'
USAGE_BY_MODULE_PR_JS_VAR_NAME = 'usage_by_module_pr_data'
USAGE_BY_MODULE_SC_JS_VAR_NAME = 'usage_by_module_data_sc'
USAGE_BY_MODULE_PARAMS_JS_VAR_NAME = 'usage_by_module_params'
USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME = 'usage_by_module_pr_params'

USAGE_BY_CITY_SC_JS_VAR_NAME = 'usage_by_city_data_sc'
USAGE_BY_CITY_JS_VAR_NAME = 'usage_by_city_data'

USAGE_BY_HOUR_SC_JS_VAR_NAME = 'usage_by_hour_data_sc'
USAGE_BY_HOUR_JS_VAR_NAME = 'usage_by_hour_data'
USAGE_BY_HOUR_PARAMS_JS_VAR_NAME = 'usage_by_hour_params'

POPULAR_URLS_JS_VAR_NAME = 'popular_urls_data'
POPULAR_URLS_SC_JS_VAR_NAME = 'popular_urls_data_sc'
POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME = \
    'popular_urls_histogram_data_sc'

POPULAR_HELP_QUERIES_JS_VAR_NAME = 'popular_help_queries_data'
POPULAR_HELP_QUERIES_SC_JS_VAR_NAME = 'popular_help_queries_data_sc'
POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME = \
    'popular_help_queries_histogram_data_sc'

# The output JavaScript file to be created.
OUTPUT_JS_FILE_NAME = 'data.js'

def buildDataTableJs(data=None, var_name=None, description=None,
    columns_order=None, order_by=()):
  """Builds a JavaScript string defining a DataTable containing the given data.

  Args:
    data: {dictionary}:  The data with which to populate the DataTable.
    var_name {string}: The name of the JavaScript variable to write.
    description {dictionary}: Passed to the constructor of gviz_api.DataTable()
    columns_order {tuple of string}: The names of the table columns in the
      order they should be written. Optional.
    order_by {tuple of string}: Optional. Specify something like ('foo', 'des')
      to sort the rows by the 'foo' column in descending order.

  Returns:
    {string} of the form |var_name|=<json>, where <json> is a json string
    defining a data table.
  """
  # Load data into a gviz_api.DataTable
  data_table = gviz_api.DataTable(description)
  data_table.LoadData(data)
  json = data_table.ToJSon(columns_order=columns_order,order_by=order_by)

  return "%s=%s;" % (var_name, json)

def buildUsageByModuleJsFromRapporOutput(sc_values, rappor_out_file, jsvar,
                                         params_jsvar, config_file):
  """ A helper function for buildUsageByModuleJs().

  This function will be
  invoked twice: once for the metric without differentially-private release
  and once for the metric with differentially-private release.

  Args:
    sc_values: {dictionary} A dictionary of actual values from the straigh-
    counting pipeline.

    rappor_out_file: {string} The path to the file containing RAPPOR output
    data.

    jsvar: {string} The name of the Javascript variable to be used for the
    data table.

    params_jsvar: {string} The name of the Javascript variable to be used for
    the RAPPOR parameters.

    config_file: {string} The path of the file containing RAPPOR config data.

  Returns:
    {tuple of two strings} (usage_data, rappor_parameters).
    See buildUsageByModuleJs() for details.
  """
  # We skip row zero because it is the header row. We are going to visualize
  # the data as an interval chart and so we want to compute the high and
  # low 95% confidence interval values which we may do using the "std_error"
  # column, column 2.
  with file_util.openForReading(rappor_out_file) as csvfile:
    reader = csv.reader(csvfile)
    data = [{"module" : row[0], "estimate": float(row[1]),
             "actual" : sc_values.get(row[0], 0),
             "low" : float(row[1]) - 1.96 * float(row[2]),
             "high": float(row[1]) + 1.96 * float(row[2])}
        for row in reader if reader.line_num > 1]
  usage_data_js = buildDataTableJs(
      data=data,
      var_name=jsvar,
      description={"module": ("string", "Module"),
                   "estimate": ("number", "Estimate"),
                   "actual": ("number", "Actual"),
                   # The role: 'interval' property is what tells the Google
                   # Visualization API to draw an interval chart.
                   "low": ("number", "Low", {'role': 'interval'}),
                   "high": ("number", "High", {'role': 'interval'})},
      columns_order=("module", "estimate", "actual", "low", "high"),
      order_by=("estimate", "desc"))

  # RAPPOR parameters
  rappor_params_js = "{} = {};".format(params_jsvar,
      readRapporConfigParamsFromFile(config_file).to_json())

  return (usage_data_js, rappor_params_js)


def buildUsageByModuleJs():
  """ Builds several strings defining variables used for visualization.

  Reads some CSV files containing the usage-by-module data for the straight-
  counting pipeline and the Cobalt prototype pipeline respectively and uses them
  to build three JavaScript strings defining DataTables containing the data
  and two JavaScript strings defining RAPPOR parameters.

  Returns:
    {tuple of five strings} (sc_string, cobalt_string, cobalt_with_pr_string,
    rappor_parameters, rappor_with_pr_parameters). The "_pr_" variables
    refer to the version of the RAPPOR metric that uses very weak RAPPOR
    parameters but then adds Laplace noise at the end to affect differentially
    private release. So "_pr_" for "private release". The first three
    strings are of the form <var_name>=<json>, where |json| is a json
    string defining a data table. The |var_name|s are respectively
    USAGE_BY_MODULE_SC_JS_VAR_NAME, USAGE_BY_MODULE_JS_VAR_NAME, and
    USAGE_BY_MODULE_PR_JS_VAR_NAME.
    rappor_parameters is a json string containing values for k, h, m, p, q, f.
  """
  # straight-counting:
  # Read the data from the csv file and put it into a dictionary.
  with file_util.openForReading(
      file_util.USAGE_BY_MODULE_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    # |data| will be used to generate the visualiation data for the
    # straight-counting pipeline
    data = []
    # |values| will be used below to include the actual values along with
    # the RAPPOR estimates in the visualization of the Cobalt pipeline.
    values = {}
    for row in reader:
      data.append({"module" : row[0], "count": int(row[1])})
      values[row[0]] = int(row[1])
  usage_by_module_sc_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_MODULE_SC_JS_VAR_NAME,
      description={"module": ("string", "Module"),
                   "count": ("number", "Count")},
      columns_order=("module", "count"),
      order_by=("count", "desc"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer.
  # We read it and put the data into a dictionary.
  usage_by_module_cobalt_js, rappor_params_js = \
      buildUsageByModuleJsFromRapporOutput(values,
          file_util.MODULE_NAME_ANALYZER_OUTPUT_FILE_NAME,
          USAGE_BY_MODULE_JS_VAR_NAME,
          USAGE_BY_MODULE_PARAMS_JS_VAR_NAME,
          file_util.RAPPOR_MODULE_NAME_CONFIG)

  # cobalt with differentially-private release:
  usage_by_module_cobalt_with_pr_js, rappor_with_pr_params_js = \
      buildUsageByModuleJsFromRapporOutput(values,
          file_util.MODULE_NAME_PR_ANALYZER_OUTPUT_FILE_NAME,
          USAGE_BY_MODULE_PR_JS_VAR_NAME,
          USAGE_BY_MODULE_PR_PARAMS_JS_VAR_NAME,
          file_util.RAPPOR_MODULE_NAME_PR_CONFIG)

  return (usage_by_module_sc_js, usage_by_module_cobalt_js,
          usage_by_module_cobalt_with_pr_js,
          rappor_params_js, rappor_with_pr_params_js)


def buildUsageAndRatingByCityJs():
  """Builds several strings defining variables used for visualization.

  Reads a CSV file containing the usage-by-city data and uses it
  to build a JavaScript string defining a DataTable containing the data.

  Returns:
    {string} of the form <var_name>=<json>, where |var_name| is
    USAGE_BY_CITY_SC_JS_VAR_NAME and |json| is a json string defining
    a data table.
  """
  # straight-counting:
  # Read the data from the csv file and put it into a dictionary.
  with file_util.openForReading(
      file_util.USAGE_BY_CITY_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    # |data| will be used to generate the visualiation data for the
    # straight-counting pipeline
    data = []
    # |values| will be used below to include the actual values along with
    # the RAPPOR estimates in the visualization of the Cobalt pipeline.
    values = []
    for row in reader:
      data.append({"city" : row[0], "usage": int(row[1]),
                   "rating": float(row[2])})
      if int(row[1]) > 50:
        values.append({"city" : row[0], "usage": int(row[1]),
                       "type" : "actual",
                       "radius_95" : 0,
                       "rating": float(row[2])})
  usage_and_rating_by_city_sc_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_CITY_SC_JS_VAR_NAME,
      description = {"city": ("string", "City"),
                     "rating": ("number", "Rating"),
                     "usage": ("number", "Usage")},
      columns_order=("city", "rating", "usage"),
      order_by=("usage", "desc"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer.
  # We read it and put the data into a dictionary.
  # We skip row zero because it is the header row.
  with file_util.openForReading(
      file_util.CITY_RATINGS_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    data = [{"city" : row[0], "usage": float(row[1]),
             "type" : "estimate",
             "radius_95" : 1.96 * float(row[2]),
             "rating": float(row[7])}
        for row in reader if reader.line_num > 1]
    data.extend(values)
  usage_and_rating_by_city_cobalt_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_CITY_JS_VAR_NAME,
      description={"city": ("string", "City"),
                   "usage": ("number", "Usage"),
                   "rating": ("number", "Rating"),
                   "type" : ("string", "Estimate or Actual"),
                   "radius_95": ("number", "95% conf. intlv. radius")},
      columns_order=("city", "usage", "rating", "type", "radius_95"),
      order_by=("estimate", "desc"))

  return (usage_and_rating_by_city_sc_js, usage_and_rating_by_city_cobalt_js)

def buildUsageByHourJs():
  """Builds several strings defining variables used for visualization.

  Reads two CSV files containing the usage-by-hour data for the
  straight-counting pipeline and the Cobalt prototype pipeline respectively and
  uses them to build two JavaScript strings defining DataTables containing the
  data and one string describing basic RAPPOR parameters.

  Returns:
    {tuple of two strings} (sc_string, cobalt_string, params_string). The first
    two strings are of the form <var_name>=<json>, where |json| is a json string
    defining a data table. The |var_name|s are respectively
    USAGE_BY_HOUR_SC_JS_VAR_NAME and USAGE_BY_HOUR_JS_VAR_NAME.
    params_string is a json string containing RAPPOR parameters.
  """
  # straight-counting:
  # Read the data from the csv file and put it into a dictionary.
  with file_util.openForReading(
      file_util.USAGE_BY_HOUR_CSV_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    # |data| will be used to generate the visualiation data for the
    # straight-counting pipeline
    data = []
    # |values| will be used below to include the actual values along with
    # the RAPPOR estimates in the visualization of the Cobalt pipeline.
    values = []
    hour = 0
    for row in reader:
      data.append({"hour" : hour, "usage": int(row[0])})
      values.append(int(row[0]))
      hour += 1
  usage_by_hour_sc_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_HOUR_SC_JS_VAR_NAME,
      description = {"hour": ("number", "Hour of Day"),
                     "usage": ("number", "Usage")},
      columns_order=("hour", "usage"))

  # cobalt:
  # Here the CSV file is the output of the RAPPOR analyzer.
  # We read it and put the data into a dictionary.
  # We skip row zero because it is the header row. We are going to visualize
  # the data as an interval chart and so we want to compute the high and
  # low 95% confidence interval values wich we may do using the "std_error"
  # column, column 2.
  with file_util.openForReading(
      file_util.HOUR_ANALYZER_OUTPUT_FILE_NAME) as csvfile:
    reader = csv.reader(csvfile)
    data = [{"hour" : int(row[0]), "estimate": max(float(row[1]), 0),
             "actual": values[int(row[0])],
             "low" : max(float(row[1])  - 1.96 * float(row[2]), 0),
             "high": float(row[1]) + 1.96 * float(row[2])}
        for row in reader if reader.line_num > 1]
  usage_by_hour_cobalt_js = buildDataTableJs(
      data=data,
      var_name=USAGE_BY_HOUR_JS_VAR_NAME,
      description={"hour": ("number", "Hour"),
                   "estimate": ("number", "Estimate"),
                   "actual": ("number", "Actual"),
                   # The role: 'interval' property is what tells the Google
                   # Visualization API to draw an interval chart.
                   "low": ("number", "Low", {'role': 'interval'}),
                   "high": ("number", "High", {'role': 'interval'})},
      columns_order=("hour", "estimate", "low", "high", "actual"),
      order_by=("hour", "asc"))

  # RAPPOR parameters
  rappor_params_js = "{} = {};".format(
      USAGE_BY_HOUR_PARAMS_JS_VAR_NAME,
      readRapporConfigParamsFromFile(file_util.RAPPOR_HOUR_CONFIG).to_json())

  return (usage_by_hour_sc_js, usage_by_hour_cobalt_js, rappor_params_js)

def buildItemAndCountJs(filename, varname1, varname2, item_column,
                        item_description):
  """Builds several strings defining variables used for visualization.

  Reads a CSV file containing two columns, an item column and a
  count column, and and uses the data to build two JavaScript strings defining
  DataTables containing the data. The two DataTables will be the same except
  for the order of the columns: The first DataTable will have the count
  column first and the second DataTable will have the item column first.

  Args:
    filename: {string} The full path of the CSV file to read.
    varname1: {string} The name of the first javascript variable to generate.
    varname2: {string} The name of the second javascript variable to generate.
                       If this is None then the second returned string will
                       also be None.
    item_column: {string} The name of the item column to use in the generated
                          JS.
    item_description: The description string to use in the generated JS.

  Returns:
    {tuple of two string} of the form <varname>=<json>, where <json> is a json
    string defining a data table. In the first returned string <varname> will
    be |varname1| and the "count" column will come first in the DataTable.
    In the second returned string <varname> will be |varname1| and |item_column|
    will come first in the DataTable.
  """

  with file_util.openForReading(filename) as csvfile:
    reader = csv.reader(csvfile)
    data = [{item_column : row[0], "count": int(row[1])} for row in reader]
  count_first_string = buildDataTableJs(
      data=data,
      var_name=varname1,
      description={item_column: ("string", item_description),
                   "count": ("number", "Count")},
      columns_order=("count", item_column),
      order_by=(("count", "desc"), item_column))
  item_first_string = None
  if varname2 is not None:
    item_first_string = buildDataTableJs(
        data=data,
        var_name=varname2,
        description={item_column: ("string", item_description),
                    "count": ("number", "Count")},
        columns_order=(item_column, "count"),
        order_by=(("count", "desc"), item_column))
  return (count_first_string, item_first_string)

def buildPopularUrlsJs():
  """Builds several strings defining variables used for visualization.

  Reads two CSV files containing the popular URL data for the straight-
  counting pipeline and the Cobalt prototype pipeline respectively and uses them
  to build three JavaScript strings defining DataTables containing the data.

 Returns:
    {tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
    Each of the three strings is of the form <var_name>=<json>, where |json|
    is a json string defining a data table. The |var_name|s are respectively
    POPULAR_URLS_SC_JS_VAR_NAME,
    POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
    POPULAR_URLS_JS_VAR_NAME.
  """
  # straight-counting
  popular_urls_sc_js, popular_urls_histogram_sc_js = buildItemAndCountJs(
      file_util.POPULAR_URLS_CSV_FILE_NAME, POPULAR_URLS_SC_JS_VAR_NAME,
      POPULAR_URLS_HISTOGRAM_SC_JS_VAR_NAME,
      "url", "URL")

  # Cobalt.
  popular_urls_js, _ = buildItemAndCountJs(
    file_util.URL_ANALYZER_OUTPUT_FILE_NAME, POPULAR_URLS_JS_VAR_NAME, None,
    "url", "URL")

  return (popular_urls_sc_js, popular_urls_histogram_sc_js, popular_urls_js)

def buildPopularHelpQueriesJs():
  """Builds several strings defining variables used for visualization.

  Reads two CSV files containing the popular help-qury data for the straight-
  counting pipeline and the Cobalt prototype pipeline respectively and uses them
  to build three JavaScript strings defining DataTables containing the data.

 Returns:
    {tuple of three strings} (sc_string, sc_histogram_string, cobalt_string).
    Each of the three strings is of the form <var_name>=<json>, where |json| is
    a json string defining a data table. The |var_name|s are respectively
    POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
    POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
    and POPULAR_HELP_QUERIES_JS_VAR_NAME.
  """
  # straight-counting, table visualization
  popular_help_queries_sc_js, popular_help_queries_histogram_sc_js = \
      buildItemAndCountJs(file_util.POPULAR_HELP_QUERIES_CSV_FILE_NAME,
                          POPULAR_HELP_QUERIES_SC_JS_VAR_NAME,
                          POPULAR_HELP_QUERIES_HISTOGRAM_SC_JS_VAR_NAME,
                          "help_query ", "Help Query")

  # Cobalt.
  popular_help_queries_js, _ = buildItemAndCountJs(
    file_util.HELP_QUERY_ANALYZER_OUTPUT_FILE_NAME,
    POPULAR_HELP_QUERIES_JS_VAR_NAME, None,
    "help_query", "Help Query")

  return (popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
      popular_help_queries_js)

def main():
  print "Generating visualization..."

  # Read the input file and build the JavaScript strings to write.
  usage_by_module_sc_js, usage_by_module_js, usage_by_module_with_pr_js, \
      usage_by_module_params_js, usage_by_module_with_pr_params_js = \
          buildUsageByModuleJs()
  usage_by_city_js, usage_by_city_sc_js = buildUsageAndRatingByCityJs()
  usage_by_hour_sc_js, usage_by_hour_js, usage_by_hour_params_js = \
      buildUsageByHourJs()
  (popular_urls_sc_js, popular_urls_histogram_sc_js,
   popular_urls_js) = buildPopularUrlsJs()
  (popular_help_queries_sc_js, popular_help_queries_histogram_sc_js,
      popular_help_queries_js) = buildPopularHelpQueriesJs()

  # Write the output file.
  with file_util.openForWriting(OUTPUT_JS_FILE_NAME) as f:
    f.write("// This js file is generated by the script "
            "generate_data_js.py\n\n")
    f.write("%s\n\n" % usage_by_module_sc_js)
    f.write("%s\n\n" % usage_by_module_js)
    f.write("%s\n\n" % usage_by_module_with_pr_js)
    f.write("%s\n\n" % usage_by_module_params_js)
    f.write("%s\n\n" % usage_by_module_with_pr_params_js)

    f.write("%s\n\n" % usage_by_city_sc_js)
    f.write("%s\n\n" % usage_by_city_js)

    f.write("%s\n\n" % usage_by_hour_sc_js)
    f.write("%s\n\n" % usage_by_hour_js)
    f.write("%s\n\n" % usage_by_hour_params_js)

    f.write("%s\n\n" % popular_urls_sc_js)
    f.write("%s\n\n" % popular_urls_histogram_sc_js)
    f.write("%s\n\n" % popular_urls_js)

    f.write("%s\n\n" % popular_help_queries_sc_js)
    f.write("%s\n\n" % popular_help_queries_histogram_sc_js)
    f.write("%s\n\n" % popular_help_queries_js)
    f.write("")

  print "View this file in your browser:"
  print "file://%s" % file_util.VISUALIZATION_FILE

if __name__ == '__main__':
  main()
