experimental/cache_analysis.py - third_party/safeside - Git at Google

 """Plot measurement data.

 Run instructions:

   python3 -m venv env
   source env/activate
   pip install -r experimental/requirements.lock
   python experimental/cache_analysis.py cache_size_results.csv
 """


 from matplotlib.pyplot import figure
 import matplotlib.pyplot as plt
 import io
 import numpy as np
 import pandas as pd
 from statistics import mean, median
 import collections
 import csv
 import seaborn as sns
 import os
 import sys


 def draw_single_run_plot(data, name):
   """ This functions draws and saves two types of plots for cases where each x value has only one y value.

   Here it is used to draw plots for stats (ie mean and median of all runs).
   It can be used for drawing each run individually.
   Args:
     data:
       A dictionary from x values to their corresponding y value
     name:
       The file name that is used for saving the plot
   """
   figure(num=None, figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')

   x = list(data.keys())
   y = list(data.values())

   plt.subplot(211)
   plt.plot(x,y)
   plt.subplot(212)
   plt.scatter(x,y)
   plt.savefig(name)
   plt.cla()


 def draw_boxplot(data, sizes, name):

   """ This functions draws and saves boxplot for all data points.

   Args:
     data:
       A list of lists, where each list contains the timing values for each size under analysis
       lists are in the asecending order of the sizes values (e.g. timing values for the smallest size corresponds to data[0])
     sizes:
       The list of sizes under analysis that is used as the values of x axis
     name:
       The file name that is used for saving the plot
   """
   fig = plt.figure(figsize=(30, 10))
   ax = fig.add_subplot(111)
   boxes = []
   for l in data:
     for i, time in enumerate(l):
       boxes.append([sizes[i%len(l)], time])

   dd = pd.DataFrame(boxes, columns = ["size (bytes)", "time (cycles)"])
   sns.set_theme(style="whitegrid")

   ax = sns.boxplot(data = dd, showfliers=False, x = 'size (bytes)', y = 'time (cycles)')
   ax.grid(False)
   fig = ax.get_figure()
   fig.savefig(name)


 def draw_all_datapoints(data, sizes, name):
   """ This functions draw and saves all data points as individual plots in a single figure in log scale

   Args:
     data:
       A list of lists, where each list contains the timing values for each size under analysis
         lists are in the asecending order of the sizes values (e.g. timing values for the smallest size corresponds to data[0])
     sizes:
       The list of sizes under analysis that is used as the values of x axis
     name:
       The file name that is used for saving the plot
   """
   fig = plt.figure(figsize=(30, 10))
   ax = fig.add_subplot(111)
   for l in data:
     ax.scatter(sizes,l)
   ax.set_yscale('log')
   ax.set_xscale('log', base = 2)
   plt.xlabel("size (bytes)")
   plt.ylabel("time (cycles)")
   plt.title(" all data points")
   plt.grid(False)
   plt.savefig(name)
   plt.cla()


 def main():
   if len(sys.argv) != 2:
     sys.exit(f"Expected 1 argument, got {len(sys.argv) - 1}")

   # read csv file to extract data
   df = pd.read_csv(sys.argv[1], header=None)


   count_row = df.shape[0]  # gives number of row count
   sizes = df[0].unique()  # sizes used for the analysis
   iterations = int(count_row/len(sizes))  # total experiments' repetitions
   print(f"In total the experiment was repeated {iterations} times")
   print(f"In total {len(sizes)} different values were tested")

   # extract all runs seperately
   all = []
   pre = 0
   for i in range(1, count_row):
       if (i+1) % len(sizes) == 0:
           l = df.iloc[pre: i+1]
           pre = i+1
           all.append(l)

   # extract each size results

   all_data = []
   for i in range(iterations):
       l = list(all[i][1])
       all_data.append(list(l))

   print("Extracted the results for each size under analysis")

   #computes the median and mean of iterations
   means = {}
   medians = {}

   grouped = df.groupby(df[0])

   for cache_size in sizes:
     times = grouped.get_group(cache_size)
     means[cache_size] = mean(times[1])
     medians[cache_size] = median(times[1])

   draw_single_run_plot(medians, "medians.pdf")
   draw_single_run_plot(means, "means.pdf")


   draw_boxplot(all_data, sizes, "boxplot.pdf")

   draw_all_datapoints(all_data, sizes, "all_runs.pdf")

 if __name__ == '__main__':
   main()
	"""Plot measurement data.

	Run instructions:

	python3 -m venv env
	source env/activate
	pip install -r experimental/requirements.lock
	python experimental/cache_analysis.py cache_size_results.csv
	"""


	from matplotlib.pyplot import figure
	import matplotlib.pyplot as plt
	import io
	import numpy as np
	import pandas as pd
	from statistics import mean, median
	import collections
	import csv
	import seaborn as sns
	import os
	import sys


	def draw_single_run_plot(data, name):
	""" This functions draws and saves two types of plots for cases where each x value has only one y value.

	Here it is used to draw plots for stats (ie mean and median of all runs).
	It can be used for drawing each run individually.
	Args:
	data:
	A dictionary from x values to their corresponding y value
	name:
	The file name that is used for saving the plot
	"""
	figure(num=None, figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')

	x = list(data.keys())
	y = list(data.values())

	plt.subplot(211)
	plt.plot(x,y)
	plt.subplot(212)
	plt.scatter(x,y)
	plt.savefig(name)
	plt.cla()


	def draw_boxplot(data, sizes, name):

	""" This functions draws and saves boxplot for all data points.

	Args:
	data:
	A list of lists, where each list contains the timing values for each size under analysis
	lists are in the asecending order of the sizes values (e.g. timing values for the smallest size corresponds to data[0])
	sizes:
	The list of sizes under analysis that is used as the values of x axis
	name:
	The file name that is used for saving the plot
	"""
	fig = plt.figure(figsize=(30, 10))
	ax = fig.add_subplot(111)
	boxes = []
	for l in data:
	for i, time in enumerate(l):
	boxes.append([sizes[i%len(l)], time])

	dd = pd.DataFrame(boxes, columns = ["size (bytes)", "time (cycles)"])
	sns.set_theme(style="whitegrid")

	ax = sns.boxplot(data = dd, showfliers=False, x = 'size (bytes)', y = 'time (cycles)')
	ax.grid(False)
	fig = ax.get_figure()
	fig.savefig(name)


	def draw_all_datapoints(data, sizes, name):
	""" This functions draw and saves all data points as individual plots in a single figure in log scale

	Args:
	data:
	A list of lists, where each list contains the timing values for each size under analysis
	lists are in the asecending order of the sizes values (e.g. timing values for the smallest size corresponds to data[0])
	sizes:
	The list of sizes under analysis that is used as the values of x axis
	name:
	The file name that is used for saving the plot
	"""
	fig = plt.figure(figsize=(30, 10))
	ax = fig.add_subplot(111)
	for l in data:
	ax.scatter(sizes,l)
	ax.set_yscale('log')
	ax.set_xscale('log', base = 2)
	plt.xlabel("size (bytes)")
	plt.ylabel("time (cycles)")
	plt.title(" all data points")
	plt.grid(False)
	plt.savefig(name)
	plt.cla()


	def main():
	if len(sys.argv) != 2:
	sys.exit(f"Expected 1 argument, got {len(sys.argv) - 1}")

	# read csv file to extract data
	df = pd.read_csv(sys.argv[1], header=None)


	count_row = df.shape[0] # gives number of row count
	sizes = df[0].unique() # sizes used for the analysis
	iterations = int(count_row/len(sizes)) # total experiments' repetitions
	print(f"In total the experiment was repeated {iterations} times")
	print(f"In total {len(sizes)} different values were tested")

	# extract all runs seperately
	all = []
	pre = 0
	for i in range(1, count_row):
	if (i+1) % len(sizes) == 0:
	l = df.iloc[pre: i+1]
	pre = i+1
	all.append(l)

	# extract each size results

	all_data = []
	for i in range(iterations):
	l = list(all[i][1])
	all_data.append(list(l))

	print("Extracted the results for each size under analysis")

	#computes the median and mean of iterations
	means = {}
	medians = {}

	grouped = df.groupby(df[0])

	for cache_size in sizes:
	times = grouped.get_group(cache_size)
	means[cache_size] = mean(times[1])
	medians[cache_size] = median(times[1])

	draw_single_run_plot(medians, "medians.pdf")
	draw_single_run_plot(means, "means.pdf")


	draw_boxplot(all_data, sizes, "boxplot.pdf")

	draw_all_datapoints(all_data, sizes, "all_runs.pdf")

	if __name__ == '__main__':
	main()