| # Copyright 2019 The Fuchsia Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
# This tool is intended to be invoked by the performance comparison
# trybots. It compares the performance of two versions of Fuchsia. It
# can also compare binary sizes.
| |
| import argparse |
| import json |
| import math |
| import os |
| import sys |
| |
| |
| # For comparing results from a performance test, we calculate |
| # confidence intervals for the mean running times of the test. If the |
| # confidence intervals are non-overlapping, we conclude that the |
| # performance has improved or regressed for this test. |
| # |
| # Data is gathered from a 3-level sampling process: |
| # |
| # 1) Boot Fuchsia one or more times. (Currently we only boot once.) |
| # 2) For each boot, launch the perf test process one or more times. |
| # 3) For each process launch, instantiate the performance test and |
| # run the body of the test some number of times. |
| # |
| # This is intended to account for variation across boots and across process |
| # launches. |
| # |
| # Currently we use z-test confidence intervals. In future we should either |
| # use t-test confidence intervals, or (preferably) use bootstrap confidence |
| # intervals. |
| # |
# * We apply the z-test confidence interval to the mean running times from
#   each process instance (from level #3). This means we treat the sample
#   size as being the number of process launches. This is rather
#   ad-hoc: it assumes that there is a lot of between-process variation
#   and that we need to widen the confidence intervals to reflect that.
#   Using bootstrapping with resampling across the 3 levels above should
#   account for that variation without making ad-hoc assumptions.
| # |
| # * This assumes that the values we apply the z-test to are normally |
| # distributed, or approximately normally distributed. Using |
| # bootstrapping instead would avoid this assumption. |
| # |
# * t-test confidence intervals would be better than z-test confidence
#   intervals, especially for smaller sample sizes. t-test intervals
#   would be easy to calculate if the SciPy library were available.
#   However, this code runs using infra's copy of Python, which
#   doesn't make SciPy available at the moment.
| |
| |
| # ALPHA is a parameter for calculating confidence intervals. It is |
| # the probability that the true value for the statistic we're |
| # estimating (here, the mean running time) lies outside the confidence |
| # interval. |
| # |
| # TODO(IN-646): Figure out how to use SciPy with Python on the bots. |
| # Then we can uncomment ALPHA here and avoid using the pre-calculated |
| # Z_TEST_OFFSET below. |
| # |
| # ALPHA = 0.01 |
| |
| # This is the value of scipy.stats.norm.ppf(ALPHA / 2). |
| Z_TEST_OFFSET = -2.5758293035489008 |
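# If SciPy were available, this constant could be recomputed with:
#
#   from scipy import stats
#   print(stats.norm.ppf(0.01 / 2))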
| |
| |
| def Mean(values): |
| if len(values) == 0: |
| raise AssertionError('Mean is not defined for an empty sample') |
| return float(sum(values)) / len(values) |
| |
| |
| # Returns the mean and standard deviation of a sample. This applies |
| # Bessel's correction to the calculation of the standard deviation. |
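# For example, for the sample [1.0, 2.0, 3.0] the mean is 2.0 and the
# sum of squared differences from the mean is 2.0, so with Bessel's
# correction (dividing by n - 1 = 2 rather than n = 3) the standard
# deviation is sqrt(2.0 / 2) = 1.0.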
| def MeanAndStddev(values): |
| if len(values) <= 1: |
| raise AssertionError( |
| "Sample size of %d is too small to calculate standard deviation " |
| "with Bessel's correction" % len(values)) |
| mean_val = Mean(values) |
| sum_of_squares = 0.0 |
| for val in values: |
| diff = val - mean_val |
| sum_of_squares += diff * diff |
| stddev_val = math.sqrt(sum_of_squares / (len(values) - 1)) |
| return mean_val, stddev_val |
| |
| |
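# Stats summarizes a sample by its mean and a z-test confidence
# interval for the mean. As a worked example: for the sample
# [9.0, 10.0, 11.0], the mean is 10.0 and the standard deviation
# (with Bessel's correction) is 1.0, so the interval half-width is
# -Z_TEST_OFFSET * 1.0 / sqrt(3) ~= 1.487 and the confidence interval
# is roughly (8.51, 11.49).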
| class Stats(object): |
| |
| def __init__(self, values): |
| sample_size = len(values) |
| mean, stddev = MeanAndStddev(values) |
| offset = -Z_TEST_OFFSET * stddev / math.sqrt(sample_size) |
| self._mean = mean |
| self._offset = offset |
| # Confidence interval for the mean. |
| self.interval = (mean - offset, mean + offset) |
| |
| def FormatConfidenceInterval(self): |
| return '%d +/- %d' % (self._mean, self._offset) |
| |
| |
| def ReadJsonFile(filename): |
| with open(filename, 'r') as fh: |
| return json.load(fh) |
| |
| |
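# Reads the results files in dir_path and returns a mapping from test
# case label to Stats. Each *.json file other than summary.json is
# assumed to correspond to one launch of the perf test process, and to
# contain JSON of the form (labels and values here are illustrative):
#
#   [{"label": "ExampleTest", "values": [10.5, 11.2]}, ...]
#
# The values for each label are averaged within each file, and Stats
# is computed across those per-launch means.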
| def ResultsFromDir(dir_path): |
| results_map = {} |
| # Sorting the result of os.listdir() is not essential, but it makes any |
| # later behaviour more deterministic. |
| for filename in sorted(os.listdir(dir_path)): |
| if filename == 'summary.json': |
| continue |
| if filename.endswith('.json'): |
| file_path = os.path.join(dir_path, filename) |
| for data in ReadJsonFile(file_path): |
| new_value = Mean(data['values']) |
| results_map.setdefault(data['label'], []).append(new_value) |
    return {name: Stats(values) for name, values in results_map.items()}
| |
| |
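# Writes rows to out_fh as an aligned text table, treating the first
# row as a header row and underlining it. For example,
# FormatTable([['Name', 'Value'], ['foo', '12']], out_fh) writes:
#
#   Name  Value
#   ----  -----
#   foo   12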
| def FormatTable(rows, out_fh): |
| assert len(rows) > 0 |
| column_count = len(rows[0]) |
| for row in rows: |
| assert len(row) == column_count |
    widths = [2 + max(len(row[col_number]) for row in rows)
              for col_number in range(column_count)]
| # Underline the header row. This assumes that the first row is a |
| # header row. |
| rows.insert(1, ['-' * (width - 2) for width in widths]) |
| for row in rows: |
| for col_number, value in enumerate(row): |
| out_fh.write(value) |
| if col_number < column_count - 1: |
| out_fh.write(' ' * (widths[col_number] - len(value))) |
| out_fh.write('\n') |
| |
| |
| def ComparePerf(args, out_fh): |
| results_maps = [ResultsFromDir(args.results_dir_before), |
| ResultsFromDir(args.results_dir_after)] |
| |
| # Set of all test case names, including those added or removed. |
    labels = set(results_maps[0].keys())
    labels.update(results_maps[1].keys())
| |
| rows = [['Test case', 'Improve/regress?', 'Factor change', |
| 'Mean before', 'Mean after']] |
| for label in sorted(labels): |
| if label not in results_maps[0]: |
| result = 'added' |
| factor_range = '-' |
| before_range = '-' |
| after_range = results_maps[1][label].FormatConfidenceInterval() |
| elif label not in results_maps[1]: |
| result = 'removed' |
| factor_range = '-' |
| before_range = results_maps[0][label].FormatConfidenceInterval() |
| after_range = '-' |
| else: |
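            # Worked example: if the "before" interval is (100, 110)
            # and the "after" interval is (120, 130), the intervals do
            # not overlap and "after" is higher, so the result is
            # 'slower', with a factor range of 120/110 ~= 1.091 to
            # 130/100 = 1.300.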
| stats = [results_map[label] for results_map in results_maps] |
| interval_before = stats[0].interval |
| interval_after = stats[1].interval |
| factor_min = interval_after[0] / interval_before[1] |
| factor_max = interval_after[1] / interval_before[0] |
| if interval_after[0] >= interval_before[1]: |
| result = 'slower' |
| elif interval_after[1] <= interval_before[0]: |
| result = 'faster' |
| else: |
| result = 'no_sig_diff' |
| before_range = stats[0].FormatConfidenceInterval() |
| after_range = stats[1].FormatConfidenceInterval() |
| factor_range = '%.3f-%.3f' % (factor_min, factor_max) |
| rows.append([ |
| label, |
| result, |
| factor_range, |
| before_range, |
| after_range]) |
| FormatTable(rows, out_fh) |
| |
| |
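# Returns the total size in bytes of the blobs listed in a
# system.snapshot file. The file is assumed to contain JSON of the
# form (illustrative, with unrelated fields omitted):
#
#   {"blobs": {"<merkle root hash>": {"size": 1024}, ...}}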
| def TotalSize(snapshot_file): |
| with open(snapshot_file) as fh: |
| data = json.load(fh) |
    return sum(info['size'] for info in data['blobs'].values())
| |
| |
| def CompareSizes(args): |
| filenames = [args.snapshot_before, args.snapshot_after] |
| sizes = [TotalSize(filename) for filename in filenames] |
    print('Size before: %d bytes' % sizes[0])
    print('Size after: %d bytes' % sizes[1])
    print('Difference: %d bytes' % (sizes[1] - sizes[0]))
    if sizes[0] != 0:
        print('Factor of: %f' % (float(sizes[1]) / sizes[0]))
| |
| |
| def Main(argv, out_fh): |
| parser = argparse.ArgumentParser() |
| subparsers = parser.add_subparsers() |
| |
| parser_compare_perf = subparsers.add_parser( |
| 'compare_perf', |
| help='Compare two sets of perf test results') |
| parser_compare_perf.add_argument('results_dir_before') |
| parser_compare_perf.add_argument('results_dir_after') |
| parser_compare_perf.set_defaults( |
| func=lambda args: ComparePerf(args, out_fh)) |
| |
| parser_compare_sizes = subparsers.add_parser( |
| 'compare_sizes', |
| help='Compare file sizes specified by two system.snapshot files') |
| parser_compare_sizes.add_argument('snapshot_before') |
| parser_compare_sizes.add_argument('snapshot_after') |
| parser_compare_sizes.set_defaults(func=CompareSizes) |
| |
| args = parser.parse_args(argv) |
| args.func(args) |
| |
| |
| if __name__ == '__main__': |
| Main(sys.argv[1:], sys.stdout) |