# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This tool is intended to be invoked by the performance comparison
# trybots. It compares the performance of two versions of Fuchsia. It can
# also compare binary sizes.
import argparse
import json
import math
import os
import sys
# For comparing results from a performance test, we calculate
# confidence intervals for the mean running times of the test. If the
# confidence intervals are non-overlapping, we conclude that the
# performance has improved or regressed for this test.
#
# Data is gathered from a 3-level sampling process:
#
# 1) Boot Fuchsia one or more times. (Currently we only boot once.)
# 2) For each boot, launch the perf test process one or more times.
# 3) For each process launch, instantiate the performance test and
#    run the body of the test some number of times.
#
# This is intended to account for variation across boots and across process
# launches.
#
# Currently we use z-test confidence intervals. In the future we should
# either use t-test confidence intervals or, preferably, bootstrap
# confidence intervals (an illustrative sketch of the latter appears below).
#
# * We apply the z-test confidence interval to the mean running times from
#   each process instance (from level #3). This means we treat the sample
#   size as being the number of process launches. This is rather
#   ad-hoc: it assumes that there is a lot of between-process variation
#   and that we need to widen the confidence intervals to reflect that.
#   Using bootstrapping with resampling across the 3 levels above should
#   account for that variation without making ad-hoc assumptions.
#
# * This assumes that the values we apply the z-test to are normally
#   distributed, or approximately normally distributed. Using
#   bootstrapping instead would avoid this assumption.
#
# * t-test confidence intervals would be better than z-test confidence
#   intervals, especially for smaller sample sizes. They are easier to
#   compute if the SciPy library is available. However, this code runs
#   using infra's copy of Python, which doesn't make SciPy available at
#   the moment.
# ALPHA is a parameter for calculating confidence intervals. It is
# the probability that the true value for the statistic we're
# estimating (here, the mean running time) lies outside the confidence
# interval.
#
# TODO(IN-646): Figure out how to use SciPy with Python on the bots.
# Then we can uncomment ALPHA here and avoid using the pre-calculated
# Z_TEST_OFFSET below.
#
# ALPHA = 0.01
# This is the value of scipy.stats.norm.ppf(ALPHA / 2).
Z_TEST_OFFSET = -2.5758293035489008
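# As a rough illustration of the bootstrap approach mentioned above, a
# percentile bootstrap confidence interval for the mean can be computed
# with only the standard library, along the lines of the sketch below.
# This sketch is not used by this tool and resamples a single flat list of
# values; a real implementation would resample across all 3 sampling
# levels described above.
def _BootstrapConfidenceIntervalSketch(values, alpha=0.01, iterations=10000):
    import random
    boot_means = []
    for _ in xrange(iterations):
        # Resample with replacement and record the mean of each resample.
        resample = [random.choice(values) for _ in values]
        boot_means.append(float(sum(resample)) / len(resample))
    boot_means.sort()
    # Take the alpha/2 and 1-alpha/2 percentiles of the resampled means.
    lower_index = int(round((alpha / 2) * iterations))
    upper_index = int(round((1 - alpha / 2) * iterations)) - 1
    return (boot_means[lower_index], boot_means[upper_index])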
def Mean(values):
    return float(sum(values)) / len(values)
# Returns the mean and standard deviation of a sample. This does the
# same as scipy.stats.norm.fit(). This does not apply Bessel's
# correction to the calculation of the standard deviation.
def MeanAndStddev(values):
    mean_val = Mean(values)
    sum_of_squares = 0.0
    for val in values:
        diff = val - mean_val
        sum_of_squares += diff * diff
    stddev_val = math.sqrt(sum_of_squares / len(values))
    return mean_val, stddev_val
class Stats(object):

    def __init__(self, values):
        sample_size = len(values)
        mean, stddev = MeanAndStddev(values)
        offset = -Z_TEST_OFFSET * stddev / math.sqrt(sample_size)
        self._mean = mean
        self._offset = offset
        # Confidence interval for the mean.
        self.interval = (mean - offset, mean + offset)

    def FormatConfidenceInterval(self):
        return '%d +/- %d' % (self._mean, self._offset)
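# Example usage of Stats (illustrative numbers, not real measurements):
#
#   stats = Stats([1500.0, 1496.0, 1508.0, 1502.0])
#   stats.interval                    # ~(1495.9, 1507.1)
#   stats.FormatConfidenceInterval()  # '1501 +/- 5'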
def ReadJsonFile(filename):
    with open(filename, 'r') as fh:
        return json.load(fh)
def ResultsFromDir(dir_path):
    results_map = {}
    # Sorting the result of os.listdir() is not essential, but it makes any
    # later behaviour more deterministic.
    for filename in sorted(os.listdir(dir_path)):
        if filename == 'summary.json':
            continue
        if filename.endswith('.json'):
            file_path = os.path.join(dir_path, filename)
            for data in ReadJsonFile(file_path):
                new_value = Mean(data['values'])
                results_map.setdefault(data['label'], []).append(new_value)
    return {name: Stats(values) for name, values in results_map.iteritems()}
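# Each per-process results file read by ResultsFromDir() is expected to be
# a JSON list of entries containing at least the two fields used above
# (example values here are made up):
#
#   [{"label": "ExampleTest/Case1", "values": [1500.0, 1496.0, 1508.0]}]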
def FormatTable(rows, out_fh):
    assert len(rows) > 0
    column_count = len(rows[0])
    for row in rows:
        assert len(row) == column_count
    widths = [2 + max(len(row[col_number]) for row in rows)
              for col_number in xrange(column_count)]
    # Underline the header row. This assumes that the first row is a
    # header row.
    rows.insert(1, ['-' * (width - 2) for width in widths])
    for row in rows:
        for col_number, value in enumerate(row):
            out_fh.write(value)
            if col_number < column_count - 1:
                out_fh.write(' ' * (widths[col_number] - len(value)))
        out_fh.write('\n')
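# For example, FormatTable([['Name', 'Value'], ['foo', '10']], sys.stdout)
# would print:
#
#   Name  Value
#   ----  -----
#   foo   10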
def ComparePerf(args, out_fh):
    results_maps = [ResultsFromDir(args.results_dir_before),
                    ResultsFromDir(args.results_dir_after)]
    # Set of all test case names, including those added or removed.
    labels = set(results_maps[0].iterkeys())
    labels.update(results_maps[1].iterkeys())
    rows = [['Test case', 'Improve/regress?', 'Factor change',
             'Mean before', 'Mean after']]
    for label in sorted(labels):
        if label not in results_maps[0]:
            result = 'added'
            factor_range = '-'
            before_range = '-'
            after_range = results_maps[1][label].FormatConfidenceInterval()
        elif label not in results_maps[1]:
            result = 'removed'
            factor_range = '-'
            before_range = results_maps[0][label].FormatConfidenceInterval()
            after_range = '-'
        else:
            stats = [results_map[label] for results_map in results_maps]
            interval_before = stats[0].interval
            interval_after = stats[1].interval
            factor_min = interval_after[0] / interval_before[1]
            factor_max = interval_after[1] / interval_before[0]
            if interval_after[0] >= interval_before[1]:
                result = 'slower'
            elif interval_after[1] <= interval_before[0]:
                result = 'faster'
            else:
                result = 'no_sig_diff'
            before_range = stats[0].FormatConfidenceInterval()
            after_range = stats[1].FormatConfidenceInterval()
            factor_range = '%.3f-%.3f' % (factor_min, factor_max)
        rows.append([
            label,
            result,
            factor_range,
            before_range,
            after_range])
    FormatTable(rows, out_fh)
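# Worked example for ComparePerf's comparison rule (hypothetical numbers):
# if the 'before' confidence interval is (1496, 1507) and the 'after'
# interval is (1510, 1521), the intervals do not overlap and the 'after'
# times are higher, so the test case is reported as 'slower', with a
# factor change of 1510/1507 to 1521/1496 (about 1.002-1.017).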
def TotalSize(snapshot_file):
    with open(snapshot_file) as fh:
        data = json.load(fh)
    return sum(info['size'] for info in data['blobs'].itervalues())
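# The snapshot files read by TotalSize() are expected to contain a JSON
# object with a 'blobs' mapping whose entries each carry a 'size' field in
# bytes. Only the fields used above are shown in this made-up example:
#
#   {"blobs": {"<blob-id>": {"size": 8192}, "<another-blob-id>": {"size": 4096}}}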
def CompareSizes(args):
    filenames = [args.snapshot_before, args.snapshot_after]
    sizes = [TotalSize(filename) for filename in filenames]
    print 'Size before: %d bytes' % sizes[0]
    print 'Size after: %d bytes' % sizes[1]
    print 'Difference: %d bytes' % (sizes[1] - sizes[0])
    if sizes[0] != 0:
        print 'Factor of: %f' % (float(sizes[1]) / sizes[0])
def Main(argv, out_fh):
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    parser_compare_perf = subparsers.add_parser(
        'compare_perf',
        help='Compare two sets of perf test results')
    parser_compare_perf.add_argument('results_dir_before')
    parser_compare_perf.add_argument('results_dir_after')
    parser_compare_perf.set_defaults(
        func=lambda args: ComparePerf(args, out_fh))

    parser_compare_sizes = subparsers.add_parser(
        'compare_sizes',
        help='Compare file sizes specified by two system.snapshot files')
    parser_compare_sizes.add_argument('snapshot_before')
    parser_compare_sizes.add_argument('snapshot_after')
    parser_compare_sizes.set_defaults(func=CompareSizes)

    args = parser.parse_args(argv)
    args.func(args)
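# Example invocations (the script filename and paths here are illustrative):
#
#   python perfcompare.py compare_perf results_dir_before results_dir_after
#   python perfcompare.py compare_sizes before.snapshot after.snapshot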
if __name__ == '__main__':
    Main(sys.argv[1:], sys.stdout)