# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This tool is intended to be invoked by the performance comparison
# trybots. It compares the performance of two versions of Fuchsia. It
# can also compare binary sizes.
import argparse
import json
import math
import os
import sys
import tarfile
import scipy.stats
# For comparing results from a performance test, we calculate
# confidence intervals for the mean running times of the test. If the
# confidence intervals are non-overlapping, we conclude that the
# performance has improved or regressed for this test.
#
# Data is gathered from a 3-level sampling process:
#
# 1) Boot Fuchsia one or more times. (Currently we only boot once.)
# 2) For each boot, launch the perf test process one or more times.
# 3) For each process launch, instantiate the performance test and
# run the body of the test some number of times.
#
# This is intended to account for variation across boots and across process
# launches.
#
# Currently we use t-test confidence intervals. In the future we could
# instead use bootstrap confidence intervals.
#
# * We apply the t-test confidence interval to the mean running times from
# each process instance (from level #3). This means we treat the sample
# size as being the number of process launches. This is rather
# ad-hoc: it assumes that there is a lot of between-process variation
# and that we need to widen the confidence intervals to reflect that.
# Using bootstrapping with resampling across the 3 levels above should
# account for that variation without making ad-hoc assumptions.
#
# * This assumes that the values we apply the t-test to are normally
# distributed, or approximately normally distributed. Using
# bootstrapping instead would avoid this assumption.
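#
# Concretely, for per-process mean running times x_1, ..., x_n, the
# interval computed below is
#
#   mean(x) +/- t_{1 - ALPHA/2, n-1} * stddev(x) / sqrt(n)
#
# where stddev() applies Bessel's correction and t_{p, df} is the
# t-distribution quantile function (scipy.stats.t.ppf).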
# ALPHA is a parameter for calculating confidence intervals. It is
# the probability that the true value for the statistic we're
# estimating (here, the mean running time) lies outside the confidence
# interval.
ALPHA = 0.01
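# As a rough illustration: ALPHA = 0.01 corresponds to 99% confidence
# intervals, and for a sample of 10 process launches (9 degrees of
# freedom) the t multiplier -scipy.stats.t.ppf(ALPHA / 2, 9) is about
# 3.25, versus about 2.58 for the normal distribution.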
def Mean(values):
if len(values) == 0:
raise AssertionError('Mean is not defined for an empty sample')
return float(sum(values)) / len(values)
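# For example, Mean([1, 2, 3]) returns 2.0; note that the float()
# conversion above avoids Python 2 integer division.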
# Returns the mean and standard deviation of a sample. This applies
# Bessel's correction to the calculation of the standard deviation.
def MeanAndStddev(values):
if len(values) <= 1:
raise AssertionError(
"Sample size of %d is too small to calculate standard deviation "
"with Bessel's correction" % len(values))
mean_val = Mean(values)
sum_of_squares = 0.0
for val in values:
diff = val - mean_val
sum_of_squares += diff * diff
stddev_val = math.sqrt(sum_of_squares / (len(values) - 1))
return mean_val, stddev_val
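# For example, MeanAndStddev([1, 2, 3, 4]) returns (2.5, ~1.291): the
# sum of squared deviations is 5.0, and Bessel's correction divides by
# n - 1 = 3 rather than n, giving sqrt(5.0 / 3).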
class Stats(object):
def __init__(self, values):
sample_size = len(values)
mean, stddev = MeanAndStddev(values)
offset = (-scipy.stats.t.ppf(ALPHA / 2, sample_size - 1)
* stddev / math.sqrt(sample_size))
self._mean = mean
self._offset = offset
# Confidence interval for the mean.
self.interval = (mean - offset, mean + offset)
def FormatConfidenceInterval(self):
return '%d +/- %d' % (self._mean, self._offset)
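# Example usage of Stats, with made-up running times (units are
# whatever the test reports, e.g. nanoseconds):
#
#   stats = Stats([90.0, 100.0, 110.0])
#   stats.interval                     # => (42.7, 157.3), approximately
#   stats.FormatConfidenceInterval()   # => '100 +/- 57'
#
# Here mean = 100 and stddev = 10, and with only 2 degrees of freedom
# the t multiplier is large (about 9.92), giving an offset of about
# 9.92 * 10 / sqrt(3) ~= 57.3.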
def ReadJsonFile(filename):
with open(filename, 'r') as fh:
return json.load(fh)
def IsResultsFilename(name):
return name.endswith('.json') and name != 'summary.json'
# Read the raw perf test results from a directory or a tar file. Returns a
# sequence (iterator) of JSON trees.
#
# Accepting tar files here is a convenience for local testing of the
# statistics. The Swarming system used for the bots produces "out.tar"
# files as results.
def RawResultsFromDir(filename):
# Note that sorting the filename listing (from os.listdir() or from
# tarfile) is not essential, but it helps to make any later processing
# more deterministic.
    if os.path.isfile(filename) and filename.endswith('.tar'):
        # Read from tar file. Use a "with" block so that the file gets
        # closed when we are done reading from it.
        with tarfile.open(filename) as tar:
            for member in sorted(tar.getmembers(),
                                 key=lambda member: member.name):
                if IsResultsFilename(member.name):
                    yield json.load(tar.extractfile(member))
else:
# Read from directory.
for name in sorted(os.listdir(filename)):
if IsResultsFilename(name):
yield ReadJsonFile(os.path.join(filename, name))
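# Read the perf test results from a directory or tar file and calculate
# statistics. Returns a mapping from test case label to a Stats object
# built from the per-process mean running times.
#
# Each results file is expected to be a JSON list of test case entries
# of roughly this form (field names as used below; the numbers are made
# up):
#
#   [{"label": "ExampleTest", "values": [100.5, 103.2, 99.8]}]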
def ResultsFromDir(filename):
results_map = {}
for process_run_results in RawResultsFromDir(filename):
for test_case in process_run_results:
new_value = Mean(test_case['values'])
results_map.setdefault(test_case['label'], []).append(new_value)
return {name: Stats(values) for name, values in results_map.iteritems()}
def FormatTable(rows, out_fh):
assert len(rows) > 0
column_count = len(rows[0])
for row in rows:
assert len(row) == column_count
widths = [2 + max(len(row[col_number]) for row in rows)
for col_number in xrange(column_count)]
    # Underline the header row, inserting the underline row into the
    # caller's list of rows. This assumes that the first row is a
    # header row.
rows.insert(1, ['-' * (width - 2) for width in widths])
for row in rows:
for col_number, value in enumerate(row):
out_fh.write(value)
if col_number < column_count - 1:
out_fh.write(' ' * (widths[col_number] - len(value)))
out_fh.write('\n')
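# For example, FormatTable([['Name', 'Value'], ['foo', '1']], out_fh)
# writes the following, with two spaces between columns:
#
#   Name  Value
#   ----  -----
#   foo   1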
def ComparePerf(args, out_fh):
results_maps = [ResultsFromDir(args.results_dir_before),
ResultsFromDir(args.results_dir_after)]
# Set of all test case names, including those added or removed.
labels = set(results_maps[0].iterkeys())
labels.update(results_maps[1].iterkeys())
rows = [['Test case', 'Improve/regress?', 'Factor change',
'Mean before', 'Mean after']]
for label in sorted(labels):
if label not in results_maps[0]:
result = 'added'
factor_range = '-'
before_range = '-'
after_range = results_maps[1][label].FormatConfidenceInterval()
elif label not in results_maps[1]:
result = 'removed'
factor_range = '-'
before_range = results_maps[0][label].FormatConfidenceInterval()
after_range = '-'
else:
stats = [results_map[label] for results_map in results_maps]
interval_before = stats[0].interval
interval_after = stats[1].interval
factor_min = interval_after[0] / interval_before[1]
factor_max = interval_after[1] / interval_before[0]
if interval_after[0] >= interval_before[1]:
result = 'slower'
elif interval_after[1] <= interval_before[0]:
result = 'faster'
else:
result = 'no_sig_diff'
before_range = stats[0].FormatConfidenceInterval()
after_range = stats[1].FormatConfidenceInterval()
factor_range = '%.3f-%.3f' % (factor_min, factor_max)
rows.append([
label,
result,
factor_range,
before_range,
after_range])
FormatTable(rows, out_fh)
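# The output of ComparePerf is a table like the following, where the
# numbers here are made up for illustration:
#
#   Test case    Improve/regress?  Factor change  Mean before  Mean after
#   -----------  ----------------  -------------  -----------  -----------
#   ExampleTest  no_sig_diff       0.996-1.005    1200 +/- 10  1201 +/- 11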
def TotalSize(snapshot_file):
with open(snapshot_file) as fh:
data = json.load(fh)
return sum(info['size'] for info in data['blobs'].itervalues())
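# The snapshot file is expected to contain JSON of roughly this shape
# (a sketch showing only the fields read above):
#
#   {"blobs": {"<blob hash>": {"size": 1024}, ...}}
#
# TotalSize() returns the sum of the "size" fields, in bytes.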
def CompareSizes(args):
filenames = [args.snapshot_before, args.snapshot_after]
sizes = [TotalSize(filename) for filename in filenames]
print 'Size before: %d bytes' % sizes[0]
print 'Size after: %d bytes' % sizes[1]
print 'Difference: %d bytes' % (sizes[1] - sizes[0])
if sizes[0] != 0:
print 'Factor of: %f' % (float(sizes[1]) / sizes[0])
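# Example invocations, assuming this script is saved as perfcompare.py:
#
#   python perfcompare.py compare_perf <results_dir_before> <results_dir_after>
#   python perfcompare.py compare_sizes <snapshot_before> <snapshot_after>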
def Main(argv, out_fh):
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
parser_compare_perf = subparsers.add_parser(
'compare_perf',
help='Compare two sets of perf test results')
parser_compare_perf.add_argument('results_dir_before')
parser_compare_perf.add_argument('results_dir_after')
parser_compare_perf.set_defaults(
func=lambda args: ComparePerf(args, out_fh))
parser_compare_sizes = subparsers.add_parser(
'compare_sizes',
help='Compare file sizes specified by two system.snapshot files')
parser_compare_sizes.add_argument('snapshot_before')
parser_compare_sizes.add_argument('snapshot_after')
parser_compare_sizes.set_defaults(func=CompareSizes)
args = parser.parse_args(argv)
args.func(args)
if __name__ == '__main__':
Main(sys.argv[1:], sys.stdout)