# blob: a7afa021f14ebd14afb8c72ea56788ce89ec79cf [file] [log] [blame]
# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This tool is intended to be invoked by the performance comparison
# trybots. It compares the performance of two versions of Fuchsia. It
# can also compare binary sizes.
import argparse
import json
import math
import os
import sys
import tarfile
import scipy.stats
# For comparing results from a performance test, we calculate
# confidence intervals for the mean running times of the test. If the
# confidence intervals are non-overlapping, we conclude that the
# performance has improved or regressed for this test.
# Data is gathered from a 3-level sampling process:
# 1) Boot Fuchsia multiple times.
# 2) For each boot, launch the perf test process one or more times.
# 3) For each process launch, instantiate the performance test and
# run the body of the test some number of times.
# This is intended to account for variation across boots and across process
# launches.
# Currently we use t-test confidence intervals. This assumes that the
# values we apply the t-test to are normally distributed, or approximately
# normally distributed. In future we could instead use bootstrap
# confidence intervals, which would avoid that assumption.
# ALPHA is a parameter for calculating confidence intervals. It is
# the probability that the true value for the statistic we're
# estimating (here, the mean running time) lies outside the confidence
# interval.
ALPHA = 0.01  # With ALPHA = 0.01 the intervals are 99% confidence intervals.
# Returns the arithmetic mean of a sample as a float. The sample must
# support len() and must not be empty; an empty sample raises
# AssertionError.
def Mean(values):
    sample_size = len(values)
    if sample_size == 0:
        raise AssertionError('Mean is not defined for an empty sample')
    total = sum(values)
    return float(total) / sample_size
# Computes the mean and the sample standard deviation of a sample and
# returns them as a (mean, stddev) pair. The standard deviation uses
# Bessel's correction (dividing by n - 1 rather than n), so at least two
# values are required; a smaller sample raises AssertionError.
def MeanAndStddev(values):
    sample_size = len(values)
    if sample_size <= 1:
        raise AssertionError(
            "Sample size of %d is too small to calculate standard deviation "
            "with Bessel's correction" % sample_size)
    mean_val = Mean(values)
    # Accumulate the squared deviations from the mean one at a time.
    squared_deviation_total = 0.0
    for sample in values:
        deviation = sample - mean_val
        squared_deviation_total += deviation * deviation
    variance = squared_deviation_total / (sample_size - 1)
    return mean_val, math.sqrt(variance)
# Summary statistics for one sample of values: the sample mean together
# with a two-sided t-test confidence interval for the mean, computed
# using ALPHA. The sample needs at least two values (MeanAndStddev()
# rejects smaller samples).
class Stats(object):

    def __init__(self, values):
        count = len(values)
        mean, stddev = MeanAndStddev(values)
        # ppf(ALPHA / 2) is the lower-tail critical value of the
        # t-distribution with (count - 1) degrees of freedom; negating it
        # gives the multiplier for the half-width of the interval.
        t_critical = -scipy.stats.t.ppf(ALPHA / 2, count - 1)
        half_width = t_critical * stddev / math.sqrt(count)
        self._mean = mean
        self._offset = half_width
        # Confidence interval for the mean, as a (low, high) pair.
        self.interval = (mean - half_width, mean + half_width)

    # Returns the interval as text of the form "<mean> +/- <half-width>".
    # Both numbers are formatted with %d, so fractional parts are dropped.
    def FormatConfidenceInterval(self):
        centre_and_spread = (self._mean, self._offset)
        return '%d +/- %d' % centre_and_spread

    # Returns the relative CI width: the full width of the confidence
    # interval (twice the half-width) divided by the mean.
    def RelativeConfidenceIntervalWidth(self):
        full_width = self._offset * 2
        return full_width / self._mean
# Opens the named file for reading, parses its contents as JSON and
# returns the resulting tree.
def ReadJsonFile(filename):
    with open(filename, 'r') as json_file:
        parsed = json.load(json_file)
    return parsed
# Returns whether a filename names a perf test results file: any name
# ending in ".json" other than exactly "summary.json".
def IsResultsFilename(name):
    if name == 'summary.json':
        return False
    return name.endswith('.json')
# Read the raw perf test results from a directory or a tar file that
# contains results from a single boot of Fuchsia. This is a generator
# that yields one JSON tree per results file.
#
# Accepting tar files here is a convenience for when doing local testing of
# the statistics. The Swarming system used for the bots produces "out.tar"
# files as results.
#
# The directory (or tar file) is expected to contain files with names
# of the following forms:
#
#   <name-of-test-executable>_process<number>.json - results that are read here
#   <name-of-test-executable>_process<number>.catapult_json - ignored here
#   summary.json - ignored here
#
# Each *.json file (except for summary.json) contains results from a
# separate launch of a performance test process.
def RawResultsFromDir(filename):
    # Note that sorting the filename listing (from os.listdir() or from
    # tarfile) is not essential, but it helps to make any later processing
    # more deterministic.
    if os.path.isfile(filename):
        # Read from tar file.
        with tarfile.open(filename) as tar:
            for member in sorted(tar.getmembers(),
                                 key=lambda member: member.name):
                if IsResultsFilename(member.name):
                    yield json.load(tar.extractfile(member))
    else:
        # Read from directory.
        for name in sorted(os.listdir(filename)):
            if IsResultsFilename(name):
                yield ReadJsonFile(os.path.join(filename, name))
# Takes a list of filenames of perf test results, each representing the
# results from one boot of Fuchsia, and each in the format accepted by
# RawResultsFromDir().
#
# Values are reduced in three steps: each process run of a test case is
# reduced to the mean of its values (excluding the first), each boot is
# reduced to the mean across its process runs, and the per-boot means for
# each test case are then summarized by a Stats object.
#
# Returns a dict mapping test names to Stats objects.
def ResultsFromDirs(filenames):
    results_map = {}
    for boot_results_path in filenames:
        results_for_boot = {}
        for process_run_results in RawResultsFromDir(boot_results_path):
            for test_case in process_run_results:
                # Skip the running time from the test's initial run within
                # the process; treat it as a warmup run. The initial run
                # is often slower than later runs, so it would skew the
                # mean if we included it. The RoundTrip_*_MultiProcess
                # tests are an extreme case, because the first run waits
                # for a subprocess to start up. See PT-244.
                new_value = Mean(test_case['values'][1:])
                results_for_boot.setdefault(test_case['label'], []).append(
                    new_value)
        # Collapse this boot's process runs into one value per test case.
        for label, values in results_for_boot.items():
            results_map.setdefault(label, []).append(Mean(values))
    return {name: Stats(values) for name, values in results_map.items()}
# Loads perf test results covering multiple boots of Fuchsia from a
# single directory. The directory must contain a "by_boot" subdirectory
# whose entries are directories (or tar files) in the format read by
# RawResultsFromDir(); the entries are processed in sorted name order.
# Returns a dict mapping test names to Stats objects.
def ResultsFromDir(filename):
    assert os.path.exists(filename)
    boots_root = os.path.join(filename, 'by_boot')
    assert os.path.exists(boots_root), boots_root
    boot_names = os.listdir(boots_root)
    boot_names.sort()
    boot_paths = []
    for boot_name in boot_names:
        boot_paths.append(os.path.join(boots_root, boot_name))
    return ResultsFromDirs(boot_paths)
# Writes a plain-text table to out_fh. heading_row is a list of column
# heading strings and rows is a list of rows, each a list of strings with
# the same number of entries as heading_row (checked with an assert).
# Columns are left-aligned and padded to the widest cell plus two spaces;
# the heading row is underlined with dashes. The last column is not
# padded, so lines carry no trailing whitespace. The caller's rows list
# is not modified.
def FormatTable(heading_row, rows, out_fh):
    column_count = len(heading_row)
    for row in rows:
        assert len(row) == column_count
    # Build a new list so that the caller's list is left untouched.
    rows = [heading_row] + rows
    widths = [2 + max(len(row[col_number]) for row in rows)
              for col_number in range(column_count)]
    # Underline the heading row.
    rows.insert(1, ['-' * (width - 2) for width in widths])
    for row in rows:
        for col_number, value in enumerate(row):
            out_fh.write(value)
            # Pad every column except the last up to the column width.
            if col_number < column_count - 1:
                out_fh.write(' ' * (widths[col_number] - len(value)))
        out_fh.write('\n')
# Compares two sets of perf test results and writes a report to out_fh.
#
# args.results_dir_before and args.results_dir_after are directories in
# the format read by ResultsFromDir(). Each test case is classified as:
#   added       - present only in the "after" results
#   removed     - present only in the "before" results
#   slower      - the "after" confidence interval lies entirely above
#                 the "before" interval
#   faster      - the "after" confidence interval lies entirely below
#                 the "before" interval
#   no_sig_diff - the two confidence intervals overlap
#
# The report contains summary counts, a table of the test cases that
# differ (if there are any), and a table of all test cases.
#
# NOTE(review): blank-line separators between the report sections may
# have existed in an earlier version of this function; only the output
# evidenced by the surviving code is produced here.
def ComparePerf(args, out_fh):
    results_maps = [ResultsFromDir(args.results_dir_before),
                    ResultsFromDir(args.results_dir_after)]
    # Set of all test case names, including those added or removed.
    labels = set(results_maps[0])
    labels.update(results_maps[1])
    counts = {
        'added': 0,
        'removed': 0,
        'faster': 0,
        'slower': 0,
        'no_sig_diff': 0,
    }
    heading_row = ['Test case', 'Improve/regress?', 'Factor change',
                   'Mean before', 'Mean after']
    all_rows = []
    diff_rows = []
    for label in sorted(labels):
        if label not in results_maps[0]:
            result = 'added'
            factor_range = '-'
            before_range = '-'
            after_range = results_maps[1][label].FormatConfidenceInterval()
        elif label not in results_maps[1]:
            result = 'removed'
            factor_range = '-'
            before_range = results_maps[0][label].FormatConfidenceInterval()
            after_range = '-'
        else:
            stats = [results_map[label] for results_map in results_maps]
            interval_before = stats[0].interval
            interval_after = stats[1].interval
            # The extremes of the after/before ratio: the smallest pairs
            # the low end of "after" with the high end of "before", and
            # the largest pairs the opposite ends.
            factor_min = interval_after[0] / interval_before[1]
            factor_max = interval_after[1] / interval_before[0]
            if interval_after[0] >= interval_before[1]:
                result = 'slower'
            elif interval_after[1] <= interval_before[0]:
                result = 'faster'
            else:
                result = 'no_sig_diff'
            before_range = stats[0].FormatConfidenceInterval()
            after_range = stats[1].FormatConfidenceInterval()
            factor_range = '%.3f-%.3f' % (factor_min, factor_max)
        counts[result] += 1
        row = [label, result, factor_range, before_range, after_range]
        all_rows.append(row)
        if result != 'no_sig_diff':
            diff_rows.append(row)

    # Writes one summary line, using the singular noun for a count of 1.
    def FormatCount(count, text):
        noun = 'test case' if count == 1 else 'test cases'
        out_fh.write(' %d %s %s\n' % (count, noun, text))

    out_fh.write('Summary counts:\n')
    FormatCount(len(labels), 'in total')
    FormatCount(counts['no_sig_diff'],
                'had no significant difference (no_sig_diff)')
    FormatCount(counts['faster'], 'got faster')
    FormatCount(counts['slower'], 'got slower')
    FormatCount(counts['added'], 'added')
    FormatCount(counts['removed'], 'removed')
    if len(diff_rows) != 0:
        out_fh.write('Results from test cases with differences:\n\n')
        FormatTable(heading_row, diff_rows, out_fh)
    out_fh.write('Results from all test cases:\n\n')
    FormatTable(heading_row, all_rows, out_fh)
# Returns True if the two intervals overlap. Each interval is indexed as
# (low, high). Intervals that merely touch at an endpoint are treated as
# not intersecting.
def IntervalsIntersect(interval1, interval2):
    # interval2 starts at or beyond the end of interval1.
    if interval2[0] >= interval1[1]:
        return False
    # interval2 ends at or before the start of interval1.
    if interval2[1] <= interval1[0]:
        return False
    return True
# Calculate the rate at which two intervals drawn (without replacement)
# from the given set of intervals will be non-intersecting.
#
# Every unordered pair of intervals is compared once, and the result is
# the fraction of pairs that do not intersect, as a float. At least two
# intervals are required, since otherwise there are no pairs to compare;
# fewer than two raises AssertionError.
def MismatchRate(intervals):
    interval_count = len(intervals)
    if interval_count < 2:
        raise AssertionError(
            'Mismatch rate is not defined for fewer than two intervals')
    mismatch_count = sum(
        int(not IntervalsIntersect(intervals[i], intervals[j]))
        for i in range(interval_count)
        for j in range(i))
    # Number of unordered pairs: n * (n - 1) / 2, which is always a whole
    # number, so integer division is exact.
    comparisons_count = interval_count * (interval_count - 1) // 2
    return float(mismatch_count) / comparisons_count
# Outputs statistics for validating the confidence intervals used when
# comparing perf results.
#
# args.results_dirs is a list of per-boot results (each in the format
# accepted by RawResultsFromDir()). The boots are split, in order, into
# consecutive groups of args.group_size boots; any leftover boots that do
# not fill a whole group are ignored. Each group is summarized with
# ResultsFromDirs(), and for each test case the confidence intervals from
# the different groups are compared against each other with
# MismatchRate(). The report is written to out_fh.
def ValidatePerfCompare(args, out_fh):
    boot_count = len(args.results_dirs)
    group_size = args.group_size
    # Whole groups only; the remainder is reported as leftover boots.
    group_count = boot_count // group_size
    results_maps = [
        ResultsFromDirs(
            args.results_dirs[i * group_size : (i + 1) * group_size])
        for i in range(group_count)]
    # Group by test name (label).
    by_test = {}
    for results_map in results_maps:
        for label, stats in results_map.items():
            by_test.setdefault(label, []).append(stats)
    out_fh.write('Rate of mismatches (non-intersections) '
                 'of confidence intervals for each test:\n')
    mismatch_rates = []
    for label, stats_list in sorted(by_test.items()):
        mismatch_rate = MismatchRate([stats.interval for stats in stats_list])
        out_fh.write('%f %s\n' % (mismatch_rate, label))
        mismatch_rates.append(mismatch_rate)
    mean_relative_ci_width = Mean([
        stats.RelativeConfidenceIntervalWidth()
        for results_map in results_maps
        for stats in results_map.values()])
    mean_val = Mean(mismatch_rates)
    out_fh.write('Mean mismatch rate: %f\n' % mean_val)
    out_fh.write('Mean relative confidence interval width: %f\n'
                 % mean_relative_ci_width)
    out_fh.write('Number of test cases: %d\n' % len(mismatch_rates))
    out_fh.write('Number of result sets: %d groups of %d boots each'
                 ' (ignoring %d leftover boots)\n'
                 % (group_count, group_size,
                    boot_count - group_size * group_count))
    out_fh.write('Expected number of test cases with mismatches: %f\n'
                 % (mean_val * len(mismatch_rates)))
# Returns the sum of the 'size' fields of all the entries under the
# 'blobs' key of the given JSON snapshot file.
def TotalSize(snapshot_file):
    with open(snapshot_file) as fh:
        data = json.load(fh)
    # values() works on both Python 2 and Python 3 (itervalues() is
    # Python 2-only).
    return sum(info['size'] for info in data['blobs'].values())
# Prints a comparison of the total blob sizes of two snapshot files,
# args.snapshot_before and args.snapshot_after: both sizes, their
# difference and, when the "before" size is non-zero, the ratio of
# "after" to "before".
def CompareSizes(args):
    filenames = [args.snapshot_before, args.snapshot_after]
    sizes = [TotalSize(filename) for filename in filenames]
    # Each print() call takes a single pre-formatted string, so this
    # prints the same text under Python 2 and Python 3.
    print('Size before: %d bytes' % sizes[0])
    print('Size after: %d bytes' % sizes[1])
    print('Difference: %d bytes' % (sizes[1] - sizes[0]))
    # The ratio is undefined when the "before" size is zero.
    if sizes[0] != 0:
        print('Factor of: %f' % (float(sizes[1]) / sizes[0]))
# Command line entry point. Parses argv (the argument list without the
# program name), and runs the selected subcommand. The perf subcommands
# write their output to out_fh.
def Main(argv, out_fh):
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    parser_compare_perf = subparsers.add_parser(
        'compare_perf',
        help='Compare two sets of perf test results')
    parser_compare_perf.add_argument('results_dir_before')
    parser_compare_perf.add_argument('results_dir_after')
    parser_compare_perf.set_defaults(
        func=lambda args: ComparePerf(args, out_fh))

    parser_validate_perfcompare = subparsers.add_parser(
        'validate_perfcompare',
        help='Outputs statistics given multiple sets of perf test results'
        ' that come from the same build. This is for validating the'
        ' statistics used by the perfcompare tool. It can be used to check'
        ' the rate at which the tool will falsely indicate that performance'
        ' of a test case has regressed or improved.')
    parser_validate_perfcompare.add_argument(
        '-g', '--group_size', type=int, required=True,
        help='Number of boots to put in each group. To get realistic'
        ' results that reflect how the perfcompare trybots would behave,'
        ' this should match the boots_per_revision setting in the'
        ' recipe. (Since that code is currently'
        ' not part of the Fuchsia checkout, we cannot make the settings'
        ' match automatically.)')
    parser_validate_perfcompare.add_argument('results_dirs', nargs='+')
    parser_validate_perfcompare.set_defaults(
        func=lambda args: ValidatePerfCompare(args, out_fh))

    parser_compare_sizes = subparsers.add_parser(
        'compare_sizes',
        help='Compare file sizes specified by two system.snapshot files')
    parser_compare_sizes.add_argument('snapshot_before')
    parser_compare_sizes.add_argument('snapshot_after')
    parser_compare_sizes.set_defaults(func=lambda args: CompareSizes(args))

    args = parser.parse_args(argv)
    # On Python 3, argparse accepts a command line with no subcommand, in
    # which case no 'func' default is set. Report that as a usage error
    # rather than failing with an AttributeError.
    func = getattr(args, 'func', None)
    if func is None:
        parser.error('a subcommand is required')
    func(args)
# When run as a script, process the command line arguments (without the
# program name) and send output to stdout.
if __name__ == '__main__':
    Main(argv=sys.argv[1:], out_fh=sys.stdout)