# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This tool is intended to be invoked by the performance comparison
# trybots. It compares the performance of two versions of Fuchsia.
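#
# Example invocations (illustrative only; this assumes the script is
# saved as "perfcompare.py", and the angle-bracket placeholders are
# hypothetical):
#
#   python3 perfcompare.py compare_perf <before_dataset_dir> <after_dataset_dir>
#   python3 perfcompare.py run_local --boots 10 \
#       --iter_cmd '<command that runs the perf test>' \
#       --iter_file '<glob matching the *.fuchsiaperf.json output files>' \
#       --dest <output_dataset_dir>
#   python3 perfcompare.py validate_perfcompare -g <group_size> <boot_dataset_dir>...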
import argparse
import glob
import json
import math
import os
import subprocess
import sys
import tarfile
import scipy.stats
# For comparing results from a performance test, we calculate
# confidence intervals for the mean running times of the test. If the
# confidence intervals are non-overlapping, we conclude that the
# performance has improved or regressed for this test.
#
# Data is gathered from a 3-level sampling process:
#
# 1) Boot Fuchsia multiple times.
# 2) For each boot, launch the perf test process one or more times.
# 3) For each process launch, instantiate the performance test and
# run the body of the test some number of times.
#
# This is intended to account for variation across boots and across process
# launches.
#
# Currently we use t-test confidence intervals. This assumes that the
# values we apply the t-test to are normally distributed, or approximately
# normally distributed. In the future we could instead use bootstrap
# confidence intervals, which would avoid that assumption.
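#
# Concretely, for a sample of n values with mean m and Bessel-corrected
# standard deviation s, the confidence interval computed below (in the
# Stats class) is:
#
#   m +/- t.ppf(1 - ALPHA/2, n - 1) * s / sqrt(n)
#
# where t.ppf() is the Student's t quantile function from scipy.stats.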
# Dataset types:
#
# There are four types of dataset containing raw perf test results:
#
# * Process dataset: JSON data from a single *.fuchsiaperf.json file,
# which is usually from a single process launch. This may contain
# results from multiple test cases.
#
# * Boot dataset: Data from a single boot of Fuchsia. This may contain
# multiple process datasets.
#
# * Multi-boot dataset: Data from multiple boots of a single build of
# Fuchsia. This may contain multiple boot datasets.
#
# * Before/after dataset: Contains two multi-boot datasets, one from a
# "before" build of Fuchsia and one from an "after" build.
#
# Note that we use the term "dataset" rather than "results" because the
# former makes it easier to disambiguate using singular vs. plural. For
# example, "boot_results" is ambiguous as to whether it represents a single
# boot or whether it is a list where each entry is a "boot result"
# representing a single boot. In contrast, "boot_dataset" (always a single
# instance) vs. "boot_datasets" (always a list or iterable) avoids that
# ambiguity.
#
# The infra recipe represents those datasets on the filesystem as follows:
#
# * Process dataset: a single .fuchsiaperf.json file.
#
# * Boot dataset: a directory containing files with names of the following
# forms:
#
# <test-executable-name>_process<number>.fuchsiaperf.json - process dataset
# <test-executable-name>_process<number>.catapult_json - ignored here
# summary.json - ignored here
#
# The code below can read a boot dataset from tar files as well as from
# directories. Accepting tar files is a convenience when doing local
# testing of the statistics (including for validate_perfcompare).
# The Swarming system used for the bots produces "out.tar" files as
# results.
#
# * Multi-boot dataset: a directory containing a "by_boot" subdirectory,
# which contains boot dataset directories.
#
# A before/after dataset is represented as two directories.
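#
# For illustration, a multi-boot dataset directory might be laid out as
# follows (the directory and executable names here are hypothetical):
#
#   my_dataset/
#     by_boot/
#       boot000000/
#         my_perf_test_process000000.fuchsiaperf.json
#         my_perf_test_process000000.catapult_json
#         summary.json
#       boot000001/
#         ...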
# ALPHA is a parameter for calculating confidence intervals. It is
# the probability that the true value for the statistic we're
# estimating (here, the mean running time) lies outside the confidence
# interval.
ALPHA = 0.01
def Mean(values):
if len(values) == 0:
raise AssertionError("Mean is not defined for an empty sample")
return float(sum(values)) / len(values)
# Returns the mean and standard deviation of a sample. This applies
# Bessel's correction to the calculation of the standard deviation.
#
# If the sample contains only a single value, this returns None for the
# standard deviation, because we cannot estimate the standard deviation
# with Bessel's correction in that case.
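#
# For example (illustrative values), MeanAndStddev([1.0, 2.0, 3.0])
# returns (2.0, 1.0), since the Bessel-corrected variance is
# ((1-2)^2 + (2-2)^2 + (3-2)^2) / (3 - 1) = 1.0.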
def MeanAndStddev(values):
mean_val = Mean(values)
if len(values) == 1:
return mean_val, None
sum_of_squares = 0.0
for val in values:
diff = val - mean_val
sum_of_squares += diff * diff
stddev_val = math.sqrt(sum_of_squares / (len(values) - 1))
return mean_val, stddev_val
def FormatDecimal(val, decimal_places):
return ("%%.%df" % decimal_places) % val
# Format the given "value +/- offset" confidence interval as a string.
#
# This prints a number of decimal fraction digits that is appropriate to
# the width of the confidence interval. The offset part is formatted to 2
# significant figures. The value part is formatted with the same number of
# decimal places as the offset.
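#
# For example (illustrative values), FormatConfidenceInterval(0.1234,
# 0.0123) returns "0.123 +/- 0.012": the offset 0.0123 is rounded to 2
# significant figures (3 decimal places here), and the value is printed
# with the same number of decimal places.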
def FormatConfidenceInterval(value, offset):
if math.isinf(offset) or math.isnan(offset) or offset <= 0:
return "%g +/- %g" % (value, offset)
significant_figures = 2
# Applying math.floor() ensures that powers of 10 and non-powers of 10
# (e.g. 0.10 and 0.11) are both formatted with the same number of
# decimal places.
log_value = int(math.floor(math.log10(offset)))
decimal_places = max(significant_figures - log_value - 1, 0)
return "%s +/- %s" % (
FormatDecimal(value, decimal_places),
FormatDecimal(offset, decimal_places),
)
class Stats(object):
def __init__(self, values, unit):
self.unit = unit
sample_size = len(values)
mean, stddev = MeanAndStddev(values)
self._mean = mean
if stddev is None:
self._offset = None
self.interval = None
else:
self._offset = (
-scipy.stats.t.ppf(ALPHA / 2, sample_size - 1)
* stddev
/ math.sqrt(sample_size)
)
# Confidence interval for the mean.
self.interval = (mean - self._offset, mean + self._offset)
def FormatConfidenceInterval(self):
if self._offset is None:
# Point estimate only: We cannot calculate a confidence
# interval because the sample only contained a single value.
#
# Explicitly use 12 significant figures to match what str()
# does on floats in Python 2. This gives plenty of precision
# for practical use while suppressing rounding noise created by
# various operations.
return "%.12g %s" % (self._mean, self.unit)
return "%s %s" % (
FormatConfidenceInterval(self._mean, self._offset),
self.unit,
)
# Returns the relative CI width, which is the width of the confidence
# interval divided by the mean.
def RelativeConfidenceIntervalWidth(self):
assert self._offset is not None
return self._offset * 2 / self._mean
def StatsFormatConfidenceInterval(stats):
if stats is None:
return "-"
return stats.FormatConfidenceInterval()
def ReadJsonFile(filename):
with open(filename, "r") as fh:
return json.load(fh)
def IsResultsFilename(name):
return name.endswith(".fuchsiaperf.json")
class SingleBootDataset(object):
def __init__(self, filename):
self._filename = filename
def GetProcessDatasets(self):
# Note that sorting the filename listing (from os.walk() or from
# tarfile) is not essential, but it helps to make any later processing
# more deterministic.
if os.path.isfile(self._filename):
# Read from tar file.
with tarfile.open(self._filename) as tar:
for member in sorted(
tar.getmembers(), key=lambda member: member.name
):
if IsResultsFilename(member.name):
yield json.load(tar.extractfile(member))
else:
# Read from directory.
for dir_path, _, file_names in sorted(os.walk(self._filename)):
for name in sorted(file_names):
if IsResultsFilename(name):
yield ReadJsonFile(os.path.join(dir_path, name))
class MultiBootDataset(object):
def __init__(self, dir_path):
self._dir_path = dir_path
def GetBootDatasets(self):
by_boot_dir = os.path.join(self._dir_path, "by_boot")
assert os.path.exists(by_boot_dir), by_boot_dir
for name in sorted(os.listdir(by_boot_dir)):
yield SingleBootDataset(os.path.join(by_boot_dir, name))
# Takes a list of values that are collected from consecutive runs of a
# test. For libperftest tests, those are test runs within a process.
#
# Returns the mean of the values, but excluding the first run. We treat
# the initial run as a warmup run. The initial run is often slower than
# later runs, so it would skew the mean if we included it. The
# RoundTrip_*_MultiProcess tests are an extreme case, because the first run
# waits for a subprocess to start up. See https://fxbug.dev/42097154.
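#
# For example (illustrative values), MeanExcludingWarmup([10.0, 2.0, 4.0])
# returns 3.0: the first (warmup) value is dropped and the mean of
# [2.0, 4.0] is taken.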
def MeanExcludingWarmup(values):
# Some tests report a single value per process run. For those tests,
# we use that value and don't discard it.
if len(values) == 1:
return values[0]
return Mean(values[1:])
def FormatTestName(results):
return "%s: %s" % (results["test_suite"], results["label"])
UNIT_ABBREVIATIONS = {"milliseconds": "ms", "nanoseconds": "ns"}
def FormatUnit(unit_set):
assert len(unit_set) > 0
if len(unit_set) > 1:
raise AssertionError("Inconsistent units for test case: %s" % unit_set)
unit = list(unit_set)[0]
return UNIT_ABBREVIATIONS.get(unit, unit)
# This is the set of unit strings that are accepted by catapult_converter
# and treated as bigger-is-better, and that don't have an explicit
# "_biggerIsBetter" suffix.
BIGGERISBETTER_UNITS = {
"bits/second",
"bytes/second",
"frames/second",
}
def UnitBiggerIsBetter(unit):
# We don't attempt to do any validation of the unit string here. We
# assume that was done at an earlier stage when the test was run and
# when it invoked catapult_converter, which validates the unit string.
return unit in BIGGERISBETTER_UNITS or unit.endswith("_biggerIsBetter")
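# For example (illustrative unit strings), UnitBiggerIsBetter("bytes/second")
# and UnitBiggerIsBetter("widgets_biggerIsBetter") return True, whereas
# UnitBiggerIsBetter("nanoseconds") returns False.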
# Takes a sequence of boot datasets and produces summary statistics.
# Returns a dict mapping test names to Stats objects.
def StatsFromBootDatasets(boot_datasets):
# Mapping from test names to lists of values.
results_map = {}
# Mapping from test names to sets of strings (for units of measurement).
units_map = {}
for boot_dataset in boot_datasets:
results_for_boot = {}
for process_dataset in boot_dataset.GetProcessDatasets():
for test_case in process_dataset:
new_value = MeanExcludingWarmup(test_case["values"])
name = FormatTestName(test_case)
results_for_boot.setdefault(name, []).append(new_value)
units_map.setdefault(name, set()).add(test_case["unit"])
for label, values in results_for_boot.items():
results_map.setdefault(label, []).append(Mean(values))
return {
name: Stats(values, FormatUnit(units_map[name]))
for name, values in results_map.items()
}
def StatsFromMultiBootDataset(multi_boot_dataset):
return StatsFromBootDatasets(multi_boot_dataset.GetBootDatasets())
def FormatFactor(val_before, val_after):
# Avoid division by zero.
if val_before == 0:
return "inf"
return "%.3f" % (val_after / val_before)
def FormatFactorRange(interval_before, interval_after):
if interval_before == (0, 0) and interval_after == (0, 0):
return "no_change"
if interval_before[0] < 0 or interval_after[0] < 0:
return "ci_too_wide"
factor_min = FormatFactor(interval_before[1], interval_after[0])
factor_max = FormatFactor(interval_before[0], interval_after[1])
return "%s-%s" % (factor_min, factor_max)
def FormatTable(heading_row, rows, out_fh):
column_count = len(heading_row)
for row in rows:
assert len(row) == column_count
rows = [heading_row] + rows
widths = [
2 + max(len(row[col_number]) for row in rows)
for col_number in range(column_count)
]
# Underline the heading row.
rows.insert(1, ["-" * (width - 2) for width in widths])
for row in rows:
for col_number, value in enumerate(row):
out_fh.write(value)
if col_number < column_count - 1:
out_fh.write(" " * (widths[col_number] - len(value)))
out_fh.write("\n")
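# For example (illustrative values), FormatTable(["Test case", "Mean"],
# [["foo", "10 ms"]], sys.stdout) writes:
#
#   Test case  Mean
#   ---------  -----
#   foo        10 ms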
DIRECTION_MAP = {0: "no_sig_diff", -1: "improved", 1: "regressed"}
def CompareIntervals(stats_before, stats_after):
assert stats_before is not None or stats_after is not None
if stats_before is None:
return "added", "-"
if stats_after is None:
return "removed", "-"
if stats_before.interval is None or stats_after.interval is None:
return "point_estimate", "-"
# Using a ">" comparison rather than ">=" ensures that if the intervals
# are equal and zero-width, they are treated as "no_sig_diff".
if stats_after.interval[0] > stats_before.interval[1]:
direction = 1
elif stats_after.interval[1] < stats_before.interval[0]:
direction = -1
else:
direction = 0
# If the units changed, we use the new units to determine whether this
# is a bigger-is-better metric or a smaller-is-better metric. That is,
# we assume that the change was intentional and that the new units are
# more correct.
# TODO: We might want to warn in the cases where units are changed, or
# do automatic conversions where possible (such as between milliseconds
# and nanoseconds).
if UnitBiggerIsBetter(stats_after.unit):
direction = -direction
factor_range = FormatFactorRange(
stats_before.interval, stats_after.interval
)
return DIRECTION_MAP[direction], factor_range
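# For example (illustrative values), if stats_before has interval
# (10.0, 12.0) and stats_after has interval (15.0, 18.0), both in a
# smaller-is-better unit such as "ns", CompareIntervals returns
# ("regressed", "1.250-1.800"): the intervals do not overlap and the
# running time increased.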
def ComparePerf(args, out_fh):
results_maps = [
StatsFromMultiBootDataset(MultiBootDataset(dir_path))
for dir_path in args.results_dir
]
# Set of all test case names, including those added or removed.
labels = set()
for results_map in results_maps:
labels.update(results_map.keys())
if len(results_maps) != 2:
# Display the dataset(s) without doing any comparison.
heading_row = ["Test case"]
if len(results_maps) == 1:
heading_row.extend(["Mean"])
else:
heading_row.extend(
["Mean %d" % (idx + 1) for idx in range(len(results_maps))]
)
rows = []
for label in sorted(labels):
row = [label]
for results_map in results_maps:
row.append(
StatsFormatConfidenceInterval(results_map.get(label))
)
rows.append(row)
FormatTable(heading_row, rows, out_fh)
return
counts = {
"added": 0,
"removed": 0,
"improved": 0,
"regressed": 0,
"no_sig_diff": 0,
"point_estimate": 0,
}
heading_row = [
"Test case",
"Improve/regress?",
"Factor change",
"Mean before",
"Mean after",
]
all_rows = []
diff_rows = []
for label in sorted(labels):
stats = [results_map.get(label) for results_map in results_maps]
result, factor_range = CompareIntervals(stats[0], stats[1])
counts[result] += 1
row = [
label,
result,
factor_range,
StatsFormatConfidenceInterval(stats[0]),
StatsFormatConfidenceInterval(stats[1]),
]
all_rows.append(row)
if result not in ("no_sig_diff", "point_estimate"):
diff_rows.append(row)
def FormatCount(count, text):
noun = "test case" if count == 1 else "test cases"
out_fh.write(" %d %s %s\n" % (count, noun, text))
out_fh.write("Summary counts:\n")
FormatCount(len(labels), "in total")
FormatCount(
counts["no_sig_diff"], "had no significant difference (no_sig_diff)"
)
if counts["point_estimate"]:
FormatCount(
counts["point_estimate"],
"cannot be compared because we have point estimates only",
)
FormatCount(counts["improved"], "improved")
FormatCount(counts["regressed"], "regressed")
FormatCount(counts["added"], "added")
FormatCount(counts["removed"], "removed")
out_fh.write("\n\n")
if len(diff_rows) != 0:
out_fh.write("Results from test cases with differences:\n\n")
FormatTable(heading_row, diff_rows, out_fh)
out_fh.write("\n\n")
out_fh.write("Results from all test cases:\n\n")
FormatTable(heading_row, all_rows, out_fh)
def PrintMultibootDatasetTable(multiboot_dataset, out_fh):
stats_map = StatsFromMultiBootDataset(multiboot_dataset)
heading_row = ["Test case", "Mean"]
rows = []
for name, stats in sorted(stats_map.items()):
rows.append([name, stats.FormatConfidenceInterval()])
FormatTable(heading_row, rows, out_fh)
def RunLocal(args, out_fh, run_cmd):
if glob.glob(args.iter_file) != []:
# We check for this case so that we don't accidentally treat
# pre-existing files the same as files newly written by
# args.iter_cmd.
raise AssertionError(
"Temporary output file(s) %r already exist: try deleting them first"
% args.iter_file
)
if os.path.exists(args.dest):
raise AssertionError(
"Destination path %r already exists: either delete it or use"
" a different destination, because run_local will not"
" overwrite it or append to it" % args.dest
)
by_boot_dir = os.path.join(args.dest, "by_boot")
os.mkdir(args.dest)
os.mkdir(by_boot_dir)
for boot_idx in range(args.boots):
# This prefix enables error-checking in the shell commands, for
# both safety and convenience.
errexit_prefix = "set -o errexit -o nounset; "
run_cmd(errexit_prefix + args.reboot_cmd, shell=True)
run_cmd(errexit_prefix + args.iter_cmd, shell=True)
boot_dir = os.path.join(by_boot_dir, "boot%06d" % boot_idx)
os.mkdir(boot_dir)
dataset_files = sorted(glob.glob(args.iter_file))
for idx, dataset_file in enumerate(dataset_files):
new_filename = os.path.join(
boot_dir, "results%06d.fuchsiaperf.json" % idx
)
os.rename(dataset_file, new_filename)
# Print a table of the results so far. This prints confidence
# intervals, which requires having results from at least 2 boots.
if boot_idx >= 1:
out_fh.write("\nResults after %d boots:\n\n" % (boot_idx + 1))
PrintMultibootDatasetTable(MultiBootDataset(args.dest), out_fh)
out_fh.write("\n")
def IntervalsIntersect(interval1, interval2):
return not (interval2[0] >= interval1[1] or interval2[1] <= interval1[0])
# Calculate the rate at which two intervals drawn (without replacement)
# from the given set of intervals will be non-intersecting.
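#
# For example (illustrative values), MismatchRate([(0, 1), (2, 3),
# (0.5, 2.5)]) returns 1/3, because only the pair (0, 1) and (2, 3) is
# non-intersecting among the 3 possible pairs.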
def MismatchRate(intervals):
mismatch_count = sum(
int(not IntervalsIntersect(intervals[i], intervals[j]))
for i in range(len(intervals))
for j in range(i)
)
comparisons_count = len(intervals) * (len(intervals) - 1) / 2
return float(mismatch_count) / comparisons_count
def ValidatePerfCompare(args, out_fh):
boot_datasets = [
SingleBootDataset(filename) for filename in args.results_dirs
]
boot_count = len(boot_datasets)
group_size = args.group_size
group_count = boot_count // group_size
results_maps = [
StatsFromBootDatasets(
boot_datasets[i * group_size : (i + 1) * group_size]
)
for i in range(group_count)
]
# Group by test name (label).
by_test = {}
for results_map in results_maps:
for label, stats in results_map.items():
by_test.setdefault(label, []).append(stats)
out_fh.write(
"Rate of mismatches (non-intersections) "
"of confidence intervals for each test:\n"
)
mismatch_rates = []
for label, stats_list in sorted(by_test.items()):
mismatch_rate = MismatchRate([stats.interval for stats in stats_list])
out_fh.write("%f %s\n" % (mismatch_rate, label))
mismatch_rates.append(mismatch_rate)
mean_relative_ci_width = Mean(
[
stats.RelativeConfidenceIntervalWidth()
for results_map in results_maps
for stats in results_map.values()
]
)
out_fh.write("\n")
mean_val = Mean(mismatch_rates)
out_fh.write("Mean mismatch rate: %f\n" % mean_val)
out_fh.write(
"Mean relative confidence interval width: %f\n" % mean_relative_ci_width
)
out_fh.write("Number of test cases: %d\n" % len(mismatch_rates))
out_fh.write(
"Number of result sets: %d groups of %d boots each"
" (ignoring %d leftover boots)\n"
% (group_count, group_size, boot_count - group_size * group_count)
)
out_fh.write(
"Expected number of test cases with mismatches: %f\n"
% (mean_val * len(mismatch_rates))
)
def Main(argv, out_fh, run_cmd=subprocess.check_call):
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(required=True)
subparser = subparsers.add_parser(
"compare_perf",
help="Display sets of perf test results. "
" If given two datasets, this will compare the two, showing whether"
" tests had regressions or improvements. "
" Otherwise (if given 1 or >2 datasets), the data is shown with no"
" comparisons.",
)
subparser.add_argument("results_dir", nargs="+")
subparser.set_defaults(func=lambda args: ComparePerf(args, out_fh))
subparser = subparsers.add_parser(
"run_local",
help="Gather a multi-boot dataset of performance test results"
" from a single version of Fuchsia by locally running the command"
" specified by --iter_cmd",
)
subparser.add_argument(
"--boots",
type=int,
required=True,
help="Number of (re)boots of Fuchsia to run",
)
subparser.add_argument(
"--iter_cmd",
required=True,
help="Command for running a performance test. "
" This command is run locally: it is passed to the shell. "
" This command is expected to write its output to the file (or files)"
" specified by --iter_file. "
" Note that error-checking is enabled for this shell command (using"
' "set -o errexit -o nounset")',
)
subparser.add_argument(
"--iter_file",
required=True,
help="File(s) that the performance test will write its results to. "
" This is a glob expression, so it may specify multiple files. "
" Each file is expected to be a process dataset in the"
" *.fuchsiaperf.json format. These files will be removed (renamed)"
" by this tool",
)
subparser.add_argument(
"--reboot_cmd",
default="fx reboot && fx wait",
help="Command to use for rebooting Fuchsia. This is optional. "
" The default is %(default)r. As with --iter_cmd, error-checking is"
" enabled for this shell command",
)
subparser.add_argument(
"--dest",
required=True,
help="Destination directory for writing the multi-boot dataset",
)
subparser.set_defaults(func=lambda args: RunLocal(args, out_fh, run_cmd))
subparser = subparsers.add_parser(
"validate_perfcompare",
help="Outputs statistics given multiple sets of perf test results"
" that come from the same build. This is for validating the"
" statistics used by the perfcompare tool. It can be used to check"
" the rate at which the tool will falsely indicate that performance"
" of a test case has regressed or improved.",
)
subparser.add_argument(
"-g",
"--group_size",
type=int,
required=True,
help="Number of boots to put in each group. To get realistic"
" results that reflect how the perfcompare trybots would behave,"
" this should match the boots_per_revision setting in the"
" infra recipe. (Since that code is currently not part of the"
" Fuchsia checkout, we cannot make the settings match"
" automatically.)",
)
subparser.add_argument("results_dirs", nargs="+")
subparser.set_defaults(func=lambda args: ValidatePerfCompare(args, out_fh))
args = parser.parse_args(argv)
args.func(args)
if __name__ == "__main__":
Main(sys.argv[1:], sys.stdout)