# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import io
import json
import os
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
import unittest

import numpy

import perfcompare


# Test case helper class for creating temporary directories that will
# be cleaned up when the test finishes.
class TempDirTestCase(unittest.TestCase):

    def setUp(self):
        self._on_teardown = []

    def MakeTempDir(self):
        temp_dir = tempfile.mkdtemp(
            prefix='tmp_unittest_%s_' % self.__class__.__name__)
        def tear_down():
            shutil.rmtree(temp_dir)
        self._on_teardown.append(tear_down)
        return temp_dir

    def tearDown(self):
        for func in reversed(self._on_teardown):
            func()


def WriteJsonFile(filename, json_data):
    with open(filename, 'w') as fh:
        json.dump(json_data, fh)


def ReadGoldenFile(filename):
    with open(filename, 'r') as fh:
        data = fh.read()
    matches = list(re.finditer('\n\n### (.*)\n', data, re.M))
    starts = [m.end() for m in matches]
    ends = [m.start() for m in matches[1:]] + [len(data)]
    for m, start, end in zip(matches, starts, ends):
        yield m.group(1), data[start:end]


# Helper for checking against test expectations in a golden file.
# This provides an implementation of AssertCaseEq() that compares
# results against the golden file.
class GoldenDataInput(object):

    def __init__(self, filename):
        self._cases = dict(ReadGoldenFile(filename))

    def AssertCaseEq(self, name, actual):
        expected = self._cases[name]
        if expected != actual:
            raise AssertionError('"%s" != "%s"' % (actual, expected))


# This provides an implementation of AssertCaseEq() that updates the
# golden file with new expectations generated by the tests.
class GoldenDataOutput(object):

    def __init__(self):
        self._cases = {}

    def AssertCaseEq(self, name, actual):
        assert name not in self._cases, name
        self._cases[name] = actual

    def WriteFile(self, filename):
        with open(filename, 'w') as fh:
            for name, data in sorted(self._cases.items()):
                fh.write('\n\n### %s\n%s' % (name, data))
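

# For illustration (with made-up case names): a golden file written by
# GoldenDataOutput.WriteFile() above, and parsed back by ReadGoldenFile(),
# contains one record per case, each introduced by a blank line and a
# "### <case name>" header, with the case data running up to the next header:
#
#     ### case_a
#     expected output for case_a
#
#     ### case_b
#     expected output for case_b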


GOLDEN_FILE = os.path.join(os.path.dirname(__file__),
                           'perfcompare_test_output.txt')
GOLDEN = GoldenDataInput(GOLDEN_FILE)


def TestMain():
    global GOLDEN
    if '--generate' in sys.argv:
        sys.argv.pop(sys.argv.index('--generate'))
        GOLDEN = GoldenDataOutput()
        try:
            unittest.main()
        finally:
            GOLDEN.WriteFile(GOLDEN_FILE)
    else:
        unittest.main()
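

# Usage note (a sketch): running the tests with no extra arguments checks
# their output against the golden file, while passing --generate
# (e.g. "python3 perfcompare_test.py --generate", assuming that is this
# file's name) regenerates perfcompare_test_output.txt from the tests'
# current output.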


# Test data from a normal distribution, generated using the following code:
# ', '.join('%.4f' % random.gauss(0, 1) for _ in xrange(100))
TEST_VALUES = [
    0.4171, 2.1056, -0.0223, -1.6592, 0.4766, -0.6405, 0.3488, 1.5729,
    2.0654, -0.1324, -0.8648, -0.2793, -0.7966, 0.2851, -0.9374, -2.0275,
    0.8222, -0.2396, -0.6982, 0.9067, 0.9416, -2.2870, -0.1868, 1.0700,
    -1.2531, 0.8455, 1.4755, 0.2979, 0.3441, 0.6694, -0.1808, -0.9038,
    0.8267, -0.4320, -0.7166, 0.3757, -0.5135, -0.9497, 2.0372, -0.3364,
    0.3879, -0.2970, 1.3872, 0.6538, 1.0674, 1.2349, -0.6873, -0.1807,
    0.6867, -0.1150, -1.0526, -0.6853, -0.5858, -1.8460, 1.6041, -1.1638,
    0.5459, -1.6476, -0.8711, -0.9001, 0.0788, -0.8170, 0.2439, 0.0129,
    -0.8674, -1.1076, -0.0074, -0.6230, -0.4761, -2.2526, 0.4906, -0.5001,
    -0.2050, 0.7623, -0.5511, -0.2837, -0.8797, -0.5374, -1.2910, 0.9551,
    0.4483, -0.6352, -0.3334, -0.5105, 0.1073, 2.9131, -0.4941, -0.2808,
    -0.2517, -1.9961, 0.9214, -0.6325, -1.1895, 0.8118, 1.5424, 0.5601,
    -1.0322, 0.7135, -0.2780, -0.1128]


def GenerateTestData(mean, stddev):
    return [x * stddev + mean for x in TEST_VALUES]


# This is an example of a slow running time value for an initial run of a
# test. This should be skipped by the software under test.
SLOW_INITIAL_RUN = [1e6]


class FormatConfidenceIntervalTest(unittest.TestCase):

    def test_confidence_interval_formatting(self):
        Format = perfcompare.FormatConfidenceInterval
        self.assertEqual(Format(12345.6789, 2222), '12346 +/- 2222')
        self.assertEqual(Format(12345.6789, 0.02222), '12345.679 +/- 0.022')
        self.assertEqual(Format(12345.6789, 0.07777), '12345.679 +/- 0.078')
        self.assertEqual(Format(12345.6789, 0.09911), '12345.679 +/- 0.099')
        # Corner case: rounding 0.09950 to 2 significant figures produces
        # 0.100, which looks like 3 significant figures rather than 2.
        self.assertEqual(Format(12345.6789, 0.09950), '12345.679 +/- 0.100')
        self.assertEqual(Format(12345.6789, 2e-5), '12345.678900 +/- 0.000020')
        # Corner case: the offset is a power of 10.
        self.assertEqual(Format(12345.6789, 0.1), '12345.68 +/- 0.10')
        self.assertEqual(Format(12345.6789, 0.01), '12345.679 +/- 0.010')
        # Corner case: zero offset.
        self.assertEqual(Format(12345.6789, 0), '12345.7 +/- 0')
        # Corner case: negative offset. This does not make sense for a
        # confidence interval and should not happen, but let's ensure it
        # gets formatted anyway in case it is useful for debugging.
        self.assertEqual(Format(12345.6789, -1), '12345.7 +/- -1')
        # Corner cases: infinity and NaN.
        self.assertEqual(Format(12345.6789, numpy.inf), '12345.7 +/- inf')
        self.assertEqual(Format(12345.6789, -numpy.inf), '12345.7 +/- -inf')
        self.assertEqual(Format(12345.6789, numpy.nan), '12345.7 +/- nan')
        self.assertEqual(Format(numpy.inf, 0.1234), 'inf +/- 0.12')
        self.assertEqual(Format(-numpy.inf, 0.1234), '-inf +/- 0.12')
        self.assertEqual(Format(numpy.nan, 0.1234), 'nan +/- 0.12')
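
    # Summary of the behaviour exercised above, inferred from the expected
    # strings rather than from perfcompare itself: for a positive, finite
    # offset, both numbers appear to be printed with enough decimal places to
    # give the "+/-" offset two significant figures (but never fewer than
    # zero decimal places); for zero, negative, or non-finite offsets, the
    # mean appears to fall back to one decimal place and the offset is
    # printed as-is.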


# Generate some example perf test data, allowing variation at each level of
# the sampling process (per boot, per process, and per iteration within
# each process). This follows a random effects model. Returns a list of
# lists of lists of values.
def GenerateData(mean=1000,
                 stddev_across_boots=0,
                 stddev_across_processes=0,
                 stddev_across_iters=0):
    it = iter(TEST_VALUES)
    def GenerateValues(mean, stddev, count):
        return [next(it) * stddev + mean for _ in range(count)]
    # This reads 4**3 + 4**2 + 4 = 84 values from TEST_VALUES, so it does
    # not exceed the number of values in TEST_VALUES.
    return [[SLOW_INITIAL_RUN
             + GenerateValues(mean_within_process, stddev_across_iters, 4)
             for mean_within_process in GenerateValues(
                 mean_within_boot, stddev_across_processes, 4)]
            for mean_within_boot in GenerateValues(
                mean, stddev_across_boots, 4)]
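

# Shape illustration (a sketch for the reader, not used by the tests): with
# the default arguments, GenerateData() returns data for 4 boots, each with
# 4 processes, where each process's value list is SLOW_INITIAL_RUN plus 4
# per-iteration values:
#
#     data = GenerateData()
#     assert len(data) == 4            # boots
#     assert len(data[0]) == 4         # processes per boot
#     assert len(data[0][0]) == 1 + 4  # slow initial run + iteration values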


class StatisticsTest(TempDirTestCase):

    def ResultsDictForValues(self, run_values):
        return {'label': 'ExampleTest',
                'test_suite': 'example_suite',
                'unit': 'nanoseconds',
                'values': run_values}

    # Given data in the format returned by GenerateData(), writes this data
    # to a temporary directory.
    def DirOfData(self, data):
        dir_path = self.MakeTempDir()
        os.mkdir(os.path.join(dir_path, 'by_boot'))
        for boot_idx, results_for_boot in enumerate(data):
            test_dir = os.path.join(
                dir_path, 'by_boot', 'boot%06d' % boot_idx, 'test-name',
                'subdir')
            os.makedirs(test_dir)
            for process_idx, run_values in enumerate(results_for_boot):
                dest_file = os.path.join(
                    test_dir,
                    'example_process%06d.fuchsiaperf.json' % process_idx)
                WriteJsonFile(dest_file, [self.ResultsDictForValues(run_values)])
        return dir_path
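
    # Resulting layout, for illustration (paths relative to the returned
    # temporary directory):
    #
    #     by_boot/boot000000/test-name/subdir/example_process000000.fuchsiaperf.json
    #     by_boot/boot000000/test-name/subdir/example_process000001.fuchsiaperf.json
    #     by_boot/boot000001/test-name/subdir/example_process000000.fuchsiaperf.json
    #     ...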

    # Sanity-check that DirOfData() writes data in the correct format by
    # reading back some simple test data.
    def test_readback_of_data(self):
        data = [[[1, 2], [3, 4]],
                [[5, 6], [7, 8]]]
        dataset = perfcompare.MultiBootDataset(self.DirOfData(data))
        boot_datasets = list(dataset.GetBootDatasets())
        self.assertEqual(len(boot_datasets), 2)
        self.assertEqual(list(boot_datasets[0].GetProcessDatasets()),
                         [[self.ResultsDictForValues([1, 2])],
                          [self.ResultsDictForValues([3, 4])]])
        self.assertEqual(list(boot_datasets[1].GetProcessDatasets()),
                         [[self.ResultsDictForValues([5, 6])],
                          [self.ResultsDictForValues([7, 8])]])

    def TarFileOfDir(self, dir_path, write_mode):
        tar_filename = os.path.join(self.MakeTempDir(), 'out.tar')
        with tarfile.open(tar_filename, write_mode) as tar:
            for name in os.listdir(dir_path):
                tar.add(os.path.join(dir_path, name), arcname=name)
        return tar_filename

    def test_readback_of_data_from_tar_file(self):
        data = [[[1, 2], [3, 4]]]
        dir_path = self.DirOfData(data)
        self.assertEqual(len(os.listdir(os.path.join(dir_path, 'by_boot'))), 1)
        # Test the uncompressed and gzipped cases.
        for write_mode in ('w', 'w:gz'):
            tar_filename = self.TarFileOfDir(
                os.path.join(dir_path, 'by_boot', 'boot000000'), write_mode)
            boot_dataset = perfcompare.SingleBootDataset(tar_filename)
            self.assertEqual(list(boot_dataset.GetProcessDatasets()),
                             [[self.ResultsDictForValues([1, 2])],
                              [self.ResultsDictForValues([3, 4])]])

    def CheckConfidenceInterval(self, data, interval_string):
        dir_path = self.DirOfData(data)
        test_name = 'example_suite: ExampleTest'
        stats = perfcompare.StatsFromMultiBootDataset(
            perfcompare.MultiBootDataset(dir_path))[test_name]
        self.assertEqual(stats.FormatConfidenceInterval(), interval_string)

    # Test the CIs produced with variation at different levels of the
    # multi-level sampling process.
    def test_confidence_intervals(self):
        self.CheckConfidenceInterval(GenerateData(), '1000 +/- 0 ns')
        self.CheckConfidenceInterval(
            GenerateData(stddev_across_boots=100), '1021 +/- 452 ns')
        self.CheckConfidenceInterval(
            GenerateData(stddev_across_processes=100), '1012 +/- 151 ns')
        self.CheckConfidenceInterval(
            GenerateData(stddev_across_iters=100), '981 +/- 74 ns')

    # Test the case where just a single value is produced per process run.
    def test_confidence_interval_with_single_value_per_process(self):
        self.CheckConfidenceInterval(
            [[[100]], [[101]]], '100 +/- 32 ns')

    # If the "before" and "after" results have identical confidence
    # intervals, that should be treated as "no difference", including when
    # the CIs are zero-width (as tested here).
    def test_comparing_equal_zero_width_confidence_intervals(self):
        dir_path = self.DirOfData([[[200]], [[200]]])
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', dir_path, dir_path], stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('comparison_no_change_zero_width_ci', output)


class PerfCompareTest(TempDirTestCase):

    def AddIgnoredFiles(self, dest_dir):
        # Include a summary.json file to check that we skip reading it.
        with open(os.path.join(dest_dir, 'summary.json'), 'w') as fh:
            fh.write('dummy_data')
        # Include a *.catapult_json file to check that we skip reading these.
        with open(os.path.join(dest_dir, 'foo.catapult_json'), 'w') as fh:
            fh.write('dummy_data')

    def WriteExampleDataDir(self, dir_path, mean=1000, stddev=100,
                            drop_one=False, single_boot=False):
        results = [('ClockGetTimeExample', GenerateTestData(mean, stddev))]
        if not drop_one:
            results.append(('SecondExample', GenerateTestData(2000, 300)))
        if single_boot:
            for test_name, values in results:
                dest_dir = os.path.join(dir_path, 'by_boot', 'boot0')
                dest_file = os.path.join(
                    dest_dir, '%s.fuchsiaperf.json' % test_name)
                if not os.path.exists(dest_dir):
                    os.makedirs(dest_dir)
                    self.AddIgnoredFiles(dest_dir)
                WriteJsonFile(
                    dest_file,
                    [{'label': test_name,
                      'test_suite': 'fuchsia.example',
                      'unit': 'nanoseconds',
                      'values': SLOW_INITIAL_RUN + values}])
        else:
            for test_name, values in results:
                for idx, value in enumerate(values):
                    dest_dir = os.path.join(
                        dir_path, 'by_boot', 'boot%06d' % idx)
                    dest_file = os.path.join(
                        dest_dir, '%s.fuchsiaperf.json' % test_name)
                    if not os.path.exists(dest_dir):
                        os.makedirs(dest_dir)
                        self.AddIgnoredFiles(dest_dir)
                    WriteJsonFile(
                        dest_file,
                        [{'label': test_name,
                          'test_suite': 'fuchsia.example',
                          'unit': 'nanoseconds',
                          'values': SLOW_INITIAL_RUN + [value]}])
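
    # Resulting layout, for illustration: with single_boot=True, all of a
    # test's values land in a single file, e.g.
    #     by_boot/boot0/ClockGetTimeExample.fuchsiaperf.json
    # while otherwise each value goes into its own boot directory, e.g.
    #     by_boot/boot000000/ClockGetTimeExample.fuchsiaperf.json
    #     by_boot/boot000001/ClockGetTimeExample.fuchsiaperf.json
    #     ...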

    def ExampleDataDir(self, **kwargs):
        dir_path = self.MakeTempDir()
        self.WriteExampleDataDir(dir_path, **kwargs)
        return dir_path

    def test_reading_results_from_dir(self):
        dir_path = self.ExampleDataDir()
        results = perfcompare.StatsFromMultiBootDataset(
            perfcompare.MultiBootDataset(dir_path))
        test_name = 'fuchsia.example: ClockGetTimeExample'
        self.assertEqual(
            results[test_name].FormatConfidenceInterval(),
            '992 +/- 26 ns')

    # Returns the output of compare_perf when run on the given directories.
    def ComparePerf(self, before_dir, after_dir):
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', before_dir, after_dir], stdout)
        return stdout.getvalue()

    def test_mean_and_stddev(self):
        values = [10, 5, 15]
        mean_val, stddev_val = perfcompare.MeanAndStddev(values)
        self.assertEqual(mean_val, 10.0)
        self.assertEqual(perfcompare.Mean(values), 10.0)
        self.assertEqual(stddev_val, 5.0)
        # Single-value sample.
        self.assertEqual(perfcompare.MeanAndStddev([123]), (123.0, None))
        # Check error cases.
        self.assertRaises(AssertionError, lambda: perfcompare.Mean([]))
        self.assertRaises(AssertionError, lambda: perfcompare.MeanAndStddev([]))

    # Check that data written using the golden file helper reads back
    # the same.
    def test_golden_file_write_and_read(self):
        temp_file = os.path.join(self.MakeTempDir(), 'file')
        writer = GoldenDataOutput()
        writer.AssertCaseEq('a_key', 'a_value')
        writer.AssertCaseEq('b_key', 'line 1\n' 'line 2\n')
        writer.WriteFile(temp_file)
        reader = GoldenDataInput(temp_file)
        reader.AssertCaseEq('a_key', 'a_value')
        reader.AssertCaseEq('b_key', 'line 1\n' 'line 2\n')
        self.assertRaises(AssertionError,
                          lambda: reader.AssertCaseEq('a_key', 'other_value'))

    def test_comparison_no_change(self):
        before_dir = self.ExampleDataDir()
        after_dir = self.ExampleDataDir()
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_no_change', output)

    # Test a regression that is large enough to be flagged.
    def test_comparison_regression(self):
        before_dir = self.ExampleDataDir(mean=1500, stddev=100)
        after_dir = self.ExampleDataDir(mean=1600, stddev=100)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_regression', output)

    # Test an improvement that is large enough to be flagged.
    def test_comparison_improvement(self):
        before_dir = self.ExampleDataDir(mean=1500, stddev=100)
        after_dir = self.ExampleDataDir(mean=1400, stddev=100)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_improvement', output)

    # Test an improvement that is not large enough to be flagged.
    def test_comparison_improvement_small(self):
        before_dir = self.ExampleDataDir(mean=1500, stddev=100)
        after_dir = self.ExampleDataDir(mean=1450, stddev=100)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_improvement_small', output)

    def test_adding_test(self):
        before_dir = self.ExampleDataDir(drop_one=True)
        after_dir = self.ExampleDataDir()
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('adding_test', output)

    def test_removing_test(self):
        before_dir = self.ExampleDataDir()
        after_dir = self.ExampleDataDir(drop_one=True)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('removing_test', output)

    def test_display_single_dataset(self):
        dataset_dir = self.ExampleDataDir()
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', dataset_dir], stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_single_dataset', output)

    def test_display_three_datasets(self):
        dataset_dirs = [self.ExampleDataDir(mean=1000),
                        self.ExampleDataDir(mean=2000, drop_one=True),
                        self.ExampleDataDir(mean=3000)]
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf'] + dataset_dirs, stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_three_datasets', output)

    # Test printing a table of point estimates.
    def test_display_single_boot_single_dataset(self):
        dataset_dir = self.ExampleDataDir(single_boot=True)
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', dataset_dir], stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_single_boot_single_dataset', output)

    # Test printing a table of point estimates.
    def test_display_single_boot_two_datasets(self):
        dataset_dirs = [self.ExampleDataDir(mean=1000, single_boot=True),
                        self.ExampleDataDir(mean=2000, single_boot=True,
                                            drop_one=True)]
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf'] + dataset_dirs, stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_single_boot_two_datasets', output)

    def test_factor_range_formatting(self):
        # Construct an interval pair of the same type used in the
        # software-under-test, checking that the interval is well-formed.
        def Interval(min_val, max_val):
            assert min_val <= max_val
            return (numpy.float64(min_val), numpy.float64(max_val))
        # Check that the values are of the same type as in the
        # software-under-test.
        interval_test = Interval(10, 20)
        interval_real = perfcompare.Stats([1, 2, 3], 'some_unit').interval
        self.assertEqual(type(interval_test[0]), type(interval_real[0]))
        self.assertEqual(type(interval_test[1]), type(interval_real[1]))
        def Format(interval_before, interval_after):
            return perfcompare.FormatFactorRange(Interval(*interval_before),
                                                 Interval(*interval_after))
        self.assertEqual(Format((1, 2), (3, 4)), '1.500-4.000')
        # Test zero "min" values.
        self.assertEqual(Format((0, 2), (3, 4)), '1.500-inf')
        self.assertEqual(Format((1, 2), (0, 4)), '0.000-4.000')
        # Test zero "min" and "max" values.
        self.assertEqual(Format((0, 0), (3, 4)), 'inf-inf')
        self.assertEqual(Format((1, 2), (0, 0)), '0.000-0.000')
        # Test zero "max" values, with negative "min".
        self.assertEqual(Format((-1, 0), (3, 4)), 'ci_too_wide')
        self.assertEqual(Format((1, 2), (-3, 0)), 'ci_too_wide')
        # All values zero.
        self.assertEqual(Format((0, 0), (0, 0)), 'no_change')
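
    # A note on the MismatchRate() calls below, inferred from the expected
    # values rather than from perfcompare itself: the argument is a list of
    # (min, max) intervals, and the result looks like the fraction of
    # interval pairs that do not overlap. For example, three intervals give
    # three pairs; if two of those pairs are disjoint, the rate is 2./3.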
    def test_mismatch_rate(self):
        self.assertEqual(perfcompare.MismatchRate([(0,1), (2,3)]), 1)
        self.assertEqual(perfcompare.MismatchRate([(0,2), (1,3)]), 0)
        self.assertEqual(perfcompare.MismatchRate([(0,2), (1,3), (4,5)]), 2./3)

    def test_validate_perfcompare(self):
        def MakeExampleDirs(**kwargs):
            by_boot_dir = os.path.join(self.ExampleDataDir(**kwargs), 'by_boot')
            return [os.path.join(by_boot_dir, name)
                    for name in sorted(os.listdir(by_boot_dir))]
        # This is an example input dataset that gives a high mismatch rate,
        # because the data is drawn from two very different distributions.
        results_dirs = (MakeExampleDirs(mean=100, stddev=10) +
                        MakeExampleDirs(mean=200, stddev=10))
        stdout = io.StringIO()
        perfcompare.Main(['validate_perfcompare', '--group_size=5']
                         + results_dirs, stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('validate_perfcompare', output)


class RunLocalTest(TempDirTestCase):

    # Test basic operation of the "run_local" subcommand.
    def test_run_local(self):
        # Destination directory for the full multiboot dataset. Use a
        # destination path that does not exist yet.
        dest_dir = os.path.join(self.MakeTempDir(), 'new_dir')
        # Destination pathnames for process dataset files.
        iter_temp_dir = self.MakeTempDir()
        iter_temp_file = os.path.join(iter_temp_dir, 'result.fuchsiaperf.json')
        iter_temp_glob = os.path.join(iter_temp_dir, '*.fuchsiaperf.json')
        data = GenerateData(mean=1000,
                            stddev_across_boots=10,
                            stddev_across_processes=10,
                            stddev_across_iters=10)
        commands = []
        # Dummy version of subprocess.check_call() for testing.
        def DummyRunCmd(cmd, shell=False):
            self.assertEqual(shell, True)
            commands.append(cmd)
            if cmd == 'set -o errexit -o nounset; my_iter_cmd':
                WriteJsonFile(iter_temp_file,
                              [{'label': 'MyTest',
                                'test_suite': 'example_suite',
                                'unit': 'nanoseconds',
                                'values': data.pop(0)[0]}])
        stdout = io.StringIO()
        perfcompare.Main(['run_local',
                          '--boots=4',
                          '--iter_file', iter_temp_glob,
                          '--iter_cmd', 'my_iter_cmd',
                          '--reboot_cmd', 'my_reboot_cmd',
                          '--dest', dest_dir],
                         stdout, run_cmd=DummyRunCmd)
        self.assertEqual(commands,
                         ['set -o errexit -o nounset; my_reboot_cmd',
                          'set -o errexit -o nounset; my_iter_cmd'] * 4)
        GOLDEN.AssertCaseEq('run_local', stdout.getvalue())

    # "run_local" should give an error if the temporary files specified by
    # --iter_file already exist.
    def test_error_if_dest_files_already_exist(self):
        dest_dir = os.path.join(self.MakeTempDir(), 'new_dir')
        iter_temp_file = os.path.join(
            self.MakeTempDir(), 'result.fuchsiaperf.json')
        WriteJsonFile(iter_temp_file, [])
        args = ['run_local',
                '--boots=4',
                '--iter_file', iter_temp_file,
                '--iter_cmd', 'my_iter_cmd',
                '--reboot_cmd', 'my_reboot_cmd',
                '--dest', dest_dir]
        self.assertRaises(AssertionError,
                          lambda: perfcompare.Main(args, sys.stdout))

    # Check that error-checking is enabled in the shell commands that
    # run_local runs.
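    # (run_local wraps each command as "set -o errexit -o nounset; <cmd>", as
    # asserted in test_run_local above, so a failing "false" inside
    # "false; true" should abort the whole command and surface here as
    # subprocess.CalledProcessError.)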
    def test_errexit_error_checking_in_shell_commands(self):
        iter_temp_file = os.path.join(
            self.MakeTempDir(), 'result.fuchsiaperf.json')
        stdout = io.StringIO()
        def get_args():
            dest_dir = os.path.join(self.MakeTempDir(), 'new_dir')
            return ['run_local',
                    '--boots=4',
                    '--iter_file', iter_temp_file,
                    '--dest', dest_dir]
        perfcompare.Main(
            get_args() + ['--iter_cmd', 'true', '--reboot_cmd', 'true'],
            stdout)
        # Check that the failure of the "false" command gets caught.
        self.assertRaises(
            subprocess.CalledProcessError,
            lambda: perfcompare.Main(
                get_args() + ['--iter_cmd', 'false; true',
                              '--reboot_cmd', 'true'],
                stdout))
        self.assertRaises(
            subprocess.CalledProcessError,
            lambda: perfcompare.Main(
                get_args() + ['--iter_cmd', 'true',
                              '--reboot_cmd', 'false; true'],
                stdout))


if __name__ == '__main__':
    TestMain()