# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import io
import json
import os
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
import unittest

import numpy

import perfcompare


# Test case helper class for creating temporary directories that will
# be cleaned up when the test finishes.
class TempDirTestCase(unittest.TestCase):

    def setUp(self):
        self._on_teardown = []

    def MakeTempDir(self):
        temp_dir = tempfile.mkdtemp(
            prefix='tmp_unittest_%s_' % self.__class__.__name__)
        def tear_down():
            shutil.rmtree(temp_dir)
        self._on_teardown.append(tear_down)
        return temp_dir

    def tearDown(self):
        for func in reversed(self._on_teardown):
            func()


def WriteJsonFile(filename, json_data):
    with open(filename, 'w') as fh:
        json.dump(json_data, fh)


def ReadGoldenFile(filename):
    with open(filename, 'r') as fh:
        data = fh.read()
    matches = list(re.finditer('\n\n### (.*)\n', data, re.M))
    starts = [m.end() for m in matches]
    ends = [m.start() for m in matches[1:]] + [len(data)]
    for m, start, end in zip(matches, starts, ends):
        yield m.group(1), data[start:end]


# Helper for checking against test expectations in a golden file.
# This provides an implementation of AssertCaseEq() that compares
# results against the golden file.
class GoldenDataInput(object):

    def __init__(self, filename):
        self._cases = dict(ReadGoldenFile(filename))

    def AssertCaseEq(self, name, actual):
        expected = self._cases[name]
        if expected != actual:
            raise AssertionError('"%s" != "%s"' % (actual, expected))


# This provides an implementation of AssertCaseEq() that updates the
# golden file with new expectations generated by the tests.
class GoldenDataOutput(object):

    def __init__(self):
        self._cases = {}

    def AssertCaseEq(self, name, actual):
        assert name not in self._cases, name
        self._cases[name] = actual

    def WriteFile(self, filename):
        with open(filename, 'w') as fh:
            for name, data in sorted(self._cases.items()):
                fh.write('\n\n### %s\n%s' % (name, data))
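

# For illustration (with made-up case names): a golden file written by
# GoldenDataOutput.WriteFile() above, and parsed back by ReadGoldenFile(),
# contains one record per case, each introduced by a blank line and a
# "### <case name>" header, with the case data running up to the next header:
#
#     ### case_a
#     expected output for case_a
#
#     ### case_b
#     expected output for case_b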


GOLDEN_FILE = os.path.join(os.path.dirname(__file__),
                           'perfcompare_test_output.txt')
GOLDEN = GoldenDataInput(GOLDEN_FILE)


def TestMain():
    global GOLDEN
    if '--generate' in sys.argv:
        sys.argv.pop(sys.argv.index('--generate'))
        GOLDEN = GoldenDataOutput()
        try:
            unittest.main()
        finally:
            GOLDEN.WriteFile(GOLDEN_FILE)
    else:
        unittest.main()
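

# Usage note (a sketch): running the tests with no extra arguments checks
# their output against the golden file, while passing --generate
# (e.g. "python3 perfcompare_test.py --generate", assuming that is this
# file's name) regenerates perfcompare_test_output.txt from the tests'
# current output.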


# Test data from a normal distribution, generated using the following code:
# ', '.join('%.4f' % random.gauss(0, 1) for _ in xrange(100))
TEST_VALUES = [
    0.4171, 2.1056, -0.0223, -1.6592, 0.4766, -0.6405, 0.3488, 1.5729,
    2.0654, -0.1324, -0.8648, -0.2793, -0.7966, 0.2851, -0.9374, -2.0275,
    0.8222, -0.2396, -0.6982, 0.9067, 0.9416, -2.2870, -0.1868, 1.0700,
    -1.2531, 0.8455, 1.4755, 0.2979, 0.3441, 0.6694, -0.1808, -0.9038,
    0.8267, -0.4320, -0.7166, 0.3757, -0.5135, -0.9497, 2.0372, -0.3364,
    0.3879, -0.2970, 1.3872, 0.6538, 1.0674, 1.2349, -0.6873, -0.1807,
    0.6867, -0.1150, -1.0526, -0.6853, -0.5858, -1.8460, 1.6041, -1.1638,
    0.5459, -1.6476, -0.8711, -0.9001, 0.0788, -0.8170, 0.2439, 0.0129,
    -0.8674, -1.1076, -0.0074, -0.6230, -0.4761, -2.2526, 0.4906, -0.5001,
    -0.2050, 0.7623, -0.5511, -0.2837, -0.8797, -0.5374, -1.2910, 0.9551,
    0.4483, -0.6352, -0.3334, -0.5105, 0.1073, 2.9131, -0.4941, -0.2808,
    -0.2517, -1.9961, 0.9214, -0.6325, -1.1895, 0.8118, 1.5424, 0.5601,
    -1.0322, 0.7135, -0.2780, -0.1128]


def GenerateTestData(mean, stddev):
    return [x * stddev + mean for x in TEST_VALUES]


# This is an example of a slow running time value for an initial run of a
# test. This should be skipped by the software under test.
SLOW_INITIAL_RUN = [1e6]


class FormatConfidenceIntervalTest(unittest.TestCase):

    def test_confidence_interval_formatting(self):
        Format = perfcompare.FormatConfidenceInterval
        self.assertEqual(Format(12345.6789, 2222), '12346 +/- 2222')
        self.assertEqual(Format(12345.6789, 0.02222), '12345.679 +/- 0.022')
        self.assertEqual(Format(12345.6789, 0.07777), '12345.679 +/- 0.078')
        self.assertEqual(Format(12345.6789, 0.09911), '12345.679 +/- 0.099')
        # Corner case: rounding 0.09950 to 2 significant figures produces
        # 0.100, which looks like 3 significant figures rather than 2.
        self.assertEqual(Format(12345.6789, 0.09950), '12345.679 +/- 0.100')
        self.assertEqual(Format(12345.6789, 2e-5), '12345.678900 +/- 0.000020')
        # Corner case: the offset is a power of 10.
        self.assertEqual(Format(12345.6789, 0.1), '12345.68 +/- 0.10')
        self.assertEqual(Format(12345.6789, 0.01), '12345.679 +/- 0.010')
        # Corner case: zero offset.
        self.assertEqual(Format(12345.6789, 0), '12345.7 +/- 0')
        # Corner case: negative offset. This does not make sense for a
        # confidence interval and should not happen, but let's ensure it
        # gets formatted anyway in case it is useful for debugging.
        self.assertEqual(Format(12345.6789, -1), '12345.7 +/- -1')
        # Corner cases: infinity and NaN.
        self.assertEqual(Format(12345.6789, numpy.inf), '12345.7 +/- inf')
        self.assertEqual(Format(12345.6789, -numpy.inf), '12345.7 +/- -inf')
        self.assertEqual(Format(12345.6789, numpy.nan), '12345.7 +/- nan')
        self.assertEqual(Format(numpy.inf, 0.1234), 'inf +/- 0.12')
        self.assertEqual(Format(-numpy.inf, 0.1234), '-inf +/- 0.12')
        self.assertEqual(Format(numpy.nan, 0.1234), 'nan +/- 0.12')
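
    # Summary of the behaviour exercised above, inferred from the expected
    # strings rather than from perfcompare itself: for a positive, finite
    # offset, both numbers appear to be printed with enough decimal places to
    # give the "+/-" offset two significant figures (but never fewer than
    # zero decimal places); for zero, negative, or non-finite offsets, the
    # mean appears to fall back to one decimal place and the offset is
    # printed as-is.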


# Generate some example perf test data, allowing variation at each level of
# the sampling process (per boot, per process, and per iteration within
# each process). This follows a random effects model. Returns a list of
# lists of lists of values.
def GenerateData(mean=1000,
                 stddev_across_boots=0,
                 stddev_across_processes=0,
                 stddev_across_iters=0):
    it = iter(TEST_VALUES)
    def GenerateValues(mean, stddev, count):
        return [next(it) * stddev + mean for _ in range(count)]
    # This reads 4**3 + 4**2 + 4 = 84 values from TEST_VALUES, so it does
    # not exceed the number of values in TEST_VALUES.
    return [[SLOW_INITIAL_RUN
             + GenerateValues(mean_within_process, stddev_across_iters, 4)
             for mean_within_process in GenerateValues(
                 mean_within_boot, stddev_across_processes, 4)]
            for mean_within_boot in GenerateValues(
                mean, stddev_across_boots, 4)]
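

# Shape illustration (a sketch for the reader, not used by the tests): with
# the default arguments, GenerateData() returns data for 4 boots, each with
# 4 processes, where each process's value list is SLOW_INITIAL_RUN plus 4
# per-iteration values:
#
#     data = GenerateData()
#     assert len(data) == 4            # boots
#     assert len(data[0]) == 4         # processes per boot
#     assert len(data[0][0]) == 1 + 4  # slow initial run + iteration values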


class StatisticsTest(TempDirTestCase):

    def ResultsDictForValues(self, run_values):
        return {'label': 'ExampleTest',
                'test_suite': 'example_suite',
                'unit': 'nanoseconds',
                'values': run_values}

    # Given data in the format returned by GenerateData(), writes this data
    # to a temporary directory.
    def DirOfData(self, data):
        dir_path = self.MakeTempDir()
        os.mkdir(os.path.join(dir_path, 'by_boot'))
        for boot_idx, results_for_boot in enumerate(data):
            test_dir = os.path.join(
                dir_path, 'by_boot', 'boot%06d' % boot_idx, 'test-name',
                'subdir')
            os.makedirs(test_dir)
            for process_idx, run_values in enumerate(results_for_boot):
                dest_file = os.path.join(
                    test_dir,
                    'example_process%06d.fuchsiaperf.json' % process_idx)
                WriteJsonFile(dest_file, [self.ResultsDictForValues(run_values)])
        return dir_path
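
    # Resulting layout, for illustration (paths relative to the returned
    # temporary directory):
    #
    #     by_boot/boot000000/test-name/subdir/example_process000000.fuchsiaperf.json
    #     by_boot/boot000000/test-name/subdir/example_process000001.fuchsiaperf.json
    #     by_boot/boot000001/test-name/subdir/example_process000000.fuchsiaperf.json
    #     ...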

    # Sanity-check that DirOfData() writes data in the correct format by
    # reading back some simple test data.
    def test_readback_of_data(self):
        data = [[[1, 2], [3, 4]],
                [[5, 6], [7, 8]]]
        dataset = perfcompare.MultiBootDataset(self.DirOfData(data))
        boot_datasets = list(dataset.GetBootDatasets())
        self.assertEqual(len(boot_datasets), 2)
        self.assertEqual(list(boot_datasets[0].GetProcessDatasets()),
                         [[self.ResultsDictForValues([1, 2])],
                          [self.ResultsDictForValues([3, 4])]])
        self.assertEqual(list(boot_datasets[1].GetProcessDatasets()),
                         [[self.ResultsDictForValues([5, 6])],
                          [self.ResultsDictForValues([7, 8])]])

    def TarFileOfDir(self, dir_path, write_mode):
        tar_filename = os.path.join(self.MakeTempDir(), 'out.tar')
        with tarfile.open(tar_filename, write_mode) as tar:
            for name in os.listdir(dir_path):
                tar.add(os.path.join(dir_path, name), arcname=name)
        return tar_filename

    def test_readback_of_data_from_tar_file(self):
        data = [[[1, 2], [3, 4]]]
        dir_path = self.DirOfData(data)
        self.assertEqual(len(os.listdir(os.path.join(dir_path, 'by_boot'))), 1)
        # Test the uncompressed and gzipped cases.
        for write_mode in ('w', 'w:gz'):
            tar_filename = self.TarFileOfDir(
                os.path.join(dir_path, 'by_boot', 'boot000000'), write_mode)
            boot_dataset = perfcompare.SingleBootDataset(tar_filename)
            self.assertEqual(list(boot_dataset.GetProcessDatasets()),
                             [[self.ResultsDictForValues([1, 2])],
                              [self.ResultsDictForValues([3, 4])]])

    def CheckConfidenceInterval(self, data, interval_string):
        dir_path = self.DirOfData(data)
        test_name = 'example_suite: ExampleTest'
        stats = perfcompare.StatsFromMultiBootDataset(
            perfcompare.MultiBootDataset(dir_path))[test_name]
        self.assertEqual(stats.FormatConfidenceInterval(), interval_string)

    # Test the CIs produced with variation at different levels of the
    # multi-level sampling process.
    def test_confidence_intervals(self):
        self.CheckConfidenceInterval(GenerateData(), '1000 +/- 0 ns')
        self.CheckConfidenceInterval(
            GenerateData(stddev_across_boots=100), '1021 +/- 452 ns')
        self.CheckConfidenceInterval(
            GenerateData(stddev_across_processes=100), '1012 +/- 151 ns')
        self.CheckConfidenceInterval(
            GenerateData(stddev_across_iters=100), '981 +/- 74 ns')

    # Test the case where just a single value is produced per process run.
    def test_confidence_interval_with_single_value_per_process(self):
        self.CheckConfidenceInterval(
            [[[100]], [[101]]], '100 +/- 32 ns')

    # If the "before" and "after" results have identical confidence
    # intervals, that should be treated as "no difference", including when
    # the CIs are zero-width (as tested here).
    def test_comparing_equal_zero_width_confidence_intervals(self):
        dir_path = self.DirOfData([[[200]], [[200]]])
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', dir_path, dir_path], stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('comparison_no_change_zero_width_ci', output)


class PerfCompareTest(TempDirTestCase):

    def AddIgnoredFiles(self, dest_dir):
        # Include a summary.json file to check that we skip reading it.
        with open(os.path.join(dest_dir, 'summary.json'), 'w') as fh:
            fh.write('dummy_data')
        # Include a *.catapult_json file to check that we skip reading these.
        with open(os.path.join(dest_dir, 'foo.catapult_json'), 'w') as fh:
            fh.write('dummy_data')

    def WriteExampleDataDir(self, dir_path, mean=1000, stddev=100,
                            drop_one=False, single_boot=False):
        results = [('ClockGetTimeExample', GenerateTestData(mean, stddev))]
        if not drop_one:
            results.append(('SecondExample', GenerateTestData(2000, 300)))
        if single_boot:
            for test_name, values in results:
                dest_dir = os.path.join(dir_path, 'by_boot', 'boot0')
                dest_file = os.path.join(
                    dest_dir, '%s.fuchsiaperf.json' % test_name)
                if not os.path.exists(dest_dir):
                    os.makedirs(dest_dir)
                    self.AddIgnoredFiles(dest_dir)
                WriteJsonFile(
                    dest_file,
                    [{'label': test_name,
                      'test_suite': 'fuchsia.example',
                      'unit': 'nanoseconds',
                      'values': SLOW_INITIAL_RUN + values}])
        else:
            for test_name, values in results:
                for idx, value in enumerate(values):
                    dest_dir = os.path.join(
                        dir_path, 'by_boot', 'boot%06d' % idx)
                    dest_file = os.path.join(
                        dest_dir, '%s.fuchsiaperf.json' % test_name)
                    if not os.path.exists(dest_dir):
                        os.makedirs(dest_dir)
                        self.AddIgnoredFiles(dest_dir)
                    WriteJsonFile(
                        dest_file,
                        [{'label': test_name,
                          'test_suite': 'fuchsia.example',
                          'unit': 'nanoseconds',
                          'values': SLOW_INITIAL_RUN + [value]}])
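
    # Resulting layout, for illustration: with single_boot=True, all of a
    # test's values land in a single file, e.g.
    #     by_boot/boot0/ClockGetTimeExample.fuchsiaperf.json
    # while otherwise each value goes into its own boot directory, e.g.
    #     by_boot/boot000000/ClockGetTimeExample.fuchsiaperf.json
    #     by_boot/boot000001/ClockGetTimeExample.fuchsiaperf.json
    #     ...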

    def ExampleDataDir(self, **kwargs):
        dir_path = self.MakeTempDir()
        self.WriteExampleDataDir(dir_path, **kwargs)
        return dir_path

    def test_reading_results_from_dir(self):
        dir_path = self.ExampleDataDir()
        results = perfcompare.StatsFromMultiBootDataset(
            perfcompare.MultiBootDataset(dir_path))
        test_name = 'fuchsia.example: ClockGetTimeExample'
        self.assertEqual(
            results[test_name].FormatConfidenceInterval(),
            '992 +/- 26 ns')

    # Returns the output of compare_perf when run on the given directories.
    def ComparePerf(self, before_dir, after_dir):
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', before_dir, after_dir], stdout)
        return stdout.getvalue()

    def test_mean_and_stddev(self):
        values = [10, 5, 15]
        mean_val, stddev_val = perfcompare.MeanAndStddev(values)
        self.assertEqual(mean_val, 10.0)
        self.assertEqual(perfcompare.Mean(values), 10.0)
        self.assertEqual(stddev_val, 5.0)
        # Single-value sample.
        self.assertEqual(perfcompare.MeanAndStddev([123]), (123.0, None))
        # Check error cases.
        self.assertRaises(AssertionError, lambda: perfcompare.Mean([]))
        self.assertRaises(AssertionError, lambda: perfcompare.MeanAndStddev([]))

    # Check that data written using the golden file helper reads back
    # the same.
    def test_golden_file_write_and_read(self):
        temp_file = os.path.join(self.MakeTempDir(), 'file')
        writer = GoldenDataOutput()
        writer.AssertCaseEq('a_key', 'a_value')
        writer.AssertCaseEq('b_key', 'line 1\n' 'line 2\n')
        writer.WriteFile(temp_file)
        reader = GoldenDataInput(temp_file)
        reader.AssertCaseEq('a_key', 'a_value')
        reader.AssertCaseEq('b_key', 'line 1\n' 'line 2\n')
        self.assertRaises(AssertionError,
                          lambda: reader.AssertCaseEq('a_key', 'other_value'))

    def test_comparison_no_change(self):
        before_dir = self.ExampleDataDir()
        after_dir = self.ExampleDataDir()
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_no_change', output)

    # Test a regression that is large enough to be flagged.
    def test_comparison_regression(self):
        before_dir = self.ExampleDataDir(mean=1500, stddev=100)
        after_dir = self.ExampleDataDir(mean=1600, stddev=100)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_regression', output)

    # Test an improvement that is large enough to be flagged.
    def test_comparison_improvement(self):
        before_dir = self.ExampleDataDir(mean=1500, stddev=100)
        after_dir = self.ExampleDataDir(mean=1400, stddev=100)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_improvement', output)

    # Test an improvement that is not large enough to be flagged.
    def test_comparison_improvement_small(self):
        before_dir = self.ExampleDataDir(mean=1500, stddev=100)
        after_dir = self.ExampleDataDir(mean=1450, stddev=100)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('comparison_improvement_small', output)

    def test_adding_test(self):
        before_dir = self.ExampleDataDir(drop_one=True)
        after_dir = self.ExampleDataDir()
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('adding_test', output)

    def test_removing_test(self):
        before_dir = self.ExampleDataDir()
        after_dir = self.ExampleDataDir(drop_one=True)
        output = self.ComparePerf(before_dir, after_dir)
        GOLDEN.AssertCaseEq('removing_test', output)

    def test_display_single_dataset(self):
        dataset_dir = self.ExampleDataDir()
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', dataset_dir], stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_single_dataset', output)

    def test_display_three_datasets(self):
        dataset_dirs = [self.ExampleDataDir(mean=1000),
                        self.ExampleDataDir(mean=2000, drop_one=True),
                        self.ExampleDataDir(mean=3000)]
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf'] + dataset_dirs, stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_three_datasets', output)

    # Test printing a table of point estimates.
    def test_display_single_boot_single_dataset(self):
        dataset_dir = self.ExampleDataDir(single_boot=True)
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf', dataset_dir], stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_single_boot_single_dataset', output)

    # Test printing a table of point estimates.
    def test_display_single_boot_two_datasets(self):
        dataset_dirs = [self.ExampleDataDir(mean=1000, single_boot=True),
                        self.ExampleDataDir(mean=2000, single_boot=True,
                                            drop_one=True)]
        stdout = io.StringIO()
        perfcompare.Main(['compare_perf'] + dataset_dirs, stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('display_single_boot_two_datasets', output)

    def test_factor_range_formatting(self):
        # Construct an interval pair of the same type used in the
        # software-under-test, checking that the interval is well-formed.
        def Interval(min_val, max_val):
            assert min_val <= max_val
            return (numpy.float64(min_val), numpy.float64(max_val))
        # Check that the values are of the same type as in the
        # software-under-test.
        interval_test = Interval(10, 20)
        interval_real = perfcompare.Stats([1, 2, 3], 'some_unit').interval
        self.assertEqual(type(interval_test[0]), type(interval_real[0]))
        self.assertEqual(type(interval_test[1]), type(interval_real[1]))
        def Format(interval_before, interval_after):
            return perfcompare.FormatFactorRange(Interval(*interval_before),
                                                 Interval(*interval_after))
        self.assertEqual(Format((1, 2), (3, 4)), '1.500-4.000')
        # Test zero "min" values.
        self.assertEqual(Format((0, 2), (3, 4)), '1.500-inf')
        self.assertEqual(Format((1, 2), (0, 4)), '0.000-4.000')
        # Test zero "min" and "max" values.
        self.assertEqual(Format((0, 0), (3, 4)), 'inf-inf')
        self.assertEqual(Format((1, 2), (0, 0)), '0.000-0.000')
        # Test zero "max" values, with negative "min".
        self.assertEqual(Format((-1, 0), (3, 4)), 'ci_too_wide')
        self.assertEqual(Format((1, 2), (-3, 0)), 'ci_too_wide')
        # All values zero.
        self.assertEqual(Format((0, 0), (0, 0)), 'no_change')
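
    # A note on the MismatchRate() calls below, inferred from the expected
    # values rather than from perfcompare itself: the argument is a list of
    # (min, max) intervals, and the result looks like the fraction of
    # interval pairs that do not overlap. For example, three intervals give
    # three pairs; if two of those pairs are disjoint, the rate is 2./3.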
    def test_mismatch_rate(self):
        self.assertEqual(perfcompare.MismatchRate([(0,1), (2,3)]), 1)
        self.assertEqual(perfcompare.MismatchRate([(0,2), (1,3)]), 0)
        self.assertEqual(perfcompare.MismatchRate([(0,2), (1,3), (4,5)]), 2./3)

    def test_validate_perfcompare(self):
        def MakeExampleDirs(**kwargs):
            by_boot_dir = os.path.join(self.ExampleDataDir(**kwargs), 'by_boot')
            return [os.path.join(by_boot_dir, name)
                    for name in sorted(os.listdir(by_boot_dir))]
        # This is an example input dataset that gives a high mismatch rate,
        # because the data is drawn from two very different distributions.
        results_dirs = (MakeExampleDirs(mean=100, stddev=10) +
                        MakeExampleDirs(mean=200, stddev=10))
        stdout = io.StringIO()
        perfcompare.Main(['validate_perfcompare', '--group_size=5']
                         + results_dirs, stdout)
        output = stdout.getvalue()
        GOLDEN.AssertCaseEq('validate_perfcompare', output)


class RunLocalTest(TempDirTestCase):

    # Test basic operation of the "run_local" subcommand.
    def test_run_local(self):
        # Destination directory for the full multiboot dataset. Use a
        # destination path that does not exist yet.
        dest_dir = os.path.join(self.MakeTempDir(), 'new_dir')
        # Destination pathnames for process dataset files.
        iter_temp_dir = self.MakeTempDir()
        iter_temp_file = os.path.join(iter_temp_dir, 'result.fuchsiaperf.json')
        iter_temp_glob = os.path.join(iter_temp_dir, '*.fuchsiaperf.json')
        data = GenerateData(mean=1000,
                            stddev_across_boots=10,
                            stddev_across_processes=10,
                            stddev_across_iters=10)
        commands = []
        # Dummy version of subprocess.check_call() for testing.
        def DummyRunCmd(cmd, shell=False):
            self.assertEqual(shell, True)
            commands.append(cmd)
            if cmd == 'set -o errexit -o nounset; my_iter_cmd':
                WriteJsonFile(iter_temp_file,
                              [{'label': 'MyTest',
                                'test_suite': 'example_suite',
                                'unit': 'nanoseconds',
                                'values': data.pop(0)[0]}])
        stdout = io.StringIO()
        perfcompare.Main(['run_local',
                          '--boots=4',
                          '--iter_file', iter_temp_glob,
                          '--iter_cmd', 'my_iter_cmd',
                          '--reboot_cmd', 'my_reboot_cmd',
                          '--dest', dest_dir],
                         stdout, run_cmd=DummyRunCmd)
        self.assertEqual(commands,
                         ['set -o errexit -o nounset; my_reboot_cmd',
                          'set -o errexit -o nounset; my_iter_cmd'] * 4)
        GOLDEN.AssertCaseEq('run_local', stdout.getvalue())

    # "run_local" should give an error if the temporary files specified by
    # --iter_file already exist.
    def test_error_if_dest_files_already_exist(self):
        dest_dir = os.path.join(self.MakeTempDir(), 'new_dir')
        iter_temp_file = os.path.join(
            self.MakeTempDir(), 'result.fuchsiaperf.json')
        WriteJsonFile(iter_temp_file, [])
        args = ['run_local',
                '--boots=4',
                '--iter_file', iter_temp_file,
                '--iter_cmd', 'my_iter_cmd',
                '--reboot_cmd', 'my_reboot_cmd',
                '--dest', dest_dir]
        self.assertRaises(AssertionError,
                          lambda: perfcompare.Main(args, sys.stdout))

    # Check that error-checking is enabled in the shell commands that
    # run_local runs.
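    # (run_local wraps each command as "set -o errexit -o nounset; <cmd>", as
    # asserted in test_run_local above, so a failing "false" inside
    # "false; true" should abort the whole command and surface here as
    # subprocess.CalledProcessError.)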
    def test_errexit_error_checking_in_shell_commands(self):
        iter_temp_file = os.path.join(
            self.MakeTempDir(), 'result.fuchsiaperf.json')
        stdout = io.StringIO()
        def get_args():
            dest_dir = os.path.join(self.MakeTempDir(), 'new_dir')
            return ['run_local',
                    '--boots=4',
                    '--iter_file', iter_temp_file,
                    '--dest', dest_dir]
        perfcompare.Main(
            get_args() + ['--iter_cmd', 'true', '--reboot_cmd', 'true'],
            stdout)
        # Check that the failure of the "false" command gets caught.
        self.assertRaises(
            subprocess.CalledProcessError,
            lambda: perfcompare.Main(
                get_args() + ['--iter_cmd', 'false; true',
                              '--reboot_cmd', 'true'],
                stdout))
        self.assertRaises(
            subprocess.CalledProcessError,
            lambda: perfcompare.Main(
                get_args() + ['--iter_cmd', 'true',
                              '--reboot_cmd', 'false; true'],
                stdout))


if __name__ == '__main__':
    TestMain()