| #!/usr/bin/python |
| # -*- coding: utf-8 -*- |
| |
| # ===--- compare_perf_tests.py -------------------------------------------===// |
| # |
| # This source file is part of the Swift.org open source project |
| # |
| # Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| # Licensed under Apache License v2.0 with Runtime Library Exception |
| # |
| # See https://swift.org/LICENSE.txt for license information |
| # See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| # |
| # ===---------------------------------------------------------------------===// |
| """ |
| This script compares performance test logs and issues a formatted report. |
| |
Invoke `$ compare_perf_tests.py -h` for a complete list of options.
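
For example, to compare two benchmark logs and print a Markdown report
(the file names here are illustrative):

    $ compare_perf_tests.py --old-file baseline.log --new-file branch.log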
| |
class `Sample` is a single benchmark measurement.
class `PerformanceTestSamples` is a collection of `Sample`s and statistics.
class `PerformanceTestResult` is a summary of performance test execution.
class `LogParser` converts log files into `PerformanceTestResult`s.
class `ResultComparison` compares new and old `PerformanceTestResult`s.
class `TestComparator` analyzes changes between the old and new test results.
class `ReportFormatter` creates the comparison report in the specified format.
| |
| """ |
| |
| from __future__ import print_function |
| |
| import argparse |
| import re |
| import sys |
| from bisect import bisect, bisect_left, bisect_right |
| from collections import namedtuple |
| from math import ceil, sqrt |
| |
| |
| class Sample(namedtuple('Sample', 'i num_iters runtime')): |
| u"""Single benchmark measurement. |
| |
| Initialized with: |
| `i`: ordinal number of the sample taken, |
    `num_iters`: number of iterations used to compute it,
| `runtime`: in microseconds (μs). |
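
    For example:

    >>> Sample(0, 1, 126)
    s(0, 1, 126)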
| """ |
| |
| def __repr__(self): |
| """Shorter Sample formating for debugging purposes.""" |
| return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) |
| |
| |
| class Yield(namedtuple('Yield', 'before_sample after')): |
| u"""Meta-measurement of when the Benchmark_X voluntarily yielded process. |
| |
| `before_sample`: index of measurement taken just after returning from yield |
| `after`: time elapsed since the previous yield in microseconds (μs) |
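
    For example, a yield recorded 1000 μs after the previous one, just
    before sample 3 was taken:

    >>> Yield(before_sample=3, after=1000)
    Yield(before_sample=3, after=1000)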
| """ |
| |
| |
| class PerformanceTestSamples(object): |
| """Collection of runtime samples from the benchmark execution. |
| |
| Computes the sample population statistics. |
| """ |
| |
| def __init__(self, name, samples=None): |
| """Initialize with benchmark name and optional list of Samples.""" |
| self.name = name # Name of the performance test |
| self.samples = [] |
| self.outliers = [] |
| self._runtimes = [] |
| self.mean = 0.0 |
| self.S_runtime = 0.0 # For computing running variance |
| for sample in samples or []: |
| self.add(sample) |
| |
| def __str__(self): |
| """Text summary of benchmark statistics.""" |
| return ( |
| '{0.name!s} n={0.count!r} ' |
| 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' |
| 'Max={0.max!r} ' |
| 'R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} ' |
| 'Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}' |
| .format(self) if self.samples else |
| '{0.name!s} n=0'.format(self)) |
| |
| def add(self, sample): |
| """Add sample to collection and recompute statistics.""" |
| assert isinstance(sample, Sample) |
| self._update_stats(sample) |
| i = bisect(self._runtimes, sample.runtime) |
| self._runtimes.insert(i, sample.runtime) |
| self.samples.insert(i, sample) |
| |
| def _update_stats(self, sample): |
| old_stats = (self.count, self.mean, self.S_runtime) |
| _, self.mean, self.S_runtime = ( |
| self.running_mean_variance(old_stats, sample.runtime)) |
| |
| def exclude_outliers(self, top_only=False): |
| """Exclude outliers by applying Interquartile Range Rule. |
| |
        Moves the samples outside of the inner fences
        (Q1 - 1.5*IQR and Q3 + 1.5*IQR) into the outliers list and recomputes
        statistics for the remaining sample population. Optionally applies
        only the top inner fence, preserving the small outliers.

        Experimentally, this rule seems to perform well enough on the
        benchmark runtimes in the microbenchmark range to filter out
        the environment noise caused by preemptive multitasking.
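
        For example (values are illustrative):

        >>> runtimes = [90, 100, 110, 1000]
        >>> ss = PerformanceTestSamples(
        ...     'B', [Sample(i, 1, r) for i, r in enumerate(runtimes)])
        >>> ss.exclude_outliers()
        >>> ss.count, ss.outliers[0].runtime
        (3, 1000)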
| """ |
| lo = (0 if top_only else |
| bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) |
| hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) |
| |
| outliers = self.samples[:lo] + self.samples[hi:] |
| samples = self.samples[lo:hi] |
| |
| self.__init__(self.name) # re-initialize |
| for sample in samples: # and |
| self.add(sample) # re-compute stats |
| self.outliers = outliers |
| |
| @property |
| def count(self): |
| """Number of samples used to compute the statistics.""" |
| return len(self.samples) |
| |
| @property |
| def num_samples(self): |
| """Number of all samples in the collection.""" |
| return len(self.samples) + len(self.outliers) |
| |
| @property |
| def all_samples(self): |
| """List of all samples in ascending order.""" |
| return sorted(self.samples + self.outliers, key=lambda s: s.i) |
| |
| @property |
| def min(self): |
| """Minimum sampled value.""" |
| return self.samples[0].runtime |
| |
| @property |
| def max(self): |
| """Maximum sampled value.""" |
| return self.samples[-1].runtime |
| |
| def quantile(self, q): |
| """Return runtime for given quantile. |
| |
| Equivalent to quantile estimate type R-1, SAS-3. See: |
| https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample |
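
        For example, for runtimes 10, 20, 30 and 40 the R-1 estimates are:

        >>> ss = PerformanceTestSamples(
        ...     'B', [Sample(i, 1, r) for i, r in enumerate([10, 20, 30, 40])])
        >>> ss.median, ss.q1, ss.q3
        (20, 10, 30)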
| """ |
| index = max(0, int(ceil(self.count * float(q))) - 1) |
| return self.samples[index].runtime |
| |
| @property |
| def median(self): |
| """Median sampled value.""" |
| return self.quantile(0.5) |
| |
| @property |
| def q1(self): |
| """First Quartile (25th Percentile).""" |
| return self.quantile(0.25) |
| |
| @property |
| def q3(self): |
| """Third Quartile (75th Percentile).""" |
| return self.quantile(0.75) |
| |
| @property |
| def iqr(self): |
| """Interquartile Range.""" |
| return self.q3 - self.q1 |
| |
| @property |
| def sd(self): |
| u"""Standard Deviation (μs).""" |
| return (0 if self.count < 2 else |
| sqrt(self.S_runtime / (self.count - 1))) |
| |
| @staticmethod |
| def running_mean_variance((k, M_, S_), x): |
| """Compute running variance, B. P. Welford's method. |
| |
| See Knuth TAOCP vol 2, 3rd edition, page 232, or |
| https://www.johndcook.com/blog/standard_deviation/ |
        M is the mean; Standard Deviation is defined as sqrt(S / (k - 1)).
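
        For example:

        >>> PerformanceTestSamples.running_mean_variance((1, 100.0, 0.0), 110)
        (2.0, 105.0, 50.0)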
| """ |
| k = float(k + 1) |
| M = M_ + (x - M_) / k |
| S = S_ + (x - M_) * (x - M) |
| return (k, M, S) |
| |
| @property |
| def cv(self): |
| """Coeficient of Variation (%).""" |
| return (self.sd / self.mean) if self.mean else 0 |
| |
| @property |
| def range(self): |
| """Range of samples values (Max - Min).""" |
| return self.max - self.min |
| |
| @property |
| def spread(self): |
| """Sample Spread; i.e. Range as (%) of Min.""" |
| return self.range / float(self.min) if self.min else 0 |
| |
| |
| class PerformanceTestResult(object): |
| u"""Result from executing an individual Swift Benchmark Suite benchmark. |
| |
| Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize |
| or Benchmark_Driver). |
| |
    It supports two log formats emitted by the test driver. Legacy format with
| statistics for normal distribution (MEAN, SD): |
| #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) |
| And new quantiles format with variable number of columns: |
| #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) |
| #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) |
    The number of columns between MIN and MAX depends on the test driver's
    `--quantile` parameter. In both cases, the last column, MAX_RSS, is
    optional.
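
    For example, a row in the legacy format (values are illustrative):

    >>> r = PerformanceTestResult('34,BitCount,20,3,4,4,0,4'.split(','))
    >>> r.name, r.num_samples, r.min, r.max
    ('BitCount', 20, 3, 4)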
| """ |
| |
| def __init__(self, csv_row, quantiles=False, memory=False, delta=False): |
| """Initialize from a row of multiple columns with benchmark summary. |
| |
| The row is an iterable, such as a row provided by the CSV parser. |
| """ |
| self.test_num = csv_row[0] # Ordinal number of the test |
| self.name = csv_row[1] # Name of the performance test |
| self.num_samples = int(csv_row[2]) # Number of measurements taken |
| |
| if quantiles: # Variable number of columns representing quantiles |
| runtimes = csv_row[3:-1] if memory else csv_row[3:] |
| if delta: |
| runtimes = [int(x) if x else 0 for x in runtimes] |
                # Convert deltas to a running total of runtimes
                runtimes = reduce(lambda l, x: l.append(l[-1] + x) or
                                  l if l else [x], runtimes, None)
| num_values = len(runtimes) |
| if self.num_samples < num_values: # remove repeated samples |
| quantile = num_values - 1 |
| qs = [float(i) / float(quantile) for i in range(0, num_values)] |
| indices = [max(0, int(ceil(self.num_samples * float(q))) - 1) |
| for q in qs] |
| runtimes = [runtimes[indices.index(i)] |
| for i in range(0, self.num_samples)] |
| |
| self.samples = PerformanceTestSamples( |
| self.name, |
| [Sample(None, None, int(runtime)) for runtime in runtimes]) |
| self.samples.exclude_outliers(top_only=True) |
| sams = self.samples |
| self.min, self.max, self.median, self.mean, self.sd = \ |
| sams.min, sams.max, sams.median, sams.mean, sams.sd |
| self.max_rss = ( # Maximum Resident Set Size (B) |
| int(csv_row[-1]) if memory else None) |
| else: # Legacy format with statistics for normal distribution. |
| self.min = int(csv_row[3]) # Minimum runtime (μs) |
| self.max = int(csv_row[4]) # Maximum runtime (μs) |
| self.mean = float(csv_row[5]) # Mean (average) runtime (μs) |
| self.sd = float(csv_row[6]) # Standard Deviation (μs) |
| self.median = int(csv_row[7]) # Median runtime (μs) |
| self.max_rss = ( # Maximum Resident Set Size (B) |
| int(csv_row[8]) if len(csv_row) > 8 else None) |
| self.samples = None |
| self.yields = None |
| self.setup = None |
| |
| def __repr__(self): |
| """Short summary for debugging purposes.""" |
| return ( |
| '<PerformanceTestResult name:{0.name!r} ' |
| 'samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} ' |
| 'mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>' |
| .format(self)) |
| |
| def merge(self, r): |
| """Merge two results. |
| |
        Recomputes min, max and mean statistics. If all `samples` are
        available, it recomputes all the statistics.
        The use case here is comparing test results parsed from concatenated
        log files from multiple runs of the benchmark driver.
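
        For example, merging two legacy-format results (values illustrative)
        pools min, max and mean; median and sd are dropped:

        >>> a = PerformanceTestResult('1,Ack,3,100,120,110,10,110'.split(','))
        >>> b = PerformanceTestResult('1,Ack,3,90,140,100,10,100'.split(','))
        >>> a.merge(b)
        >>> a.num_samples, a.min, a.max, a.median
        (6, 90, 140, None)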
| """ |
| # Statistics |
| if self.samples and r.samples: |
| map(self.samples.add, r.samples.samples) |
| sams = self.samples |
| self.num_samples = sams.num_samples |
| self.min, self.max, self.median, self.mean, self.sd = \ |
| sams.min, sams.max, sams.median, sams.mean, sams.sd |
| else: |
| self.min = min(self.min, r.min) |
| self.max = max(self.max, r.max) |
| self.mean = ( # pooled mean is the weighted sum of means |
| (self.mean * self.num_samples) + (r.mean * r.num_samples) |
| ) / float(self.num_samples + r.num_samples) |
| self.num_samples += r.num_samples |
| self.median, self.sd = None, None |
| |
| # Metadata |
| def minimum(a, b): # work around None being less than everything |
| return (min(filter(lambda x: x is not None, [a, b])) if any([a, b]) |
| else None) |
| self.max_rss = minimum(self.max_rss, r.max_rss) |
| self.setup = minimum(self.setup, r.setup) |
| |
| |
| class ResultComparison(object): |
| """ResultComparison compares MINs from new and old PerformanceTestResult. |
| |
| It computes speedup ratio and improvement delta (%). |
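
    For example (values are illustrative):

    >>> old = PerformanceTestResult('1,Ack,3,100,120,110,10,110'.split(','))
    >>> new = PerformanceTestResult('1,Ack,3,50,60,55,5,55'.split(','))
    >>> rc = ResultComparison(old, new)
    >>> '{0:.2f}x {1:+.1f}%'.format(rc.ratio, rc.delta)
    '2.00x -50.0%'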
| """ |
| |
| def __init__(self, old, new): |
| """Initialize with old and new `PerformanceTestResult`s to compare.""" |
| self.old = old |
| self.new = new |
| assert old.name == new.name |
| self.name = old.name # Test name, convenience accessor |
| |
| # Speedup ratio |
| self.ratio = (old.min + 0.001) / (new.min + 0.001) |
| |
| # Test runtime improvement in % |
| ratio = (new.min + 0.001) / (old.min + 0.001) |
| self.delta = ((ratio - 1) * 100) |
| |
        # Indication of dubious changes: when a result's MIN falls inside the
        # (MIN, MAX) interval of the result it is being compared with.
| self.is_dubious = ((old.min < new.min and new.min < old.max) or |
| (new.min < old.min and old.min < new.max)) |
| |
| |
| class LogParser(object): |
| """Converts log outputs into `PerformanceTestResult`s. |
| |
    Supports various formats produced by the `Benchmark_Driver` and
    `Benchmark_O` ('Onone', 'Osize'). It can also merge the results
    from concatenated log files.
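
    For example, parsing a minimal two-line log (values are illustrative):

    >>> parser = LogParser()
    >>> results = parser.parse_results([
    ...     '#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)',
    ...     '1,Ackermann,3,100,120,110,10,110'])
    >>> results[0].name, results[0].min, results[0].max_rss
    ('Ackermann', 100, None)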
| """ |
| |
| def __init__(self): |
| """Create instance of `LogParser`.""" |
| self.results = [] |
| self.quantiles, self.delta, self.memory = False, False, False |
| self._reset() |
| |
| def _reset(self): |
| """Reset parser to the default state for reading a new result.""" |
| self.samples, self.yields, self.num_iters = [], [], 1 |
| self.setup, self.max_rss, self.mem_pages = None, None, None |
| self.voluntary_cs, self.involuntary_cs = None, None |
| |
| # Parse lines like this |
| # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) |
| results_re = re.compile( |
| r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST |
| r'[, \t]+'.join([r'\d+'] * 2) + # at least 2... |
| r'(?:[, \t]+\d*)*)') # ...or more numeric columns |
| |
| def _append_result(self, result): |
| columns = result.split(',') if ',' in result else result.split() |
| r = PerformanceTestResult( |
| columns, quantiles=self.quantiles, memory=self.memory, |
| delta=self.delta) |
| r.setup = self.setup |
| r.max_rss = r.max_rss or self.max_rss |
| r.mem_pages = self.mem_pages |
| r.voluntary_cs = self.voluntary_cs |
| r.involuntary_cs = self.involuntary_cs |
| if self.samples: |
| r.samples = PerformanceTestSamples(r.name, self.samples) |
| r.samples.exclude_outliers() |
| self.results.append(r) |
| r.yields = self.yields or None |
| self._reset() |
| |
| def _store_memory_stats(self, max_rss, mem_pages): |
| self.max_rss = int(max_rss) |
| self.mem_pages = int(mem_pages) |
| |
| def _configure_format(self, header): |
| self.quantiles = 'MEAN' not in header |
| self.memory = 'MAX_RSS' in header |
| self.delta = '𝚫' in header |
| |
| # Regular expression and action to take when it matches the parsed line |
| state_actions = { |
| results_re: _append_result, |
| |
| # Verbose mode adds new productions: |
| # Adaptively determined N; test loop multiple adjusting runtime to ~1s |
| re.compile(r'\s+Measuring with scale (\d+).'): |
| (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), |
| |
| re.compile(r'\s+Sample (\d+),(\d+)'): |
| (lambda self, i, runtime: |
| self.samples.append( |
| Sample(int(i), int(self.num_iters), int(runtime)))), |
| |
| re.compile(r'\s+SetUp (\d+)'): |
| (lambda self, setup: setattr(self, 'setup', int(setup))), |
| |
| re.compile(r'\s+Yielding after ~(\d+) μs'): |
| (lambda self, since_last_yield: |
| self.yields.append( |
| Yield(len(self.samples), int(since_last_yield)))), |
| |
| re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'): |
| _configure_format, |
| |
| # Environmental statistics: memory usage and context switches |
| re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): |
| _store_memory_stats, |
| |
| re.compile(r'\s+VCS \d+ - \d+ = (\d+)'): |
| (lambda self, vcs: setattr(self, 'voluntary_cs', int(vcs))), |
| |
| re.compile(r'\s+ICS \d+ - \d+ = (\d+)'): |
| (lambda self, ics: setattr(self, 'involuntary_cs', int(ics))), |
| } |
| |
| def parse_results(self, lines): |
| """Parse results from the lines of the log output from Benchmark*. |
| |
| Returns a list of `PerformanceTestResult`s. |
| """ |
| for line in lines: |
| for regexp, action in LogParser.state_actions.items(): |
| match = regexp.match(line) |
| if match: |
| action(self, *match.groups()) |
| break # stop after 1st match |
| else: # If none matches, skip the line. |
| # print('skipping: ' + line.rstrip('\n')) |
| continue |
| return self.results |
| |
| @staticmethod |
| def _results_from_lines(lines): |
| tests = LogParser().parse_results(lines) |
| |
| def add_or_merge(names, r): |
| if r.name not in names: |
| names[r.name] = r |
| else: |
| names[r.name].merge(r) |
| return names |
| |
| return reduce(add_or_merge, tests, dict()) |
| |
| @staticmethod |
| def results_from_string(log_contents): |
| """Parse `PerformanceTestResult`s from the supplied string. |
| |
| Returns dictionary of test names and `PerformanceTestResult`s. |
| """ |
| return LogParser._results_from_lines(log_contents.splitlines()) |
| |
| @staticmethod |
| def results_from_file(log_file): |
| """Parse `PerformanceTestResult`s from the log file. |
| |
| Returns dictionary of test names and `PerformanceTestResult`s. |
| """ |
| with open(log_file) as f: |
| return LogParser._results_from_lines(f.readlines()) |
| |
| |
| class TestComparator(object): |
| """Analyzes changes betweeen the old and new test results. |
| |
| It determines which tests were `added`, `removed` and which can be |
| compared. It then splits the `ResultComparison`s into 3 groups according to |
| the `delta_threshold` by the change in performance: `increased`, |
    `decreased` and `unchanged`. The whole computation is performed during
| initialization and results are provided as properties on this object. |
| |
| The lists of `added`, `removed` and `unchanged` tests are sorted |
| alphabetically. The `increased` and `decreased` lists are sorted in |
| descending order by the amount of change. |
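
    For example, a single comparable test that got about 2x faster (values
    are illustrative) lands in `increased`:

    >>> r_old = PerformanceTestResult('1,Ack,3,100,120,110,10,110'.split(','))
    >>> r_new = PerformanceTestResult('1,Ack,3,50,60,55,5,55'.split(','))
    >>> tc = TestComparator({'Ack': r_old}, {'Ack': r_new}, 0.05)
    >>> [c.name for c in tc.increased]
    ['Ack']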
| """ |
| |
| def __init__(self, old_results, new_results, delta_threshold): |
| """Initialize with dictionaries of old and new benchmark results. |
| |
| Dictionary keys are benchmark names, values are |
| `PerformanceTestResult`s. |
| """ |
| old_tests = set(old_results.keys()) |
| new_tests = set(new_results.keys()) |
| comparable_tests = new_tests.intersection(old_tests) |
| added_tests = new_tests.difference(old_tests) |
| removed_tests = old_tests.difference(new_tests) |
| |
| self.added = sorted([new_results[t] for t in added_tests], |
| key=lambda r: r.name) |
| self.removed = sorted([old_results[t] for t in removed_tests], |
| key=lambda r: r.name) |
| |
| def compare(name): |
| return ResultComparison(old_results[name], new_results[name]) |
| |
| comparisons = map(compare, comparable_tests) |
| |
| def partition(l, p): |
| return reduce(lambda x, y: x[not p(y)].append(y) or x, l, ([], [])) |
| |
| decreased, not_decreased = partition( |
| comparisons, lambda c: c.ratio < (1 - delta_threshold)) |
| increased, unchanged = partition( |
| not_decreased, lambda c: c.ratio > (1 + delta_threshold)) |
| |
| # sorted partitions |
| names = [c.name for c in comparisons] |
| comparisons = dict(zip(names, comparisons)) |
| self.decreased = [comparisons[c.name] |
| for c in sorted(decreased, key=lambda c: -c.delta)] |
| self.increased = [comparisons[c.name] |
| for c in sorted(increased, key=lambda c: c.delta)] |
| self.unchanged = [comparisons[c.name] |
| for c in sorted(unchanged, key=lambda c: c.name)] |
| |
| |
| class ReportFormatter(object): |
| """Creates the report from perfromance test comparison in specified format. |
| |
| `ReportFormatter` formats the `PerformanceTestResult`s and |
    `ResultComparison`s provided by `TestComparator` into a report table.
| Supported formats are: `markdown` (used for displaying benchmark results on |
| GitHub), `git` and `html`. |
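
    A usage sketch (assuming `old` and `new` are dictionaries of
    `PerformanceTestResult`s, e.g. from `LogParser.results_from_file`):

        comparator = TestComparator(old, new, delta_threshold=0.05)
        print(ReportFormatter(comparator, changes_only=False).markdown())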
| """ |
| |
| def __init__(self, comparator, changes_only, |
| single_table=False): |
| """Initialize with `TestComparator` and names of branches.""" |
| self.comparator = comparator |
| self.changes_only = changes_only |
| self.single_table = single_table |
| |
| MARKDOWN_DETAIL = """ |
| <details {3}> |
| <summary>{0} ({1})</summary> |
| {2} |
| </details> |
| """ |
| GIT_DETAIL = """ |
| {0} ({1}): {2}""" |
| |
| PERFORMANCE_TEST_RESULT_HEADER = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') |
| RESULT_COMPARISON_HEADER = ('TEST', 'OLD', 'NEW', 'DELTA', 'RATIO') |
| |
| @staticmethod |
| def header_for(result): |
| """Column labels for header row in results table.""" |
| return (ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER |
| if isinstance(result, PerformanceTestResult) else |
| # isinstance(result, ResultComparison) |
| ReportFormatter.RESULT_COMPARISON_HEADER) |
| |
| @staticmethod |
| def values(result): |
| """Format values from PerformanceTestResult or ResultComparison. |
| |
| Returns tuple of strings to display in the results table. |
| """ |
| return ( |
| (result.name, |
| str(result.min), str(result.max), str(int(result.mean)), |
| str(result.max_rss) if result.max_rss else '—') |
| if isinstance(result, PerformanceTestResult) else |
| # isinstance(result, ResultComparison) |
| (result.name, |
| str(result.old.min), str(result.new.min), |
| '{0:+.1f}%'.format(result.delta), |
| '{0:.2f}x{1}'.format(result.ratio, |
| ' (?)' if result.is_dubious else '')) |
| ) |
| |
| def markdown(self): |
| """Report results of benchmark comparisons in Markdown format.""" |
| return self._formatted_text( |
| ROW='{0} | {1} | {2} | {3} | {4} \n', |
| HEADER_SEPARATOR='---', |
| DETAIL=self.MARKDOWN_DETAIL) |
| |
| def git(self): |
| """Report results of benchmark comparisons in 'git' format.""" |
| return self._formatted_text( |
| ROW='{0} {1} {2} {3} {4} \n', |
| HEADER_SEPARATOR=' ', |
| DETAIL=self.GIT_DETAIL) |
| |
| def _column_widths(self): |
| changed = self.comparator.decreased + self.comparator.increased |
| results = (changed if self.changes_only else |
| changed + self.comparator.unchanged) |
| results += self.comparator.added + self.comparator.removed |
| |
| widths = [ |
| map(len, columns) for columns in |
| [ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, |
| ReportFormatter.RESULT_COMPARISON_HEADER] + |
| [ReportFormatter.values(r) for r in results] |
| ] |
| |
| def max_widths(maximum, widths): |
| return tuple(map(max, zip(maximum, widths))) |
| |
| return reduce(max_widths, widths, tuple([0] * 5)) |
| |
| def _formatted_text(self, ROW, HEADER_SEPARATOR, DETAIL): |
| widths = self._column_widths() |
| self.header_printed = False |
| |
| def justify_columns(contents): |
| return tuple([c.ljust(w) for w, c in zip(widths, contents)]) |
| |
| def row(contents): |
| return ROW.format(*justify_columns(contents)) |
| |
| def header(header): |
| return '\n' + row(header) + row(tuple([HEADER_SEPARATOR] * 5)) |
| |
| def format_columns(r, strong): |
| return (r if not strong else |
| r[:-1] + ('**{0}**'.format(r[-1]), )) |
| |
| def table(title, results, is_strong=False, is_open=False): |
| rows = [ |
| row(format_columns(ReportFormatter.values(r), is_strong)) |
| for r in results |
| ] |
| if not rows: |
| return '' |
| |
| if self.single_table: |
| t = '' |
| if not self.header_printed: |
| t += header(ReportFormatter.header_for(results[0])) |
| self.header_printed = True |
| t += row(('**' + title + '**', '', '', '', '')) |
| t += ''.join(rows) |
| return t |
| |
| return DETAIL.format( |
| *[ |
| title, len(results), |
| (header(ReportFormatter.header_for(results[0])) + |
| ''.join(rows)), |
| ('open' if is_open else '') |
| ]) |
| |
| return ''.join([ |
| table('Regression', self.comparator.decreased, True, True), |
| table('Improvement', self.comparator.increased, True), |
| ('' if self.changes_only else |
| table('No Changes', self.comparator.unchanged)), |
| table('Added', self.comparator.added, is_open=True), |
| table('Removed', self.comparator.removed, is_open=True) |
| ]) |
| |
| HTML = """ |
| <!DOCTYPE html> |
| <html> |
| <head> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| <style> |
| body {{ font-family: -apple-system, sans-serif; font-size: 14px; }} |
| table {{ border-spacing: 2px; border-color: gray; border-spacing: 0; |
| border-collapse: collapse; }} |
| table tr {{ background-color: #fff; border-top: 1px solid #c6cbd1; }} |
| table th, table td {{ padding: 6px 13px; border: 1px solid #dfe2e5; }} |
| th {{ text-align: center; padding-top: 130px; }} |
| td {{ text-align: right; }} |
| table td:first-child {{ text-align: left; }} |
| tr:nth-child(2n) {{ background-color: #f6f8fa; }} |
| </style> |
| </head> |
| <body> |
| <table> |
| {0} |
| </table> |
| </body> |
| </html>""" |
| |
| HTML_HEADER_ROW = """ |
| <tr> |
| <th align='left'>{0} ({1})</th> |
| <th align='left'>{2}</th> |
| <th align='left'>{3}</th> |
| <th align='left'>{4}</th> |
| <th align='left'>{5}</th> |
| </tr> |
| """ |
| |
| HTML_ROW = """ |
| <tr> |
| <td align='left'>{0}</td> |
| <td align='left'>{1}</td> |
| <td align='left'>{2}</td> |
| <td align='left'>{3}</td> |
| <td align='left'><font color='{4}'>{5}</font></td> |
| </tr> |
| """ |
| |
| def html(self): |
| """Report results of benchmark comparisons in HTML format.""" |
| def row(name, old, new, delta, speedup, speedup_color): |
| return self.HTML_ROW.format( |
| name, old, new, delta, speedup_color, speedup) |
| |
| def header(contents): |
            return self.HTML_HEADER_ROW.format(*contents)
| |
| def table(title, results, speedup_color): |
| rows = [ |
| row(*(ReportFormatter.values(r) + (speedup_color,))) |
| for r in results |
| ] |
| return ('' if not rows else |
| header((title, len(results)) + |
| ReportFormatter.header_for(results[0])[1:]) + |
| ''.join(rows)) |
| |
| return self.HTML.format( |
| ''.join([ |
| table('Regression', self.comparator.decreased, 'red'), |
| table('Improvement', self.comparator.increased, 'green'), |
| ('' if self.changes_only else |
| table('No Changes', self.comparator.unchanged, 'black')), |
| table('Added', self.comparator.added, ''), |
| table('Removed', self.comparator.removed, '') |
| ])) |
| |
| |
| def parse_args(args): |
| """Parse command line arguments and set default values.""" |
| parser = argparse.ArgumentParser(description='Compare Performance tests.') |
| parser.add_argument('--old-file', |
| help='Baseline performance test suite (csv file)', |
| required=True) |
| parser.add_argument('--new-file', |
| help='New performance test suite (csv file)', |
| required=True) |
| parser.add_argument('--format', |
| choices=['markdown', 'git', 'html'], |
| help='Output format. Default is markdown.', |
| default="markdown") |
| parser.add_argument('--output', help='Output file name') |
| parser.add_argument('--changes-only', |
| help='Output only affected tests', action='store_true') |
| parser.add_argument( |
| '--single-table', |
| help='Combine data in a single table in git and markdown formats', |
| action='store_true') |
| parser.add_argument('--delta-threshold', |
| help='Delta threshold. Default 0.05.', |
| type=float, default=0.05) |
| return parser.parse_args(args) |
| |
| |
def create_report(old_results, new_results, delta_threshold, format,
                  changes_only=True, single_table=True):
    """Create comparison report of old and new results in the given format."""
| comparator = TestComparator(old_results, new_results, delta_threshold) |
| formatter = ReportFormatter(comparator, changes_only, single_table) |
| formats = { |
| 'markdown': formatter.markdown, |
| 'git': formatter.git, |
| 'html': formatter.html |
| } |
| |
| report = formats[format]() |
| return report |
| |
| |
| def main(): |
| """Compare benchmarks for changes in a formatted report.""" |
| args = parse_args(sys.argv[1:]) |
| report = create_report(LogParser.results_from_file(args.old_file), |
| LogParser.results_from_file(args.new_file), |
| args.delta_threshold, args.format, |
| args.changes_only, args.single_table) |
| print(report) |
| |
| if args.output: |
| with open(args.output, 'w') as f: |
| f.write(report) |
| |
| |
| if __name__ == '__main__': |
| sys.exit(main()) |