garnet/bin/perfcompare/perfcompare.py - fuchsia - Git at Google

 # Copyright 2019 The Fuchsia Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 # This tool is for invoking by the performance comparison trybots.  It
 # is intended for comparing the performance of two versions of
 # Fuchsia.  It can also compare binary sizes.

 import argparse
 import json
 import math
 import os
 import sys
 import tarfile

 import scipy.stats


 # For comparing results from a performance test, we calculate
 # confidence intervals for the mean running times of the test.  If the
 # confidence intervals are non-overlapping, we conclude that the
 # performance has improved or regressed for this test.
 #
 # Data is gathered from a 3-level sampling process:
 #
 #  1) Boot Fuchsia multiple times.
 #  2) For each boot, launch the perf test process one or more times.
 #  3) For each process launch, instantiate the performance test and
 #     run the body of the test some number of times.
 #
 # This is intended to account for variation across boots and across process
 # launches.
 #
 # Currently we use t-test confidence intervals.  This assumes that the
 # values we apply the t-test to are normally distributed, or approximately
 # normally distributed.  In future we could instead use bootstrap
 # confidence intervals, which would avoid that assumption.


 # ALPHA is a parameter for calculating confidence intervals.  It is
 # the probability that the true value for the statistic we're
 # estimating (here, the mean running time) lies outside the confidence
 # interval.
 ALPHA = 0.01


 def Mean(values):
     if len(values) == 0:
         raise AssertionError('Mean is not defined for an empty sample')
     return float(sum(values)) / len(values)


 # Returns the mean and standard deviation of a sample.  This applies
 # Bessel's correction to the calculation of the standard deviation.
 def MeanAndStddev(values):
     if len(values) <= 1:
         raise AssertionError(
             "Sample size of %d is too small to calculate standard deviation "
             "with Bessel's correction" % len(values))
     mean_val = Mean(values)
     sum_of_squares = 0.0
     for val in values:
         diff = val - mean_val
         sum_of_squares += diff * diff
     stddev_val = math.sqrt(sum_of_squares / (len(values) - 1))
     return mean_val, stddev_val


 class Stats(object):

     def __init__(self, values):
         sample_size = len(values)
         mean, stddev = MeanAndStddev(values)
         offset = (-scipy.stats.t.ppf(ALPHA / 2, sample_size - 1)
                   * stddev / math.sqrt(sample_size))
         self._mean = mean
         self._offset = offset
         # Confidence interval for the mean.
         self.interval = (mean - offset, mean + offset)

     def FormatConfidenceInterval(self):
         return '%d +/- %d' % (self._mean, self._offset)

     # Returns the relative CI width, which is the width of the confidence
     # interval divided by the mean.
     def RelativeConfidenceIntervalWidth(self):
         return self._offset * 2 / self._mean


 def ReadJsonFile(filename):
     with open(filename, 'r') as fh:
         return json.load(fh)


 def IsResultsFilename(name):
     return name.endswith('.json') and name != 'summary.json'


 # Read the raw perf test results from a directory or a tar file that
 # contains results from a single boot of Fuchsia.  Returns a sequence
 # (iterator) of JSON trees.
 #
 # Accepting tar files here is a convenience for when doing local testing of
 # the statistics.  The Swarming system used for the bots produces "out.tar"
 # files as results.
 #
 # The directory (or tar file) is expected to contain files with names
 # of the following forms:
 #
 #   <name-of-test-executable>_process<number>.json - results that are read here
 #   <name-of-test-executable>_process<number>.catapult_json - ignored here
 #   summary.json - ignored here
 #
 # Each *.json file (except for summary.json) contains results from a
 # separate launch of a performance test process.
 def RawResultsFromDir(filename):
     # Note that sorting the filename listing (from os.listdir() or from
     # tarfile) is not essential, but it helps to make any later processing
     # more deterministic.
     if os.path.isfile(filename):
         # Read from tar file.
         with tarfile.open(filename) as tar:
             for member in sorted(tar.getmembers(),
                                  key=lambda member: member.name):
                 if IsResultsFilename(member.name):
                     yield json.load(tar.extractfile(member))
     else:
         # Read from directory.
         for name in sorted(os.listdir(filename)):
             if IsResultsFilename(name):
                 yield ReadJsonFile(os.path.join(filename, name))


 # Takes a list of filenames of perf test results, each representing the
 # results from one boot of Fuchsia, and each in the format accepted by
 # RawResultsFromDir().
 #
 # Returns a dict mapping test names to Stats objects.
 def ResultsFromDirs(filenames):
     results_map = {}
     for boot_results_path in filenames:
         results_for_boot = {}
         for process_run_results in RawResultsFromDir(boot_results_path):
             for test_case in process_run_results:
                 # Skip the running time from the test's initial run within
                 # the process; treat it as a warmup run.  The initial run
                 # is often slower than later runs, so it would skew the
                 # mean if we included it.  The RoundTrip_*_MultiProcess
                 # tests are an extreme case, because the first run waits
                 # for a subprocess to start up.  See PT-244.
                 new_value = Mean(test_case['values'][1:])
                 results_for_boot.setdefault(test_case['label'], []).append(
                     new_value)
         for label, values in results_for_boot.iteritems():
             results_map.setdefault(label, []).append(Mean(values))
     return {name: Stats(values) for name, values in results_map.iteritems()}


 # This takes a directory representing perf test results from multiple boots
 # of Fuchsia.  It contains a "by_boot" subdir, which contains directories
 # (or tar files) of the format read by RawResultsFromDir().
 #
 # This returns a dict mapping test names to Stats objects.
 def ResultsFromDir(filename):
     assert os.path.exists(filename)
     by_boot_dir = os.path.join(filename, 'by_boot')
     assert os.path.exists(by_boot_dir), by_boot_dir
     filenames = [os.path.join(by_boot_dir, name)
                  for name in sorted(os.listdir(by_boot_dir))]
     return ResultsFromDirs(filenames)


 def FormatTable(heading_row, rows, out_fh):
     column_count = len(heading_row)
     for row in rows:
         assert len(row) == column_count
     rows = [heading_row] + rows
     widths = [2 + max(len(row[col_number]) for row in rows)
               for col_number in xrange(column_count)]
     # Underline the heading row.
     rows.insert(1, ['-' * (width - 2) for width in widths])
     for row in rows:
         for col_number, value in enumerate(row):
             out_fh.write(value)
             if col_number < column_count - 1:
                 out_fh.write(' ' * (widths[col_number] - len(value)))
         out_fh.write('\n')


 def ComparePerf(args, out_fh):
     results_maps = [ResultsFromDir(args.results_dir_before),
                     ResultsFromDir(args.results_dir_after)]

     # Set of all test case names, including those added or removed.
     labels = set(results_maps[0].iterkeys())
     labels.update(results_maps[1].iterkeys())

     counts = {
         'added': 0,
         'removed': 0,
         'faster': 0,
         'slower': 0,
         'no_sig_diff': 0,
     }
     heading_row = ['Test case', 'Improve/regress?', 'Factor change',
                    'Mean before', 'Mean after']
     all_rows = []
     diff_rows = []
     for label in sorted(labels):
         if label not in results_maps[0]:
             result = 'added'
             factor_range = '-'
             before_range = '-'
             after_range = results_maps[1][label].FormatConfidenceInterval()
         elif label not in results_maps[1]:
             result = 'removed'
             factor_range = '-'
             before_range = results_maps[0][label].FormatConfidenceInterval()
             after_range = '-'
         else:
             stats = [results_map[label] for results_map in results_maps]
             interval_before = stats[0].interval
             interval_after = stats[1].interval
             factor_min = interval_after[0] / interval_before[1]
             factor_max = interval_after[1] / interval_before[0]
             if interval_after[0] >= interval_before[1]:
                 result = 'slower'
             elif interval_after[1] <= interval_before[0]:
                 result = 'faster'
             else:
                 result = 'no_sig_diff'
             before_range = stats[0].FormatConfidenceInterval()
             after_range = stats[1].FormatConfidenceInterval()
             factor_range = '%.3f-%.3f' % (factor_min, factor_max)
         counts[result] += 1
         row = [label, result, factor_range, before_range, after_range]
         all_rows.append(row)
         if result != 'no_sig_diff':
             diff_rows.append(row)

     def FormatCount(count, text):
         noun = 'test case' if count == 1 else 'test cases'
         out_fh.write('  %d %s %s\n' % (count, noun, text))

     out_fh.write('Summary counts:\n')
     FormatCount(len(labels), 'in total')
     FormatCount(counts['no_sig_diff'],
                 'had no significant difference (no_sig_diff)')
     FormatCount(counts['faster'], 'got faster')
     FormatCount(counts['slower'], 'got slower')
     FormatCount(counts['added'], 'added')
     FormatCount(counts['removed'], 'removed')
     out_fh.write('\n\n')

     if len(diff_rows) != 0:
         out_fh.write('Results from test cases with differences:\n\n')
         FormatTable(heading_row, diff_rows, out_fh)
         out_fh.write('\n\n')

     out_fh.write('Results from all test cases:\n\n')
     FormatTable(heading_row, all_rows, out_fh)


 def IntervalsIntersect(interval1, interval2):
     return not (interval2[0] >= interval1[1] or
                 interval2[1] <= interval1[0])


 # Calculate the rate at which two intervals drawn (without replacement)
 # from the given set of intervals will be non-intersecting.
 def MismatchRate(intervals):
     mismatch_count = sum(int(not IntervalsIntersect(intervals[i], intervals[j]))
                          for i in xrange(len(intervals))
                          for j in xrange(i))
     comparisons_count = len(intervals) * (len(intervals) - 1) / 2
     return float(mismatch_count) / comparisons_count


 def ValidatePerfCompare(args, out_fh):
     boot_count = len(args.results_dirs)
     group_size = args.group_size
     group_count = boot_count / group_size

     results_maps = [
         ResultsFromDirs(
             args.results_dirs[i * group_size : (i + 1) * group_size])
         for i in xrange(group_count)]

     # Group by test name (label).
     by_test = {}
     for results_map in results_maps:
         for label, stats in results_map.iteritems():
             by_test.setdefault(label, []).append(stats)

     out_fh.write('Rate of mismatches (non-intersections) '
                  'of confidence intervals for each test:\n')
     mismatch_rates = []
     for label, stats_list in sorted(by_test.iteritems()):
         mismatch_rate = MismatchRate([stats.interval for stats in stats_list])
         out_fh.write('%f %s\n' % (mismatch_rate, label))
         mismatch_rates.append(mismatch_rate)

     mean_relative_ci_width = Mean([
         stats.RelativeConfidenceIntervalWidth()
         for results_map in results_maps
         for stats in results_map.itervalues()])

     out_fh.write('\n')
     mean_val = Mean(mismatch_rates)
     out_fh.write('Mean mismatch rate: %f\n' % mean_val)
     out_fh.write('Mean relative confidence interval width: %f\n'
                  % mean_relative_ci_width)
     out_fh.write('Number of test cases: %d\n' % len(mismatch_rates))
     out_fh.write('Number of result sets: %d groups of %d boots each'
                  ' (ignoring %d leftover boots)\n'
                  % (group_count, group_size,
                     boot_count - group_size * group_count))
     out_fh.write('Expected number of test cases with mismatches: %f\n'
                  % (mean_val * len(mismatch_rates)))


 def TotalSize(snapshot_file):
     with open(snapshot_file) as fh:
         data = json.load(fh)
     return sum(info['size'] for info in data['blobs'].itervalues())


 def CompareSizes(args):
     filenames = [args.snapshot_before, args.snapshot_after]
     sizes = [TotalSize(filename) for filename in filenames]
     print 'Size before:  %d bytes' % sizes[0]
     print 'Size after:   %d bytes' % sizes[1]
     print 'Difference:   %d bytes' % (sizes[1] - sizes[0])
     if sizes[0] != 0:
         print 'Factor of:    %f' % (float(sizes[1]) / sizes[0])


 def Main(argv, out_fh):
     parser = argparse.ArgumentParser()
     subparsers = parser.add_subparsers()

     parser_compare_perf = subparsers.add_parser(
         'compare_perf',
         help='Compare two sets of perf test results')
     parser_compare_perf.add_argument('results_dir_before')
     parser_compare_perf.add_argument('results_dir_after')
     parser_compare_perf.set_defaults(
         func=lambda args: ComparePerf(args, out_fh))

     parser_validate_perfcompare = subparsers.add_parser(
         'validate_perfcompare',
         help='Outputs statistics given multiple sets of perf test results'
         ' that come from the same build.  This is for validating the'
         ' statistics used by the perfcompare tool.  It can be used to check'
         ' the rate at which the tool will falsely indicate that performance'
         ' of a test case has regressed or improved.')
     parser_validate_perfcompare.add_argument(
         '-g', '--group_size', type=int, required=True,
         help='Number of boots to put in each group.  To get realistic'
         ' results that reflect how the perfcompare trybots would behave,'
         ' this should match the boots_per_revision setting in the'
         ' fuchsia_perfcompare.py recipe.  (Since that code is currently'
         ' not part of the Fuchsia checkout, we cannot make the settings'
         ' match automatically.)')
     parser_validate_perfcompare.add_argument('results_dirs', nargs='+')
     parser_validate_perfcompare.set_defaults(
         func=lambda args: ValidatePerfCompare(args, out_fh))

     parser_compare_sizes = subparsers.add_parser(
         'compare_sizes',
         help='Compare file sizes specified by two system.snapshot files')
     parser_compare_sizes.add_argument('snapshot_before')
     parser_compare_sizes.add_argument('snapshot_after')
     parser_compare_sizes.set_defaults(func=CompareSizes)

     args = parser.parse_args(argv)
     args.func(args)


 if __name__ == '__main__':
     Main(sys.argv[1:], sys.stdout)
	# Copyright 2019 The Fuchsia Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	# This tool is for invoking by the performance comparison trybots. It
	# is intended for comparing the performance of two versions of
	# Fuchsia. It can also compare binary sizes.

	import argparse
	import json
	import math
	import os
	import sys
	import tarfile

	import scipy.stats


	# For comparing results from a performance test, we calculate
	# confidence intervals for the mean running times of the test. If the
	# confidence intervals are non-overlapping, we conclude that the
	# performance has improved or regressed for this test.
	#
	# Data is gathered from a 3-level sampling process:
	#
	# 1) Boot Fuchsia multiple times.
	# 2) For each boot, launch the perf test process one or more times.
	# 3) For each process launch, instantiate the performance test and
	# run the body of the test some number of times.
	#
	# This is intended to account for variation across boots and across process
	# launches.
	#
	# Currently we use t-test confidence intervals. This assumes that the
	# values we apply the t-test to are normally distributed, or approximately
	# normally distributed. In future we could instead use bootstrap
	# confidence intervals, which would avoid that assumption.


	# ALPHA is a parameter for calculating confidence intervals. It is
	# the probability that the true value for the statistic we're
	# estimating (here, the mean running time) lies outside the confidence
	# interval.
	ALPHA = 0.01


	def Mean(values):
	if len(values) == 0:
	raise AssertionError('Mean is not defined for an empty sample')
	return float(sum(values)) / len(values)


	# Returns the mean and standard deviation of a sample. This applies
	# Bessel's correction to the calculation of the standard deviation.
	def MeanAndStddev(values):
	if len(values) <= 1:
	raise AssertionError(
	"Sample size of %d is too small to calculate standard deviation "
	"with Bessel's correction" % len(values))
	mean_val = Mean(values)
	sum_of_squares = 0.0
	for val in values:
	diff = val - mean_val
	sum_of_squares += diff * diff
	stddev_val = math.sqrt(sum_of_squares / (len(values) - 1))
	return mean_val, stddev_val


	class Stats(object):

	def __init__(self, values):
	sample_size = len(values)
	mean, stddev = MeanAndStddev(values)
	offset = (-scipy.stats.t.ppf(ALPHA / 2, sample_size - 1)
	* stddev / math.sqrt(sample_size))
	self._mean = mean
	self._offset = offset
	# Confidence interval for the mean.
	self.interval = (mean - offset, mean + offset)

	def FormatConfidenceInterval(self):
	return '%d +/- %d' % (self._mean, self._offset)

	# Returns the relative CI width, which is the width of the confidence
	# interval divided by the mean.
	def RelativeConfidenceIntervalWidth(self):
	return self._offset * 2 / self._mean


	def ReadJsonFile(filename):
	with open(filename, 'r') as fh:
	return json.load(fh)


	def IsResultsFilename(name):
	return name.endswith('.json') and name != 'summary.json'


	# Read the raw perf test results from a directory or a tar file that
	# contains results from a single boot of Fuchsia. Returns a sequence
	# (iterator) of JSON trees.
	#
	# Accepting tar files here is a convenience for when doing local testing of
	# the statistics. The Swarming system used for the bots produces "out.tar"
	# files as results.
	#
	# The directory (or tar file) is expected to contain files with names
	# of the following forms:
	#
	# <name-of-test-executable>_process<number>.json - results that are read here
	# <name-of-test-executable>_process<number>.catapult_json - ignored here
	# summary.json - ignored here
	#
	# Each *.json file (except for summary.json) contains results from a
	# separate launch of a performance test process.
	def RawResultsFromDir(filename):
	# Note that sorting the filename listing (from os.listdir() or from
	# tarfile) is not essential, but it helps to make any later processing
	# more deterministic.
	if os.path.isfile(filename):
	# Read from tar file.
	with tarfile.open(filename) as tar:
	for member in sorted(tar.getmembers(),
	key=lambda member: member.name):
	if IsResultsFilename(member.name):
	yield json.load(tar.extractfile(member))
	else:
	# Read from directory.
	for name in sorted(os.listdir(filename)):
	if IsResultsFilename(name):
	yield ReadJsonFile(os.path.join(filename, name))


	# Takes a list of filenames of perf test results, each representing the
	# results from one boot of Fuchsia, and each in the format accepted by
	# RawResultsFromDir().
	#
	# Returns a dict mapping test names to Stats objects.
	def ResultsFromDirs(filenames):
	results_map = {}
	for boot_results_path in filenames:
	results_for_boot = {}
	for process_run_results in RawResultsFromDir(boot_results_path):
	for test_case in process_run_results:
	# Skip the running time from the test's initial run within
	# the process; treat it as a warmup run. The initial run
	# is often slower than later runs, so it would skew the
	# mean if we included it. The RoundTrip_*_MultiProcess
	# tests are an extreme case, because the first run waits
	# for a subprocess to start up. See PT-244.
	new_value = Mean(test_case['values'][1:])
	results_for_boot.setdefault(test_case['label'], []).append(
	new_value)
	for label, values in results_for_boot.iteritems():
	results_map.setdefault(label, []).append(Mean(values))
	return {name: Stats(values) for name, values in results_map.iteritems()}


	# This takes a directory representing perf test results from multiple boots
	# of Fuchsia. It contains a "by_boot" subdir, which contains directories
	# (or tar files) of the format read by RawResultsFromDir().
	#
	# This returns a dict mapping test names to Stats objects.
	def ResultsFromDir(filename):
	assert os.path.exists(filename)
	by_boot_dir = os.path.join(filename, 'by_boot')
	assert os.path.exists(by_boot_dir), by_boot_dir
	filenames = [os.path.join(by_boot_dir, name)
	for name in sorted(os.listdir(by_boot_dir))]
	return ResultsFromDirs(filenames)


	def FormatTable(heading_row, rows, out_fh):
	column_count = len(heading_row)
	for row in rows:
	assert len(row) == column_count
	rows = [heading_row] + rows
	widths = [2 + max(len(row[col_number]) for row in rows)
	for col_number in xrange(column_count)]
	# Underline the heading row.
	rows.insert(1, ['-' * (width - 2) for width in widths])
	for row in rows:
	for col_number, value in enumerate(row):
	out_fh.write(value)
	if col_number < column_count - 1:
	out_fh.write(' ' * (widths[col_number] - len(value)))
	out_fh.write('\n')


	def ComparePerf(args, out_fh):
	results_maps = [ResultsFromDir(args.results_dir_before),
	ResultsFromDir(args.results_dir_after)]

	# Set of all test case names, including those added or removed.
	labels = set(results_maps[0].iterkeys())
	labels.update(results_maps[1].iterkeys())

	counts = {
	'added': 0,
	'removed': 0,
	'faster': 0,
	'slower': 0,
	'no_sig_diff': 0,
	}
	heading_row = ['Test case', 'Improve/regress?', 'Factor change',
	'Mean before', 'Mean after']
	all_rows = []
	diff_rows = []
	for label in sorted(labels):
	if label not in results_maps[0]:
	result = 'added'
	factor_range = '-'
	before_range = '-'
	after_range = results_maps[1][label].FormatConfidenceInterval()
	elif label not in results_maps[1]:
	result = 'removed'
	factor_range = '-'
	before_range = results_maps[0][label].FormatConfidenceInterval()
	after_range = '-'
	else:
	stats = [results_map[label] for results_map in results_maps]
	interval_before = stats[0].interval
	interval_after = stats[1].interval
	factor_min = interval_after[0] / interval_before[1]
	factor_max = interval_after[1] / interval_before[0]
	if interval_after[0] >= interval_before[1]:
	result = 'slower'
	elif interval_after[1] <= interval_before[0]:
	result = 'faster'
	else:
	result = 'no_sig_diff'
	before_range = stats[0].FormatConfidenceInterval()
	after_range = stats[1].FormatConfidenceInterval()
	factor_range = '%.3f-%.3f' % (factor_min, factor_max)
	counts[result] += 1
	row = [label, result, factor_range, before_range, after_range]
	all_rows.append(row)
	if result != 'no_sig_diff':
	diff_rows.append(row)

	def FormatCount(count, text):
	noun = 'test case' if count == 1 else 'test cases'
	out_fh.write(' %d %s %s\n' % (count, noun, text))

	out_fh.write('Summary counts:\n')
	FormatCount(len(labels), 'in total')
	FormatCount(counts['no_sig_diff'],
	'had no significant difference (no_sig_diff)')
	FormatCount(counts['faster'], 'got faster')
	FormatCount(counts['slower'], 'got slower')
	FormatCount(counts['added'], 'added')
	FormatCount(counts['removed'], 'removed')
	out_fh.write('\n\n')

	if len(diff_rows) != 0:
	out_fh.write('Results from test cases with differences:\n\n')
	FormatTable(heading_row, diff_rows, out_fh)
	out_fh.write('\n\n')

	out_fh.write('Results from all test cases:\n\n')
	FormatTable(heading_row, all_rows, out_fh)


	def IntervalsIntersect(interval1, interval2):
	return not (interval2[0] >= interval1[1] or
	interval2[1] <= interval1[0])


	# Calculate the rate at which two intervals drawn (without replacement)
	# from the given set of intervals will be non-intersecting.
	def MismatchRate(intervals):
	mismatch_count = sum(int(not IntervalsIntersect(intervals[i], intervals[j]))
	for i in xrange(len(intervals))
	for j in xrange(i))
	comparisons_count = len(intervals) * (len(intervals) - 1) / 2
	return float(mismatch_count) / comparisons_count


	def ValidatePerfCompare(args, out_fh):
	boot_count = len(args.results_dirs)
	group_size = args.group_size
	group_count = boot_count / group_size

	results_maps = [
	ResultsFromDirs(
	args.results_dirs[i * group_size : (i + 1) * group_size])
	for i in xrange(group_count)]

	# Group by test name (label).
	by_test = {}
	for results_map in results_maps:
	for label, stats in results_map.iteritems():
	by_test.setdefault(label, []).append(stats)

	out_fh.write('Rate of mismatches (non-intersections) '
	'of confidence intervals for each test:\n')
	mismatch_rates = []
	for label, stats_list in sorted(by_test.iteritems()):
	mismatch_rate = MismatchRate([stats.interval for stats in stats_list])
	out_fh.write('%f %s\n' % (mismatch_rate, label))
	mismatch_rates.append(mismatch_rate)

	mean_relative_ci_width = Mean([
	stats.RelativeConfidenceIntervalWidth()
	for results_map in results_maps
	for stats in results_map.itervalues()])

	out_fh.write('\n')
	mean_val = Mean(mismatch_rates)
	out_fh.write('Mean mismatch rate: %f\n' % mean_val)
	out_fh.write('Mean relative confidence interval width: %f\n'
	% mean_relative_ci_width)
	out_fh.write('Number of test cases: %d\n' % len(mismatch_rates))
	out_fh.write('Number of result sets: %d groups of %d boots each'
	' (ignoring %d leftover boots)\n'
	% (group_count, group_size,
	boot_count - group_size * group_count))
	out_fh.write('Expected number of test cases with mismatches: %f\n'
	% (mean_val * len(mismatch_rates)))


	def TotalSize(snapshot_file):
	with open(snapshot_file) as fh:
	data = json.load(fh)
	return sum(info['size'] for info in data['blobs'].itervalues())


	def CompareSizes(args):
	filenames = [args.snapshot_before, args.snapshot_after]
	sizes = [TotalSize(filename) for filename in filenames]
	print 'Size before: %d bytes' % sizes[0]
	print 'Size after: %d bytes' % sizes[1]
	print 'Difference: %d bytes' % (sizes[1] - sizes[0])
	if sizes[0] != 0:
	print 'Factor of: %f' % (float(sizes[1]) / sizes[0])


	def Main(argv, out_fh):
	parser = argparse.ArgumentParser()
	subparsers = parser.add_subparsers()

	parser_compare_perf = subparsers.add_parser(
	'compare_perf',
	help='Compare two sets of perf test results')
	parser_compare_perf.add_argument('results_dir_before')
	parser_compare_perf.add_argument('results_dir_after')
	parser_compare_perf.set_defaults(
	func=lambda args: ComparePerf(args, out_fh))

	parser_validate_perfcompare = subparsers.add_parser(
	'validate_perfcompare',
	help='Outputs statistics given multiple sets of perf test results'
	' that come from the same build. This is for validating the'
	' statistics used by the perfcompare tool. It can be used to check'
	' the rate at which the tool will falsely indicate that performance'
	' of a test case has regressed or improved.')
	parser_validate_perfcompare.add_argument(
	'-g', '--group_size', type=int, required=True,
	help='Number of boots to put in each group. To get realistic'
	' results that reflect how the perfcompare trybots would behave,'
	' this should match the boots_per_revision setting in the'
	' fuchsia_perfcompare.py recipe. (Since that code is currently'
	' not part of the Fuchsia checkout, we cannot make the settings'
	' match automatically.)')
	parser_validate_perfcompare.add_argument('results_dirs', nargs='+')
	parser_validate_perfcompare.set_defaults(
	func=lambda args: ValidatePerfCompare(args, out_fh))

	parser_compare_sizes = subparsers.add_parser(
	'compare_sizes',
	help='Compare file sizes specified by two system.snapshot files')
	parser_compare_sizes.add_argument('snapshot_before')
	parser_compare_sizes.add_argument('snapshot_after')
	parser_compare_sizes.set_defaults(func=CompareSizes)

	args = parser.parse_args(argv)
	args.func(args)


	if __name__ == '__main__':
	Main(sys.argv[1:], sys.stdout)