catapult/histogram.go - infra/infra - Git at Google

 // Copyright 2018 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file

 package catapult

 import (
 	"errors"
 	"log"
 	"math"
 	"strings"

 	schema "fuchsia.googlesource.com/infra/infra/perf/schema/v1"
 	uuid "github.com/satori/go.uuid"
 )

 // Histogram is a Catapult histogram object.
 //
 // See https://github.com/catapult-project/catapult/blob/master/docs/histogram-set-json-format.md
 // for more information on the format.
 //
 // TODO(kjharland): Add these missing fields as needed
 //   ShortName
 //   BinBoundaries
 //   NanDiagnostics
 //   AllBins
 //   SummaryOptions
 //   SampleValues
 type Histogram struct {
 	// Name identifies the histogram. createHistogram replaces every space
 	// in the name with an underscore before storing it here.
 	Name               string    `json:"name"`
 	// GUID identifies this histogram. createHistogram fills it with the
 	// string form of uuid.NewV4().
 	GUID               string    `json:"guid"`
 	// Unit names the unit of the histogram's values. createHistogram
 	// always uses "ms_smallerIsBetter".
 	Unit               string    `json:"unit"`
 	// Description is serialized as "description". createHistogram leaves
 	// it empty.
 	Description        string    `json:"description"`
 	// MaxNumSampleValues is set by createHistogram to the number of sample
 	// values it was given.
 	MaxNumSampleValues int       `json:"maxNumSampleValues"`
 	// NumNans is always set to 0 by createHistogram, which treats all of
 	// its samples as numeric values.
 	NumNans            int       `json:"numNans"`
 	// Running holds the 7 statistics produced by computeRunningStatistics,
 	// in the order: count, max, meanlogs, mean, min, sum, variance.
 	Running            []float64 `json:"running"`
 	// Diagnostics maps a Diagnostic's name to its GUID.
 	//
 	// These map entries communicate that the diagnostic with the given
 	// name and GUID contains metadata that can help debug regressions and
 	// other issues with this Histogram in the Catapult Dashboard.
 	//
 	// The map may be nil; AddDiagnostic allocates it on first use.
 	Diagnostics map[string]string `json:"diagnostics"`
 }

 // AddDiagnostic associates name with the given GUID in this Histogram's
 // Diagnostics map, allocating the map on first use.
 //
 // If name is already mapped to a different GUID, the entry is overwritten and
 // a warning is logged. Re-adding an identical (name, guid) pair is silent.
 func (h *Histogram) AddDiagnostic(name string, guid string) {
 	// Writing to a nil map panics, so create it lazily.
 	if h.Diagnostics == nil {
 		h.Diagnostics = make(map[string]string)
 	}

 	if existing, ok := h.Diagnostics[name]; ok && existing != guid {
 		// The trailing space keeps the two halves of the message from
 		// running together as "<name>.($old, $new)".
 		log.Printf(
 			"Overwriting shared Diagnostic %v in Histogram %v. "+
 				"($old, $new) = (%v, %v)",
 			name, h.Name, existing, guid)
 	}

 	h.Diagnostics[name] = guid
 }

 // ConvertBenchmarkDataToHistograms converts BenchmarkData to Histograms.
 //
 // A BenchmarkData contains one or more Sample objects. This conversion works
 // differently based on the labels of those samples:
 //
 // * If all sample labels are empty, a single Histogram is created for the
 //   BenchmarkData. It contains all sample values and its name is set to
 //   BenchmarkData.Label. Zircon benchmark results are an example of samples
 //   without labels.
 // * If all sample labels are non-empty, a Histogram is created for each sample.
 //   It contains only that sample's values and its name is set to
 //   "{BenchmarkData.Label}_{Sample.Label}".  Some tracing-based benchmark
 //   results are examples of samples with labels.
 //
 // It does not make sense for some labels to be non-empty while others are empty
 // because there is no way to determine how the benchmark author really wants
 // this information to be organized in the Catapult dashboard. An error is
 // returned in this case.
 //
 // An error is also returned if d has no samples, or if a Histogram cannot be
 // created because it would contain no values. Whenever an error is returned
 // the returned slice is nil.
 //
 // This function assumes that all non-empty sample labels are unique to their
 // parent BenchmarkData. Non-unique names may result in confusing data in the
 // Catapult dashboard.
 //
 // TODO(IN-330): We should have a schema that removes this ambiguity in sample
 // labelling.
 func ConvertBenchmarkDataToHistograms(d schema.BenchmarkData) ([]Histogram, error) {
 	if len(d.Samples) == 0 {
 		return nil, errors.New("BenchmarkData has no samples")
 	}

 	samplesHaveLabels, err := checkSampleLabels(d.Samples)
 	if err != nil {
 		return nil, err
 	}

 	if !samplesHaveLabels {
 		// Samples are unlabeled. Concat all data into a single Histogram.
 		var sampleValues []float64
 		for _, sample := range d.Samples {
 			sampleValues = append(sampleValues, sample.Values...)
 		}
 		histogram, err := createHistogram(d.Label, sampleValues)
 		if err != nil {
 			// Do not hand back a zero-value Histogram alongside an error.
 			return nil, err
 		}
 		return []Histogram{histogram}, nil
 	}

 	// Samples are labeled. Create a Histogram for each one.
 	histograms := make([]Histogram, 0, len(d.Samples))
 	for _, sample := range d.Samples {
 		histogram, err := createHistogram(d.Label+"_"+sample.Label, sample.Values)
 		if err != nil {
 			return nil, err
 		}
 		histograms = append(histograms, histogram)
 	}
 	return histograms, nil
 }

 // createHistogram builds a Histogram called name from values.
 //
 // values are assumed to be nanosecond measurements. Two normalizations are
 // applied before the Histogram is assembled:
 //
 // * Every value is divided by 1e6 to convert it to milliseconds, because
 //   Catapult doesn't support nanoseconds.
 // * Every space in name is replaced with an underscore. Catapult forms a
 //   unique key for fetching graph data using the Histogram name; whitespace
 //   breaks this key and causes Catapult to incorrectly process the data.
 //
 // The result carries a newly generated GUID, the unit "ms_smallerIsBetter",
 // and running statistics computed over the converted values.
 //
 // Returns an error if values is empty.
 func createHistogram(name string, values []float64) (Histogram, error) {
 	if len(values) == 0 {
 		return Histogram{}, errors.New("at least one sample value required")
 	}

 	// Fuchsia benchmarks report nanoseconds; Catapult wants milliseconds.
 	const nanosPerMilli = 1e6
 	millis := make([]float64, len(values))
 	for i := range values {
 		millis[i] = values[i] / nanosPerMilli
 	}

 	// A negative count tells strings.Replace to substitute every occurrence.
 	const replaceAll = -1
 	sanitized := strings.Replace(name, " ", "_", replaceAll)

 	histogram := Histogram{
 		Name:    sanitized,
 		Unit:    "ms_smallerIsBetter",
 		GUID:    uuid.NewV4().String(),
 		NumNans: 0, // All samples are numeric values
 		// TODO(kjharland): Compute AllBins.
 		MaxNumSampleValues: len(millis),
 		Running:            computeRunningStatistics(millis),
 	}
 	return histogram, nil
 }

 // computeRunningStatistics returns 7 statistics describing values, always in
 // this order:
 //
 // count, max, meanlogs, mean, min, sum, variance
 //
 // meanlogs is the mean of the natural logs of the absolute values of the
 // given values. Since math.Log(0) is -Inf, meanlogs is not finite when any
 // value is zero.
 //
 // https://github.com/catapult-project/catapult/issues/4150
 func computeRunningStatistics(values []float64) []float64 {
 	var (
 		smallest = math.Inf(1)
 		largest  = math.Inf(-1)
 		total    float64
 		logMean  float64
 	)

 	for idx := 0; idx < len(values); idx++ {
 		sample := values[idx]
 		seen := float64(idx + 1)

 		smallest = math.Min(smallest, sample)
 		largest = math.Max(largest, sample)
 		total += sample

 		// Fold the new log into the mean as a cumulative moving average:
 		// https://en.wikipedia.org/wiki/Moving_average
 		logOfAbs := math.Log(math.Abs(sample))
 		logMean += (logOfAbs - logMean) / seen
 	}

 	average, spread := meanVariance(values)
 	return []float64{
 		float64(len(values)),
 		largest,
 		logMean,
 		average,
 		smallest,
 		total,
 		spread,
 	}
 }

 // meanVariance returns the arithmetic mean and the sample variance (the sum of
 // squared deviations divided by n-1) of vals.
 //
 // With fewer than two values the variance is reported as 0.
 func meanVariance(vals []float64) (mean, variance float64) {
 	n := float64(len(vals))

 	// Pass one: the mean.
 	var total float64
 	for i := range vals {
 		total += vals[i]
 	}
 	mean = total / n

 	// A single value has no variance.
 	if len(vals) <= 1 {
 		return mean, 0
 	}

 	// Pass two: accumulate squared deviations from the mean. This is the
 	// "Two-pass algorithm" from
 	// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 	var squares float64
 	for i := range vals {
 		delta := vals[i] - mean
 		squares += delta * delta
 	}

 	return mean, squares / (n - 1)
 }

 // checkSampleLabels reports whether the given samples carry labels.
 //
 // It returns true when every sample has a non-empty label and false when every
 // label is empty. A mix of labeled and unlabeled samples, or an empty sample
 // list, is an error.
 func checkSampleLabels(samples []schema.Sample) (bool, error) {
 	if len(samples) == 0 {
 		return false, errors.New("sample list is empty")
 	}

 	// The first sample sets the expectation that every other sample must meet.
 	wantLabels := samples[0].Label != ""

 	for i := 1; i < len(samples); i++ {
 		hasLabel := samples[i].Label != ""
 		if hasLabel != wantLabels {
 			return false, errors.New("some samples are missing labels")
 		}
 	}

 	return wantLabels, nil
 }
	// Copyright 2018 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file

	package catapult

	import (
	"errors"
	"log"
	"math"
	"strings"

	schema "fuchsia.googlesource.com/infra/infra/perf/schema/v1"
	uuid "github.com/satori/go.uuid"
	)

	// Histogram is a Catapult histogram object.
	//
	// See https://github.com/catapult-project/catapult/blob/master/docs/histogram-set-json-format.md
	// for more information on the format.
	//
	// TODO(kjharland): Add these missing fields as needed
	// ShortName
	// BinBoundaries
	// NanDiagnostics
	// AllBins
	// SummaryOptions
	// SampleValues
	type Histogram struct {
	// Name identifies the histogram. createHistogram replaces every space
	// in the name with an underscore before storing it here.
	Name string `json:"name"`
	// GUID identifies this histogram. createHistogram fills it with the
	// string form of uuid.NewV4().
	GUID string `json:"guid"`
	// Unit names the unit of the histogram's values. createHistogram
	// always uses "ms_smallerIsBetter".
	Unit string `json:"unit"`
	// Description is serialized as "description". createHistogram leaves
	// it empty.
	Description string `json:"description"`
	// MaxNumSampleValues is set by createHistogram to the number of sample
	// values it was given.
	MaxNumSampleValues int `json:"maxNumSampleValues"`
	// NumNans is always set to 0 by createHistogram, which treats all of
	// its samples as numeric values.
	NumNans int `json:"numNans"`
	// Running holds the 7 statistics produced by computeRunningStatistics,
	// in the order: count, max, meanlogs, mean, min, sum, variance.
	Running []float64 `json:"running"`
	// Diagnostics maps a Diagnostic's name to its GUID.
	//
	// These map entries communicate that the diagnostic with the given
	// name and GUID contains metadata that can help debug regressions and
	// other issues with this Histogram in the Catapult Dashboard.
	//
	// The map may be nil; AddDiagnostic allocates it on first use.
	Diagnostics map[string]string `json:"diagnostics"`
	}

	// AddDiagnostic associates name with the given GUID in this Histogram's
	// Diagnostics map, allocating the map on first use.
	//
	// If name is already mapped to a different GUID, the entry is overwritten and
	// a warning is logged. Re-adding an identical (name, guid) pair is silent.
	func (h *Histogram) AddDiagnostic(name string, guid string) {
		// Writing to a nil map panics, so create it lazily.
		if h.Diagnostics == nil {
			h.Diagnostics = make(map[string]string)
		}

		if existing, ok := h.Diagnostics[name]; ok && existing != guid {
			// The trailing space keeps the two halves of the message from
			// running together as "<name>.($old, $new)".
			log.Printf(
				"Overwriting shared Diagnostic %v in Histogram %v. "+
					"($old, $new) = (%v, %v)",
				name, h.Name, existing, guid)
		}

		h.Diagnostics[name] = guid
	}

	// ConvertBenchmarkDataToHistograms converts BenchmarkData to Histograms.
	//
	// A BenchmarkData contains one or more Sample objects. This conversion works
	// differently based on the labels of those samples:
	//
	// * If all sample labels are empty, a single Histogram is created for the
	//   BenchmarkData. It contains all sample values and its name is set to
	//   BenchmarkData.Label. Zircon benchmark results are an example of samples
	//   without labels.
	// * If all sample labels are non-empty, a Histogram is created for each sample.
	//   It contains only that sample's values and its name is set to
	//   "{BenchmarkData.Label}_{Sample.Label}". Some tracing-based benchmark
	//   results are examples of samples with labels.
	//
	// It does not make sense for some labels to be non-empty while others are empty
	// because there is no way to determine how the benchmark author really wants
	// this information to be organized in the Catapult dashboard. An error is
	// returned in this case.
	//
	// An error is also returned if d has no samples, or if a Histogram cannot be
	// created because it would contain no values. Whenever an error is returned
	// the returned slice is nil.
	//
	// This function assumes that all non-empty sample labels are unique to their
	// parent BenchmarkData. Non-unique names may result in confusing data in the
	// Catapult dashboard.
	//
	// TODO(IN-330): We should have a schema that removes this ambiguity in sample
	// labelling.
	func ConvertBenchmarkDataToHistograms(d schema.BenchmarkData) ([]Histogram, error) {
		if len(d.Samples) == 0 {
			return nil, errors.New("BenchmarkData has no samples")
		}

		samplesHaveLabels, err := checkSampleLabels(d.Samples)
		if err != nil {
			return nil, err
		}

		if !samplesHaveLabels {
			// Samples are unlabeled. Concat all data into a single Histogram.
			var sampleValues []float64
			for _, sample := range d.Samples {
				sampleValues = append(sampleValues, sample.Values...)
			}
			histogram, err := createHistogram(d.Label, sampleValues)
			if err != nil {
				// Do not hand back a zero-value Histogram alongside an error.
				return nil, err
			}
			return []Histogram{histogram}, nil
		}

		// Samples are labeled. Create a Histogram for each one.
		histograms := make([]Histogram, 0, len(d.Samples))
		for _, sample := range d.Samples {
			histogram, err := createHistogram(d.Label+"_"+sample.Label, sample.Values)
			if err != nil {
				return nil, err
			}
			histograms = append(histograms, histogram)
		}
		return histograms, nil
	}

	// createHistogram builds a Histogram called name from values.
	//
	// values are assumed to be nanosecond measurements. Two normalizations are
	// applied before the Histogram is assembled:
	//
	// * Every value is divided by 1e6 to convert it to milliseconds, because
	//   Catapult doesn't support nanoseconds.
	// * Every space in name is replaced with an underscore. Catapult forms a
	//   unique key for fetching graph data using the Histogram name; whitespace
	//   breaks this key and causes Catapult to incorrectly process the data.
	//
	// The result carries a newly generated GUID, the unit "ms_smallerIsBetter",
	// and running statistics computed over the converted values.
	//
	// Returns an error if values is empty.
	func createHistogram(name string, values []float64) (Histogram, error) {
		if len(values) == 0 {
			return Histogram{}, errors.New("at least one sample value required")
		}

		// Fuchsia benchmarks report nanoseconds; Catapult wants milliseconds.
		const nanosPerMilli = 1e6
		millis := make([]float64, len(values))
		for i := range values {
			millis[i] = values[i] / nanosPerMilli
		}

		// A negative count tells strings.Replace to substitute every occurrence.
		const replaceAll = -1
		sanitized := strings.Replace(name, " ", "_", replaceAll)

		histogram := Histogram{
			Name:    sanitized,
			Unit:    "ms_smallerIsBetter",
			GUID:    uuid.NewV4().String(),
			NumNans: 0, // All samples are numeric values
			// TODO(kjharland): Compute AllBins.
			MaxNumSampleValues: len(millis),
			Running:            computeRunningStatistics(millis),
		}
		return histogram, nil
	}

	// computeRunningStatistics returns 7 statistics describing values, always in
	// this order:
	//
	// count, max, meanlogs, mean, min, sum, variance
	//
	// meanlogs is the mean of the natural logs of the absolute values of the
	// given values. Since math.Log(0) is -Inf, meanlogs is not finite when any
	// value is zero.
	//
	// https://github.com/catapult-project/catapult/issues/4150
	func computeRunningStatistics(values []float64) []float64 {
		var (
			smallest = math.Inf(1)
			largest  = math.Inf(-1)
			total    float64
			logMean  float64
		)

		for idx := 0; idx < len(values); idx++ {
			sample := values[idx]
			seen := float64(idx + 1)

			smallest = math.Min(smallest, sample)
			largest = math.Max(largest, sample)
			total += sample

			// Fold the new log into the mean as a cumulative moving average:
			// https://en.wikipedia.org/wiki/Moving_average
			logOfAbs := math.Log(math.Abs(sample))
			logMean += (logOfAbs - logMean) / seen
		}

		average, spread := meanVariance(values)
		return []float64{
			float64(len(values)),
			largest,
			logMean,
			average,
			smallest,
			total,
			spread,
		}
	}

	// meanVariance returns the arithmetic mean and the sample variance (the sum of
	// squared deviations divided by n-1) of vals.
	//
	// With fewer than two values the variance is reported as 0.
	func meanVariance(vals []float64) (mean, variance float64) {
		n := float64(len(vals))

		// Pass one: the mean.
		var total float64
		for i := range vals {
			total += vals[i]
		}
		mean = total / n

		// A single value has no variance.
		if len(vals) <= 1 {
			return mean, 0
		}

		// Pass two: accumulate squared deviations from the mean. This is the
		// "Two-pass algorithm" from
		// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
		var squares float64
		for i := range vals {
			delta := vals[i] - mean
			squares += delta * delta
		}

		return mean, squares / (n - 1)
	}

	// checkSampleLabels reports whether the given samples carry labels.
	//
	// It returns true when every sample has a non-empty label and false when every
	// label is empty. A mix of labeled and unlabeled samples, or an empty sample
	// list, is an error.
	func checkSampleLabels(samples []schema.Sample) (bool, error) {
		if len(samples) == 0 {
			return false, errors.New("sample list is empty")
		}

		// The first sample sets the expectation that every other sample must meet.
		wantLabels := samples[0].Label != ""

		for i := 1; i < len(samples); i++ {
			hasLabel := samples[i].Label != ""
			if hasLabel != wantLabels {
				return false, errors.New("some samples are missing labels")
			}
		}

		return wantLabels, nil
	}