blob: 0cf6095f0ca330018cd2722c1a68ff2401fe8c11 [file] [log] [blame]
// Copyright 2022 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package main
import (
"context"
_ "embed"
"time"
"cloud.google.com/go/bigquery"
"go.fuchsia.dev/infra/functools"
)
//go:embed queries/nearby_test_results.sql
var nearbyTestResultsQuery string
type nearbyTestResult struct {
// Absolute builder name, in the form "project/bucket/builder".
Builder string
Failed bool
CommitPosition int
}
func getNearbyTestResults(
ctx context.Context,
bqClient *bigquery.Client,
sig failureSignature,
windowEnd time.Time,
) ([]nearbyTestResult, error) {
return runQuery[nearbyTestResult](ctx, bqClient, nearbyTestResultsQuery,
map[string]any{
"test_id": sig.FailedTest,
// Use a smaller window to decrease the likelihood of treating
// failures from separate old breakages as the first failure of the
// current breakage.
"earliest_time": windowEnd.Add(-20 * time.Hour),
"latest_time": windowEnd,
// TODO(olivernewman): don't hardcode this.
"repo": "turquoise-internal.googlesource.com/integration",
},
)
}
// calculateBlamelistDistances computes, for each builder with a certain failure
// mode, the number of builds between each suspect commit and the first build
// (within the time window used by the query) that had that failure mode.
//
// This is analogous to the manual process of "lining up" CI builder blamelists
// to find a culprit.
func calculateBlamelistDistances(results []nearbyTestResult, suspects []suspectCommit) error {
byBuilder := make(map[string][]nearbyTestResult)
for _, tr := range results {
byBuilder[tr.Builder] = append(byBuilder[tr.Builder], tr)
}
// Sort results in chronological order (earliest first).
for _, results := range byBuilder {
functools.SortBy(results, func(tr nearbyTestResult) int {
return tr.CommitPosition
})
}
// TODO(olivernewman): handle the case where a test has broken on separate
// occasions within the time window. It's not easy to distinguish this from
// flakiness, but we can make a best effort at least for high-frequency
// failure modes.
for builder, results := range byBuilder {
firstFailureIdx := -1
for i, result := range results {
if result.Failed {
firstFailureIdx = i
break
}
}
// Skip builders where the failure mode didn't occur at all.
if firstFailureIdx == -1 {
continue
}
for i, suspect := range suspects {
for buildIdx, result := range results {
if result.CommitPosition >= suspect.CommitPosition {
// TODO(olivernewman): Also take blamelist size into
// account. If we are X% confident that a culprit falls
// within a given blamelist of length N, then we're only
// X/N% confident in each member of the blamelist. So that
// confidence will increase as the blamelist size decreases.
dist := firstFailureIdx - buildIdx
suspects[i].BlamelistDistances[builder] = dist
break
}
}
}
}
return nil
}
// scoreBlamelistDistances computes a 0-100 likelihood score for a potential
// culprit commit based on a list of CI builder first-failure blamelist
// distances. It takes the amount of data points into account by incorporating a
// uncertainty level.
func scoreBlamelistDistances(distances []int) int {
if len(distances) == 0 {
return 0
}
var weightedDistances []int
for _, dist := range distances {
if dist < 0 {
// If the commit landed *after* the first failure that's an
// especially good indicator that it's unlikely to be the culprit,
// so apply a large multiplier to that data point (and negate it, so
// the weighted distances are all positive). We simply downweight
// the suspect instead of completely discarding it (by returning
// zero) because it's possible the first failure is a different
// failure mode (e.g. latent flakiness) than the failure mode we're
// trying to diagnose.
dist *= -10
}
weightedDistances = append(weightedDistances, dist)
}
avg := average(weightedDistances)
// Calculate an uncertainty level based on the number of builders that are
// providing data points. If a commit was in the first failure blamelist of
// only builder it may be a coincidence, whereas we get a stronger signal if
// it's in the first failure blamelist of multiple builders.
uncertainty := 12 - 4*(len(distances)-1)
if uncertainty < 0 {
uncertainty = 0
}
score := 100/(avg/2+1) - float64(uncertainty)
if score < 0 {
return 0
}
return int(score)
}