| // Copyright 2022 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| package main |
| |
| import ( |
| "context" |
| _ "embed" |
| "time" |
| |
| "cloud.google.com/go/bigquery" |
| "go.fuchsia.dev/infra/functools" |
| ) |
| |
| //go:embed queries/nearby_test_results.sql |
| var nearbyTestResultsQuery string |
| |
| type nearbyTestResult struct { |
| // Absolute builder name, in the form "project/bucket/builder". |
| Builder string |
| Failed bool |
| CommitPosition int |
| } |
| |
| func getNearbyTestResults( |
| ctx context.Context, |
| bqClient *bigquery.Client, |
| sig failureSignature, |
| windowEnd time.Time, |
| ) ([]nearbyTestResult, error) { |
| return runQuery[nearbyTestResult](ctx, bqClient, nearbyTestResultsQuery, |
| map[string]any{ |
| "test_id": sig.FailedTest, |
| // Use a smaller window to decrease the likelihood of treating |
| // failures from separate old breakages as the first failure of the |
| // current breakage. |
| "earliest_time": windowEnd.Add(-20 * time.Hour), |
| "latest_time": windowEnd, |
| // TODO(olivernewman): don't hardcode this. |
| "repo": "turquoise-internal.googlesource.com/integration", |
| }, |
| ) |
| } |
| |
| // calculateBlamelistDistances computes, for each builder with a certain failure |
| // mode, the number of builds between each suspect commit and the first build |
| // (within the time window used by the query) that had that failure mode. |
| // |
| // This is analogous to the manual process of "lining up" CI builder blamelists |
| // to find a culprit. |
| func calculateBlamelistDistances(results []nearbyTestResult, suspects []suspectCommit) error { |
| byBuilder := make(map[string][]nearbyTestResult) |
| for _, tr := range results { |
| byBuilder[tr.Builder] = append(byBuilder[tr.Builder], tr) |
| } |
| // Sort results in chronological order (earliest first). |
| for _, results := range byBuilder { |
| functools.SortBy(results, func(tr nearbyTestResult) int { |
| return tr.CommitPosition |
| }) |
| } |
| |
| // TODO(olivernewman): handle the case where a test has broken on separate |
| // occasions within the time window. It's not easy to distinguish this from |
| // flakiness, but we can make a best effort at least for high-frequency |
| // failure modes. |
| for builder, results := range byBuilder { |
| firstFailureIdx := -1 |
| for i, result := range results { |
| if result.Failed { |
| firstFailureIdx = i |
| break |
| } |
| } |
| // Skip builders where the failure mode didn't occur at all. |
| if firstFailureIdx == -1 { |
| continue |
| } |
| for i, suspect := range suspects { |
| for buildIdx, result := range results { |
| if result.CommitPosition >= suspect.CommitPosition { |
| // TODO(olivernewman): Also take blamelist size into |
| // account. If we are X% confident that a culprit falls |
| // within a given blamelist of length N, then we're only |
| // X/N% confident in each member of the blamelist. So that |
| // confidence will increase as the blamelist size decreases. |
| dist := firstFailureIdx - buildIdx |
| suspects[i].BlamelistDistances[builder] = dist |
| break |
| } |
| } |
| } |
| } |
| return nil |
| } |
| |
| // scoreBlamelistDistances computes a 0-100 likelihood score for a potential |
| // culprit commit based on a list of CI builder first-failure blamelist |
| // distances. It takes the amount of data points into account by incorporating a |
| // uncertainty level. |
| func scoreBlamelistDistances(distances []int) int { |
| if len(distances) == 0 { |
| return 0 |
| } |
| var weightedDistances []int |
| for _, dist := range distances { |
| if dist < 0 { |
| // If the commit landed *after* the first failure that's an |
| // especially good indicator that it's unlikely to be the culprit, |
| // so apply a large multiplier to that data point (and negate it, so |
| // the weighted distances are all positive). We simply downweight |
| // the suspect instead of completely discarding it (by returning |
| // zero) because it's possible the first failure is a different |
| // failure mode (e.g. latent flakiness) than the failure mode we're |
| // trying to diagnose. |
| dist *= -10 |
| } |
| weightedDistances = append(weightedDistances, dist) |
| } |
| avg := average(weightedDistances) |
| |
| // Calculate an uncertainty level based on the number of builders that are |
| // providing data points. If a commit was in the first failure blamelist of |
| // only builder it may be a coincidence, whereas we get a stronger signal if |
| // it's in the first failure blamelist of multiple builders. |
| uncertainty := 12 - 4*(len(distances)-1) |
| if uncertainty < 0 { |
| uncertainty = 0 |
| } |
| |
| score := 100/(avg/2+1) - float64(uncertainty) |
| if score < 0 { |
| return 0 |
| } |
| return int(score) |
| } |