# Copyright 2021 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Entrypoint to Sapling bug filing automation.
Query for patchsets that failed exclusively on tryjobs from a specified LUCI
project, and attach the data to a Monorail bug for further human review.
"""
from datetime import datetime, timedelta

from PB.go.chromium.org.luci.buildbucket.proto import common as common_pb2
from PB.recipe_engine import result
from PB.recipes.fuchsia.sapling import InputProperties

PYTHON_VERSION_COMPATIBILITY = "PY3"

DEPS = [
"fuchsia/bigquery",
"fuchsia/buildbucket_util",
"fuchsia/lkg",
"fuchsia/monorail",
"fuchsia/status_check",
"recipe_engine/buildbucket",
"recipe_engine/file",
"recipe_engine/path",
"recipe_engine/step",
"recipe_engine/time",
]
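
# Each entry in DEPS is exposed on the `api` object by its module name, e.g.
# "fuchsia/bigquery" is used below as api.bigquery and "fuchsia/monorail" as
# api.monorail.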

PROPERTIES = InputProperties

# The datetime string format used to pass timestamps between queries and
# build properties.
DATETIME_FMT = "%Y-%m-%d %H:%M:%S"
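
# A minimal illustration (not part of the recipe) of the DATETIME_FMT
# round-trip that get_query_start_time() below relies on to validate
# timestamps; the timestamp value is hypothetical:
#
#   >>> dt = datetime.strptime("2021-08-01 12:30:00", DATETIME_FMT)
#   >>> dt.strftime(DATETIME_FMT)
#   '2021-08-01 12:30:00'
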
SAPLING_QUERY = """
WITH
eq_patchset_attempts AS (
SELECT
-- equivalent_cl_group_key is a UUID for a group of patchsets which are
-- "equivalent", e.g. PS1 and PS2 will share an equivalent_cl_group_key if
-- PS2 is a trivial rebase. It is a consistent value across multiple CQ
-- attempts.
-- We use this to uniquely identify a group of "equivalent" patchsets
-- rather than treating every patchset as unique.
equivalent_cl_group_key,
cl.change,
cl.patchset,
cl.host,
bb.id,
start_time,
status,
COUNTIF(status = 'SUCCESS')
OVER (PARTITION BY equivalent_cl_group_key) AS cq_success,
FROM
`commit-queue.fuchsia.attempts`,
UNNEST(gerrit_changes) AS cl,
UNNEST(builds) AS bb
WHERE
cl.project = "fuchsia"
-- Get all attempts that were either SUCCESS or FAILURE. We'll use this
-- later to filter out flakes.
AND status IN ("SUCCESS", "FAILURE")
),
patchset_info AS (
SELECT
cl.change,
MAX(cl.patchset) AS max_patchset,
MIN(cl.patchset) AS min_patchset,
FROM `commit-queue.fuchsia.attempts`, UNNEST(gerrit_changes) AS cl
WHERE cl.project = "fuchsia"
    GROUP BY cl.change
),
all_details AS (
SELECT
equivalent_cl_group_key,
MAX(change) AS change,
MAX(patchset) AS patchset,
MAX(host) AS host,
      -- This will be "fuchsia" if there's even one "fuchsia" builder, since
      -- MIN takes the lexicographically smallest project string.
      MIN(build_info.builder.project) AS project,
EXTRACT(DATETIME FROM MIN(try.start_time)) AS start_time,
ARRAY_AGG(build_info.builder.builder) AS failed_builders,
ARRAY_AGG(build_info.id) AS bbids,
FROM `cr-buildbucket.fuchsia.builds` AS build_info
JOIN eq_patchset_attempts AS try
ON try.id = build_info.id
WHERE build_info.status = "FAILURE"
-- Exclude any experimental tryjobs.
AND JSON_EXTRACT(build_info.input.properties, "$['$recipe_engine/cq'].experimental") IS NULL
    -- Only use tryjobs with no autocorrelator findings, since findings
    -- indicate a likely false rejection.
AND (
JSON_EXTRACT(build_info.output.properties, "$.num_autocorrelator_findings") IS NULL
OR JSON_EXTRACT(build_info.output.properties, "$.num_autocorrelator_findings") = "0.0"
)
AND cq_success = 0
-- Exclude any tryjobs with checkout failures.
AND JSON_EXTRACT(build_info.output.properties, "$.checkout_failed") IS NULL
-- Exclude any internal tryjobs with build failures. Build failures are a
-- known issue that we don't need to file for. See fxbug.dev/80124 for
-- more context.
AND (
(
JSON_EXTRACT(build_info.output.properties, "$.build_failed") IS NULL
AND JSON_EXTRACT(build_info.output.properties, "$.failed_to_build") IS NULL
)
OR build_info.builder.project = "fuchsia"
)
-- Exclude any tryjobs which have size check failures. Size check failures
-- are a known issue that we don't need to continue to triage. See
-- fxbug.dev/81426 for more context.
AND build_info.summary_markdown != "binary size checks failed"
GROUP BY equivalent_cl_group_key
)
-- If all equivalent patchsets failed with different builders (i.e. a series
-- of different flakes caused the failures), this query may output some
-- flakes. Empirically, however, this is infrequent.
SELECT
ad.change AS change,
  'https://fxrev.dev/c/' || ad.change || '/' || ad.patchset AS patchset,
CASE
WHEN ad.patchset > 1
THEN
'https://fxrev.dev/c/' || ad.change || '/'
|| CAST(ad.patchset - 1 AS string) || '..' || ad.patchset
ELSE NULL
END
AS prev_patchset,
CASE
WHEN ad.patchset < pi.max_patchset
THEN
'https://fxrev.dev/c/' || ad.change || '/' || ad.patchset || '..'
|| CAST(ad.patchset + 1 AS string)
ELSE NULL
END
AS next_patchset,
ARRAY(SELECT DISTINCT * FROM UNNEST(failed_builders)) AS failed_builders,
ARRAY(
SELECT DISTINCT 'https://ci.chromium.org/b/' || bbid || ' '
FROM UNNEST(bbids) AS bbid
) AS build_links,
FROM all_details AS ad
JOIN patchset_info AS pi
ON ad.change = pi.change
WHERE
start_time >= DATETIME '{start_time}'
AND start_time <= DATETIME '{end_time}'
AND project != "fuchsia";
"""
BUG_DESCRIPTION = """
This bug was filed by http://go/tq-sapling-bug-pipeline.
We've identified that {patchset} ran through CQ and failed exclusively on
internal tryjobs. This indicates functionality that is currently covered only
by out-of-platform tests, i.e. a gap in platform test coverage.
Failed builds:
{build_info}
{patchset_info}
More context: http://go/tq-sapling
"""
def RunSteps(api, props):
    # Determine start and end times to pass into the Sapling query.
query_start_time = get_query_start_time(api)
query_end_time = api.time.utcnow().strftime(DATETIME_FMT)
query_output = api.path["start_dir"].join("sapling.json")
api.bigquery.project = "fuchsia-infra"
api.bigquery.query(
"run sapling query",
SAPLING_QUERY.format(
start_time=query_start_time,
end_time=query_end_time,
),
query_output,
)

    # Set query start and end time properties so a future build can consume
    # them.
    presentation = api.step.active_result.presentation
    presentation.properties["query_start_time"] = query_start_time
    presentation.properties["query_end_time"] = query_end_time

rows = api.file.read_json(
"read rows",
query_output,
test_data=[
{
"change": 123456,
"patchset": "https://fxrev.dev/c/123456/2",
"prev_patchset": "https://fxrev.dev/c/123456/1..2",
"next_patchset": "https://fxrev.dev/c/123456/2..3",
"failed_builders": ["foo.x64", "foo.arm64"],
"build_links": [
"https://ci.chromium.org/b/123",
"https://ci.chromium.org/b/456",
],
},
],
)

    if not rows:
        return result.RawResult(
            summary_markdown="No new Sapling bugs to file",
            status=common_pb2.SUCCESS,
        )

    with api.step.nest("file bugs"):
        for row in rows:
            file_monorail_bug(api, row)


def get_query_start_time(api):
    """Determine the Sapling query start time.

    The start time is the `query_end_time` of the last successful build;
    essentially, this lets us pick up exactly where we last left off.

    If there is no successful build in history, or its `query_end_time`
    property is not set, return the current time minus one day.

    Returns:
        str: String representation of the query start time in UTC.
    """
start_time = None
try:
build_id = api.lkg.build(
"get lkg build",
builder=api.buildbucket_util.full_builder_name(),
test_data="889900",
)
build = api.buildbucket.get(
int(build_id),
fields=["output"],
)
start_time = dict(build.output.properties).get("query_end_time")
    except api.step.StepFailure:
        # We will hit this case if this is the first time this builder has
        # ever run, or if it has never succeeded before. Fall back to the
        # default query start time, which is calculated below.
        pass

    start_time = start_time or (api.time.utcnow() - timedelta(days=1)).strftime(
        DATETIME_FMT
    )

# Ensure that the start time matches the expected format.
assert start_time == datetime.strptime(start_time, DATETIME_FMT).strftime(
DATETIME_FMT
)
return start_time


def file_monorail_bug(api, row):
    """File a new Monorail bug for a Sapling query row.

    Args:
        row (dict): Dict representation of a row which conforms to the
            Sapling query schema.
    """
api.monorail.file_bug(
"file bug for %s" % row["patchset"],
"[sapling] Internal-only failures found in %s" % row["patchset"],
BUG_DESCRIPTION.format(
patchset=row["patchset"],
build_info=build_info(row),
patchset_info=patchset_info(row),
),
components=["EngProd>Sapling"],
labels=[
# These bugs contain internal links that are only viewable by
# Googlers.
"Restrict-View-Google",
"sapling-untriaged",
# Attach a label with the change number to aid bug deduplication
# across patchsets.
"sapling-%d" % row["change"],
],
)
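
# For example, with the test_data change number used in RunSteps, patchsets 2
# and 3 of change 123456 would both carry the label "sapling-123456", so
# duplicate reports for the same change can be grouped during triage
# (illustrative values).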


def build_info(row):
    """Return build info for a Sapling query row."""
    lines = [
        "%s: %s" % (builder, link)
        for builder, link in zip(row["failed_builders"], row["build_links"])
    ]
    return "\n".join(lines)


def patchset_info(row):
    """Return available patchset link(s) for a Sapling query row."""
    lines = ["Patchset: %s" % row["patchset"]]
    if row["prev_patchset"]:
        lines.append("Previous patchset: %s" % row["prev_patchset"])
    if row["next_patchset"]:
        lines.append("Next patchset: %s" % row["next_patchset"])
    return "\n".join(lines)


def GenTests(api):
yield api.status_check.test("lkg")
yield api.status_check.test("no_lkg") + api.step_data("get lkg build", retcode=1)
yield api.status_check.test("no_results") + api.step_data(
"read rows", api.file.read_json(None)
)