# Copyright 2021 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Entrypoint to Sapling bug filing automation.
Query for patchsets that failed exclusively on tryjobs from a specified LUCI
project, and attach the data to a Monorail bug for further human review.
"""
from datetime import datetime, timedelta

from PB.go.chromium.org.luci.buildbucket.proto import common as common_pb2
from PB.recipe_engine import result
from PB.recipes.fuchsia.sapling import InputProperties

PYTHON_VERSION_COMPATIBILITY = "PY3"

DEPS = [
"fuchsia/bigquery",
"fuchsia/buildbucket_util",
"fuchsia/lkg",
"fuchsia/monorail",
"fuchsia/status_check",
"recipe_engine/buildbucket",
"recipe_engine/file",
"recipe_engine/path",
"recipe_engine/step",
"recipe_engine/time",
]
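
# Each entry in DEPS is exposed on the `api` object by its module name, e.g.
# "fuchsia/bigquery" is used below as api.bigquery and "fuchsia/monorail" as
# api.monorail.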

PROPERTIES = InputProperties

# The datetime string format used to pass timestamps between queries and
# build properties.
DATETIME_FMT = "%Y-%m-%d %H:%M:%S"
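
# A minimal illustration (not part of the recipe) of the DATETIME_FMT
# round-trip that get_query_start_time() below relies on to validate
# timestamps; the timestamp value is hypothetical:
#
#   >>> dt = datetime.strptime("2021-08-01 12:30:00", DATETIME_FMT)
#   >>> dt.strftime(DATETIME_FMT)
#   '2021-08-01 12:30:00'
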
SAPLING_QUERY = """
WITH
eq_patchset_attempts AS (
SELECT
-- equivalent_cl_group_key is a UUID for a group of patchsets which are
-- "equivalent", e.g. PS1 and PS2 will share an equivalent_cl_group_key if
-- PS2 is a trivial rebase. It is a consistent value across multiple CQ
-- attempts.
-- We use this to uniquely identify a group of "equivalent" patchsets
-- rather than treating every patchset as unique.
equivalent_cl_group_key,
cl.change,
cl.patchset,
cl.host,
bb.id,
start_time,
status,
COUNTIF(status = 'SUCCESS')
OVER (PARTITION BY equivalent_cl_group_key) AS cq_success,
FROM
`commit-queue.fuchsia.attempts`,
UNNEST(gerrit_changes) AS cl,
UNNEST(builds) AS bb
WHERE
cl.project = "fuchsia"
-- Get all attempts that were either SUCCESS or FAILURE. We'll use this
-- later to filter out flakes.
AND status IN ("SUCCESS", "FAILURE")
),
patchset_info AS (
SELECT
cl.change,
MAX(cl.patchset) AS max_patchset,
MIN(cl.patchset) AS min_patchset,
FROM `commit-queue.fuchsia.attempts`, UNNEST(gerrit_changes) AS cl
WHERE cl.project = "fuchsia"
    GROUP BY cl.change
),
all_details AS (
SELECT
equivalent_cl_group_key,
MAX(change) AS change,
MAX(patchset) AS patchset,
MAX(host) AS host,
      -- This will be "fuchsia" if there's even one "fuchsia" builder, since
      -- MIN takes the lexicographically smallest project string.
      MIN(build_info.builder.project) AS project,
EXTRACT(DATETIME FROM MIN(try.start_time)) AS start_time,
ARRAY_AGG(build_info.builder.builder) AS failed_builders,
ARRAY_AGG(build_info.id) AS bbids,
FROM `cr-buildbucket.fuchsia.builds` AS build_info
JOIN eq_patchset_attempts AS try
ON try.id = build_info.id
WHERE build_info.status = "FAILURE"
-- Exclude any experimental tryjobs.
AND JSON_EXTRACT(build_info.input.properties, "$['$recipe_engine/cq'].experimental") IS NULL
    -- Only use tryjobs with no autocorrelator findings, since findings
    -- indicate a likely false rejection.
AND (
JSON_EXTRACT(build_info.output.properties, "$.num_autocorrelator_findings") IS NULL
OR JSON_EXTRACT(build_info.output.properties, "$.num_autocorrelator_findings") = "0.0"
)
AND cq_success = 0
-- Exclude any tryjobs with checkout failures.
AND JSON_EXTRACT(build_info.output.properties, "$.checkout_failed") IS NULL
-- Exclude any internal tryjobs with build failures. Build failures are a
-- known issue that we don't need to file for. See fxbug.dev/80124 for
-- more context.
AND (
(
JSON_EXTRACT(build_info.output.properties, "$.build_failed") IS NULL
AND JSON_EXTRACT(build_info.output.properties, "$.failed_to_build") IS NULL
)
OR build_info.builder.project = "fuchsia"
)
-- Exclude any tryjobs which have size check failures. Size check failures
-- are a known issue that we don't need to continue to triage. See
-- fxbug.dev/81426 for more context.
AND build_info.summary_markdown != "binary size checks failed"
GROUP BY equivalent_cl_group_key
)
-- If all equivalent patchsets failed with different builders (i.e. a series
-- of different flakes caused the failures), this query may output some
-- flakes. Empirically, however, this is infrequent.
SELECT
ad.change AS change,
  'https://fxrev.dev/c/' || ad.change || '/' || ad.patchset AS patchset,
CASE
WHEN ad.patchset > 1
THEN
'https://fxrev.dev/c/' || ad.change || '/'
|| CAST(ad.patchset - 1 AS string) || '..' || ad.patchset
ELSE NULL
END
AS prev_patchset,
CASE
WHEN ad.patchset < pi.max_patchset
THEN
'https://fxrev.dev/c/' || ad.change || '/' || ad.patchset || '..'
|| CAST(ad.patchset + 1 AS string)
ELSE NULL
END
AS next_patchset,
ARRAY(SELECT DISTINCT * FROM UNNEST(failed_builders)) AS failed_builders,
ARRAY(
SELECT DISTINCT 'https://ci.chromium.org/b/' || bbid || ' '
FROM UNNEST(bbids) AS bbid
) AS build_links,
FROM all_details AS ad
JOIN patchset_info AS pi
ON ad.change = pi.change
WHERE
start_time >= DATETIME '{start_time}'
AND start_time <= DATETIME '{end_time}'
AND project != "fuchsia";
"""
BUG_DESCRIPTION = """
This bug was filed by http://go/tq-sapling-bug-pipeline.
We've identified that {patchset} ran through CQ and failed exclusively on
internal tryjobs. This indicates functionality that is currently covered only
by out-of-platform tests, i.e. a gap in platform test coverage.
Failed builds:
{build_info}
{patchset_info}
More context: http://go/tq-sapling
"""
def RunSteps(api, props):
    # Determine start and end times to pass into the Sapling query.
query_start_time = get_query_start_time(api)
query_end_time = api.time.utcnow().strftime(DATETIME_FMT)
query_output = api.path["start_dir"].join("sapling.json")
api.bigquery.project = "fuchsia-infra"
api.bigquery.query(
"run sapling query",
SAPLING_QUERY.format(
start_time=query_start_time,
end_time=query_end_time,
),
query_output,
)

    # Set query start and end time properties so a future build can consume
    # them.
    presentation = api.step.active_result.presentation
    presentation.properties["query_start_time"] = query_start_time
    presentation.properties["query_end_time"] = query_end_time

rows = api.file.read_json(
"read rows",
query_output,
test_data=[
{
"change": 123456,
"patchset": "https://fxrev.dev/c/123456/2",
"prev_patchset": "https://fxrev.dev/c/123456/1..2",
"next_patchset": "https://fxrev.dev/c/123456/2..3",
"failed_builders": ["foo.x64", "foo.arm64"],
"build_links": [
"https://ci.chromium.org/b/123",
"https://ci.chromium.org/b/456",
],
},
],
)

    if not rows:
        return result.RawResult(
            summary_markdown="No new Sapling bugs to file",
            status=common_pb2.SUCCESS,
        )

    with api.step.nest("file bugs"):
        for row in rows:
            file_monorail_bug(api, row)


def get_query_start_time(api):
    """Determine the Sapling query start time.

    The start time is the `query_end_time` of the last successful build;
    essentially, this lets us pick up exactly where we last left off.

    If there is no successful build in history, or its `query_end_time`
    property is not set, return the current time minus one day.

    Returns:
        str: String representation of the query start time in UTC.
    """
start_time = None
try:
build_id = api.lkg.build(
"get lkg build",
builder=api.buildbucket_util.full_builder_name(),
test_data="889900",
)
build = api.buildbucket.get(
int(build_id),
fields=["output"],
)
start_time = dict(build.output.properties).get("query_end_time")
    except api.step.StepFailure:
        # We will hit this case if this is the first time this builder has
        # ever run, or if it has never succeeded before. Fall back to the
        # default query start time, which is calculated below.
        pass

    start_time = start_time or (api.time.utcnow() - timedelta(days=1)).strftime(
        DATETIME_FMT
    )

# Ensure that the start time matches the expected format.
assert start_time == datetime.strptime(start_time, DATETIME_FMT).strftime(
DATETIME_FMT
)
return start_time


def file_monorail_bug(api, row):
    """File a new Monorail bug for a Sapling query row.

    Args:
        row (dict): Dict representation of a row which conforms to the
            Sapling query schema.
    """
api.monorail.file_bug(
"file bug for %s" % row["patchset"],
"[sapling] Internal-only failures found in %s" % row["patchset"],
BUG_DESCRIPTION.format(
patchset=row["patchset"],
build_info=build_info(row),
patchset_info=patchset_info(row),
),
components=["EngProd>Sapling"],
labels=[
# These bugs contain internal links that are only viewable by
# Googlers.
"Restrict-View-Google",
"sapling-untriaged",
# Attach a label with the change number to aid bug deduplication
# across patchsets.
"sapling-%d" % row["change"],
],
)
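
# For example, with the test_data change number used in RunSteps, patchsets 2
# and 3 of change 123456 would both carry the label "sapling-123456", so
# duplicate reports for the same change can be grouped during triage
# (illustrative values).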


def build_info(row):
    """Return build info for a Sapling query row."""
    lines = [
        "%s: %s" % (builder, link)
        for builder, link in zip(row["failed_builders"], row["build_links"])
    ]
    return "\n".join(lines)


def patchset_info(row):
    """Return available patchset link(s) for a Sapling query row."""
    lines = ["Patchset: %s" % row["patchset"]]
    if row["prev_patchset"]:
        lines.append("Previous patchset: %s" % row["prev_patchset"])
    if row["next_patchset"]:
        lines.append("Next patchset: %s" % row["next_patchset"])
    return "\n".join(lines)


def GenTests(api):
yield api.status_check.test("lkg")
yield api.status_check.test("no_lkg") + api.step_data("get lkg build", retcode=1)
yield api.status_check.test("no_results") + api.step_data(
"read rows", api.file.read_json(None)
)