| # Copyright 2021 The Fuchsia Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Entrypoint to Sapling bug filing automation. |
| |
| Query for patchsets that failed exclusively on tryjobs from a specified LUCI |
| project, and attach the data to a Monorail bug for further human review. |
| """ |
| |
| from datetime import datetime, timedelta |
| |
| from PB.recipe_engine import result |
| from PB.go.chromium.org.luci.buildbucket.proto import common as common_pb2 |
| |
| DEPS = [ |
| "fuchsia/bigquery", |
| "fuchsia/buildbucket_util", |
| "fuchsia/lkg", |
| "fuchsia/monorail", |
| "fuchsia/status_check", |
| "recipe_engine/buildbucket", |
| "recipe_engine/file", |
| "recipe_engine/path", |
| "recipe_engine/step", |
| "recipe_engine/time", |
| ] |
| |
# The datetime string format used in queries and in output properties.
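# Example: "2021-08-06 12:34:56".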
| DATETIME_FMT = "%Y-%m-%d %H:%M:%S" |
| |
| SAPLING_QUERY = """ |
| WITH |
| eq_patchset_attempts AS ( |
| SELECT |
| -- equivalent_cl_group_key is a UUID for a group of patchsets which are |
| -- "equivalent", e.g. PS1 and PS2 will share an equivalent_cl_group_key if |
| -- PS2 is a trivial rebase. It is a consistent value across multiple CQ |
| -- attempts. |
| -- We use this to uniquely identify a group of "equivalent" patchsets |
| -- rather than treating every patchset as unique. |
| equivalent_cl_group_key, |
| cl.change, |
| cl.patchset, |
| cl.host, |
| bb.id, |
| start_time, |
| status, |
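      -- Count successful CQ attempts across the whole equivalent-patchset
      -- group; groups with any success are dropped later (cq_success = 0)
      -- so that flaky failures are filtered out.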
| COUNTIF(status = 'SUCCESS') |
| OVER (PARTITION BY equivalent_cl_group_key) AS cq_success, |
| FROM |
| `commit-queue.fuchsia.attempts`, |
| UNNEST(gerrit_changes) AS cl, |
| UNNEST(builds) AS bb |
| WHERE |
| cl.project = "fuchsia" |
| -- Get all attempts that were either SUCCESS or FAILURE. We'll use this |
| -- later to filter out flakes. |
| AND status IN ("SUCCESS", "FAILURE") |
| ), |
| patchset_info AS ( |
| SELECT |
| cl.change, |
| MAX(cl.patchset) AS max_patchset, |
| MIN(cl.patchset) AS min_patchset, |
| FROM `commit-queue.fuchsia.attempts`, UNNEST(gerrit_changes) AS cl |
| WHERE cl.project = "fuchsia" |
  GROUP BY cl.change
| ), |
| all_details AS ( |
| SELECT |
| equivalent_cl_group_key, |
| MAX(change) AS change, |
| MAX(patchset) AS patchset, |
| MAX(host) AS host, |
    -- MIN() compares strings lexicographically, so this will be "fuchsia"
    -- if there's even one "fuchsia" builder in the group.
    MIN(build_info.builder.project) AS project,
| EXTRACT(DATETIME FROM MIN(try.start_time)) AS start_time, |
| ARRAY_AGG(build_info.builder.builder) AS failed_builders, |
| ARRAY_AGG(build_info.id) AS bbids, |
| FROM `cr-buildbucket.fuchsia.builds` AS build_info |
| JOIN eq_patchset_attempts AS try |
| ON try.id = build_info.id |
| WHERE build_info.status = "FAILURE" |
| -- Exclude any experimental tryjobs. |
| AND JSON_EXTRACT(build_info.input.properties, "$['$recipe_engine/cq'].experimental") IS NULL |
| -- Only use tryjobs with no autocorrelator findings. Findings indicate |
| -- false rejection. |
| AND ( |
| JSON_EXTRACT(build_info.output.properties, "$.num_autocorrelator_findings") IS NULL |
| OR JSON_EXTRACT(build_info.output.properties, "$.num_autocorrelator_findings") = "0.0" |
| ) |
| AND cq_success = 0 |
| -- Exclude any tryjobs with checkout failures. |
| AND JSON_EXTRACT(build_info.output.properties, "$.checkout_failed") IS NULL |
| -- Exclude any internal tryjobs with build failures. Build failures are a |
| -- known issue that we don't need to file for. See fxbug.dev/80124 for |
| -- more context. |
| AND ( |
| ( |
| JSON_EXTRACT(build_info.output.properties, "$.build_failed") IS NULL |
| AND JSON_EXTRACT(build_info.output.properties, "$.failed_to_build") IS NULL |
| ) |
| OR build_info.builder.project = "fuchsia" |
| ) |
| -- Exclude any tryjobs which have size check failures. Size check failures |
| -- are a known issue that we don't need to continue to triage. See |
| -- fxbug.dev/81426 for more context. |
| AND build_info.summary_markdown != "binary size checks failed" |
| GROUP BY equivalent_cl_group_key |
| ) |
| |
| -- If all equivalent patchsets failed with different builders i.e. a series of |
| -- different flakes caused a failure, this query may output some flakes. |
| -- Empirically however, this does not seem to be very frequent. |
| SELECT |
| ad.change AS change, |
  'https://fxrev.dev/c/' || CAST(ad.change AS string) || '/'
    || CAST(ad.patchset AS string)
    AS patchset,
| CASE |
| WHEN ad.patchset > 1 |
| THEN |
        'https://fxrev.dev/c/' || CAST(ad.change AS string) || '/'
          || CAST(ad.patchset - 1 AS string) || '..'
          || CAST(ad.patchset AS string)
| ELSE NULL |
| END |
| AS prev_patchset, |
| CASE |
| WHEN ad.patchset < pi.max_patchset |
| THEN |
        'https://fxrev.dev/c/' || CAST(ad.change AS string) || '/'
          || CAST(ad.patchset AS string) || '..'
          || CAST(ad.patchset + 1 AS string)
| ELSE NULL |
| END |
| AS next_patchset, |
| ARRAY(SELECT DISTINCT * FROM UNNEST(failed_builders)) AS failed_builders, |
| ARRAY( |
    SELECT DISTINCT 'https://ci.chromium.org/b/' || CAST(bbid AS string) || ' '
| FROM UNNEST(bbids) AS bbid |
| ) AS build_links, |
| FROM all_details AS ad |
| JOIN patchset_info AS pi |
| ON ad.change = pi.change |
| WHERE |
| start_time >= DATETIME '{start_time}' |
| AND start_time <= DATETIME '{end_time}' |
| AND project != "fuchsia"; |
| """ |
| |
| BUG_DESCRIPTION = """ |
| This bug was filed by http://go/tq-sapling-bug-pipeline. |
| |
We've identified that {patchset} ran through CQ and failed exclusively on
internal tryjobs. This indicates a gap in platform test coverage: the
affected functionality is currently exercised only by out-of-platform tests.
| |
| Failed builds: |
| {build_info} |
| |
| {patchset_info} |
| |
| More context: http://go/tq-sapling |
| """ |
| |
| |
| def RunSteps(api): |
| # Determine start and end times to pass into Sapling query. |
| query_start_time = get_query_start_time(api) |
| query_end_time = api.time.utcnow().strftime(DATETIME_FMT) |
| |
| query_output = api.path["start_dir"].join("sapling.json") |
| api.bigquery.project = "fuchsia-infra" |
| api.bigquery.query( |
| "run sapling query", |
| SAPLING_QUERY.format( |
| start_time=query_start_time, |
| end_time=query_end_time, |
| ), |
| query_output, |
| ) |
| |
| # Set query start and end time properties so a future build can consume |
| # them. |
| presentation = api.step.active_result.presentation |
| presentation.properties["query_start_time"] = query_start_time |
| presentation.properties["query_end_time"] = query_end_time |
| |
| rows = api.file.read_json( |
| "read rows", |
| query_output, |
| test_data=[ |
| { |
| "change": 123456, |
| "patchset": "https://fxrev.dev/c/123456/2", |
| "prev_patchset": "https://fxrev.dev/c/123456/1..2", |
| "next_patchset": "https://fxrev.dev/c/123456/2..3", |
| "failed_builders": ["foo.x64", "foo.arm64"], |
| "build_links": [ |
| "https://ci.chromium.org/b/123", |
| "https://ci.chromium.org/b/456", |
| ], |
| }, |
| ], |
| ) |
| if not rows: |
| return result.RawResult( |
| summary_markdown="No new Sapling bugs to file", |
| status=common_pb2.SUCCESS, |
| ) |
    with api.step.nest("file bugs"):
| for row in rows: |
| file_monorail_bug(api, row) |
| |
| |
| def get_query_start_time(api): |
| """Determine the Sapling query start time, which is the query end time of |
| the last successful build. Essentially, this lets us pick up exactly where |
| we last left off. |
| |
| If there is no successful build in history, or its `query_end_time` property |
| is not set, return the current time minus one day. |
| |
| Returns: |
| str: string representation of the query start time in UTC. |
| """ |
| start_time = None |
| try: |
| build_id = api.lkg.build( |
| "get lkg build", |
| builder=api.buildbucket_util.full_builder_name(), |
| test_data="889900", |
| ) |
| build = api.buildbucket.get( |
| int(build_id), |
| fields=["output"], |
| ) |
| start_time = dict(build.output.properties).get("query_end_time") |
    except api.step.StepFailure:
        # We hit this case if this is the first time this build has ever
        # run, or if the builder has never succeeded before. Fall back to
        # the default query start time, which is calculated below.
        pass
| |
| start_time = start_time or (api.time.utcnow() - timedelta(days=1)).strftime( |
| DATETIME_FMT |
| ) |
| # Ensure that the start time matches the expected format. |
| assert start_time == datetime.strptime(start_time, DATETIME_FMT).strftime( |
| DATETIME_FMT |
| ) |
| return start_time |
| |
| |
| def file_monorail_bug(api, row): |
| """File a new Monorail bug for a Sapling query row. |
| |
| Args: |
| row (dict): Dict representation of a row which conforms to the Sapling |
| query schema. |
| """ |
| api.monorail.file_bug( |
| "file bug for %s" % row["patchset"], |
| "[sapling] Internal-only failures found in %s" % row["patchset"], |
| BUG_DESCRIPTION.format( |
| patchset=row["patchset"], |
| build_info=build_info(row), |
| patchset_info=patchset_info(row), |
| ), |
| components=["EngProd>Sapling"], |
| labels=[ |
| # These bugs contain internal links that are only viewable by |
| # Googlers. |
| "Restrict-View-Google", |
| "sapling-untriaged", |
| # Attach a label with the change number to aid bug deduplication |
| # across patchsets. |
| "sapling-%d" % row["change"], |
| ], |
| ) |
| |
| |
| def build_info(row): |
| """Return build info for a Sapling query row.""" |
    lines = [
        "%s: %s" % (builder, link)
        for builder, link in zip(row["failed_builders"], row["build_links"])
    ]
    return "\n".join(lines)
| |
| |
| def patchset_info(row): |
| """Return available patchset link(s) for a Sapling query row.""" |
    lines = ["Patchset: %s" % row["patchset"]]
    if row["prev_patchset"]:
        lines.append("Previous patchset: %s" % row["prev_patchset"])
    if row["next_patchset"]:
        lines.append("Next patchset: %s" % row["next_patchset"])
    return "\n".join(lines)
| |
| |
| def GenTests(api): |
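    # Cover three cases: an LKG build exists, the "get lkg build" step fails
    # because no LKG build exists, and the query returns no rows.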
| yield api.status_check.test("lkg") |
| yield api.status_check.test("no_lkg") + api.step_data("get lkg build", retcode=1) |
| yield api.status_check.test("no_results") + api.step_data( |
| "read rows", api.file.read_json(None) |
| ) |