blob: 578b569972e87f15e1890d0b67725fa808f37274 [file] [log] [blame]
# Copyright 2020 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Recipe for closing the tree when builders in a specified Milo console fail."""
import datetime
from future.moves.urllib.parse import urlparse
import hashlib
import re
import textwrap
import attr
from recipe_engine import post_process
from PB.go.chromium.org.luci.buildbucket.proto import build as build_pb2
from PB.go.chromium.org.luci.buildbucket.proto import builder as builder_pb2
from PB.go.chromium.org.luci.buildbucket.proto import (
builds_service as builds_service_pb2,
)
from PB.go.chromium.org.luci.buildbucket.proto import common as common_pb2
from PB.go.chromium.org.luci.milo.api.config import project as milo_pb2
from PB.recipe_engine.result import RawResult
from PB.recipes.fuchsia.tree_closer import InputProperties
# Not referenced elsewhere in this file; presumably read by the recipe engine
# to pick the interpreter -- TODO confirm.
PYTHON_VERSION_COMPATIBILITY = "PY3"
# Recipe modules this file reaches through `api.<module>` (for example
# `api.tree_status`, `api.monorail`, `api.gitiles`).
DEPS = [
    "fuchsia/gitiles",
    "fuchsia/luci_config",
    "fuchsia/monorail",
    "fuchsia/status_check",
    "fuchsia/tree_status",
    "recipe_engine/buildbucket",
    "recipe_engine/context",
    "recipe_engine/json",
    "recipe_engine/properties",
    "recipe_engine/raw_io",
    "recipe_engine/step",
    "recipe_engine/time",
]
# Proto message describing the input properties; RunSteps receives an instance
# as `props` (presumably wired up by the recipe engine).
PROPERTIES = InputProperties
# How long the tree must have been open before this recipe may close it again.
# Used when the `grace_period_seconds` property produces a zero timedelta.
DEFAULT_GRACE_PERIOD = datetime.timedelta(hours=2)
# The format used for the "date" field in HTTP responses from requests to the
# tree status page.
TREE_STATUS_DATE_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
# If a builder's latest build has this property set to true, the builder will
# not be considered when deciding whether to close the tree.
IGNORE_BUILDER_PROPERTY = "_tree_closer_ignore"
def RunSteps(api, props):
    """Closes the tree when the configured Milo console is unhealthy.

    The recipe exits early with a SUCCESS result when it is disabled for
    today's date, when no rules apply to the current time of day, when the
    tree is already closed or was updated within the grace period, or when
    no rule matches the console's recent builds. Otherwise it files a bug,
    closes the tree with a message linking to that bug, and returns a
    FAILURE result whose summary is the bug title.

    Args:
        api: Recipe API object.
        props (InputProperties): Input properties for this recipe.

    Returns:
        A RawResult describing what the recipe decided.
    """
    now = los_angeles_now(api)
    today = now.date()

    # Lazily parse the configured dates and stop at the first one matching
    # today, so later entries are never parsed once a match is found.
    disabled_date = next(
        (d for d in props.disable_on_dates if today == datetime.date.fromisoformat(d)),
        None,
    )
    if disabled_date is not None:
        return RawResult(
            summary_markdown="disabled today (%s)" % disabled_date,
            status=common_pb2.SUCCESS,
        )

    if during_working_hours(now):
        rules = props.working_hour_rules
    else:
        rules = props.off_hour_rules

    # A zero timedelta is falsy, which is how an unset property falls back to
    # the default grace period.
    configured_grace = datetime.timedelta(seconds=props.grace_period_seconds)
    grace_period = configured_grace if configured_grace else DEFAULT_GRACE_PERIOD

    if not rules:  # pragma: no cover
        return RawResult(
            summary_markdown="no rules to check",
            status=common_pb2.SUCCESS,
        )

    # NOTE(review): the extent of this `with` block is assumed to cover the
    # remainder of the function -- confirm against the canonical source.
    with api.context(infra_steps=True):
        tree_closable = can_close_tree(api, props.tree_status_host, grace_period)
        if not tree_closable:
            # Either the tree is closed already or its status changed too
            # recently; in both cases the builders need not be inspected.
            return RawResult(
                summary_markdown="tree is already closed or recently opened",
                status=common_pb2.SUCCESS,
            )

        console = api.luci_config.get_milo_console(
            props.console_name, project=props.project
        )
        assert console, "no console with name %r" % props.console_name

        with api.step.nest("check console health"):
            ignore_patterns = [re.compile(r) for r in props.summary_regexp_ignore]
            closure_reason = get_tree_closure_reason(
                api, console, rules, ignore_patterns
            )

        if not closure_reason:
            # The builders are green enough; leave the tree open.
            return RawResult(
                summary_markdown="console is healthy", status=common_pb2.SUCCESS
            )

        title = closure_reason.bug_title()

        # Surface the bug title as a failed (but non-raising) step so that
        # the reason for the closure is visible on the build results page.
        api.step.empty(title, status=api.step.FAILURE, raise_on_failure=False)

        bug_link = file_bug(
            api, console, closure_reason, props.bug_components, props.bug_labels
        )
        api.tree_status.close_tree(
            message="Tree is closed: %s" % bug_link,
            hostname=props.tree_status_host,
            password=props.tree_status_password,
        )
        return RawResult(summary_markdown=title, status=common_pb2.FAILURE)
class TreeClosureReason:  # pragma: no cover
    """Interface for objects that explain why the tree is being closed.

    Both methods raise NotImplementedError here; subclasses override them.
    """

    def bug_title(self):
        """Returns the title of the bug to file."""
        raise NotImplementedError

    def bug_description(self, repo_head_url):
        """Returns the description of the bug to file.

        Takes a URL pointing to HEAD of the repo that triggered the tree
        closure, for help with debugging. This commit will often be slightly
        later than the commit(s) that triggered the failing builder(s).
        """
        raise NotImplementedError
@attr.s
class ReasonConsecutiveFailures(TreeClosureReason):
    """Closure reason: a single builder failed several builds in a row."""

    # Name of the failing builder. Constructed with the keyword `builder=`.
    _builder = attr.ib(type=str)
    # The failed builds. Constructed with the keyword `failed_builds=`; the
    # first element supplies the builder ID used for the history link.
    _failed_builds = attr.ib(type=[build_pb2.Build])

    def bug_title(self):
        """Returns a one-line summary naming the builder and failure count."""
        return "%s failed %d times in a row" % (
            self._builder,
            len(self._failed_builds),
        )

    def bug_description(self, repo_head_url):
        """Returns the multi-line bug description.

        Args:
            repo_head_url (str): URL of the repo's HEAD commit, included in
                the description to help with debugging.

        Returns:
            A newline-joined string: a header followed by one bullet per
            failed build, each optionally followed by the build's quoted
            summary.
        """
        # Dedent the template *before* interpolating so that the margin is
        # computed from the template alone; a substituted value containing a
        # newline can therefore never prevent the indentation from being
        # stripped.
        template = textwrap.dedent(
            """\
            The tree was closed because {builder} failed {failures} times in a row.
            Latest commit: {repo_head_url}
            Full builder history: {builder_link}
            Failed builds:
            """
        )
        header = template.format(
            builder=self._builder,
            failures=len(self._failed_builds),
            repo_head_url=repo_head_url,
            builder_link=builder_link(self._failed_builds[0].builder, limit=200),
        )
        lines = header.splitlines()
        for build in self._failed_builds:
            lines.append("- %s" % build_link(build))
            if build.summary_markdown:
                lines.extend(formatted_summary_lines(build))
        return "\n".join(lines)
@attr.s
class ReasonConcurrentFailures(TreeClosureReason):
    """Closure reason: several builders are failing at the same time."""

    # Maps each failing builder's name to the build reported for it.
    _failed_builds_by_builder = attr.ib(type={str: build_pb2.Build})

    def bug_title(self):
        """Returns a one-line summary with the number of failing builders."""
        return "%d builders are failing" % len(self._failed_builds_by_builder)

    def bug_description(self, repo_head_url):
        """Returns the multi-line bug description.

        Args:
            repo_head_url (str): URL of the repo's HEAD commit, included in
                the description to help with debugging.

        Returns:
            A newline-joined string: a header followed by one bullet per
            failing builder (sorted by builder name), each optionally
            followed by the build's quoted summary.
        """
        # Dedent the template *before* interpolating so that the margin is
        # computed from the template alone; a substituted value containing a
        # newline can therefore never prevent the indentation from being
        # stripped.
        template = textwrap.dedent(
            """\
            The tree was closed because {failures} builders are failing.
            Latest commit: {repo_head_url}
            Failing builders:
            """
        )
        header = template.format(
            failures=len(self._failed_builds_by_builder),
            repo_head_url=repo_head_url,
        )
        lines = header.splitlines()
        for builder, latest_build in sorted(self._failed_builds_by_builder.items()):
            lines.append("- %s: %s" % (builder, build_link(latest_build)))
            if latest_build.summary_markdown:
                lines.extend(formatted_summary_lines(latest_build))
        return "\n".join(lines)
def build_link(build):
    """Returns the URL of `build`: its builder's URL plus "/b<build id>"."""
    parent_url = builder_link(build.builder)
    return "%s/b%d" % (parent_url, build.id)
def builder_link(builder, limit=None):
    """Returns the ci.chromium.org URL for a builder.

    Args:
        builder: Object with `project`, `bucket` and `builder` attributes.
        limit (int): If truthy, appended to the URL as a `?limit=` query
            parameter.
    """
    base = "https://ci.chromium.org/p/%s/builders/%s/%s" % (
        builder.project,
        builder.bucket,
        builder.builder,
    )
    if not limit:
        return base
    return base + "?limit=%d" % limit
def formatted_summary_lines(build):
    """Returns the build's summary as quoted lines padded by empty lines.

    Each line of `build.summary_markdown` is prefixed with " > "; an empty
    string is placed before and after the quoted lines.
    """
    quoted = [" > " + text for text in build.summary_markdown.split("\n")]
    return [""] + quoted + [""]
def get_tree_closure_reason(api, console, rules, summary_regexp_ignore):
    """Determines if and why the tree should be closed.

    Args:
        api: Recipe API object, used to query Buildbucket.
        console: The Milo console whose `builders` are checked.
        rules (seq of Rule proto): The conditions under which the tree should
            be closed.
        summary_regexp_ignore (seq of re.Pattern): Any build whose summary
            matches one of these regexes will not be considered for the
            purpose of closing the tree.

    Returns:
        A TreeClosureReason if the tree should be closed, otherwise None.
    """
    concurrent_rules = [rule for rule in rules if rule.concurrent_failures > 0]
    consecutive_rules = [rule for rule in rules if rule.consecutive_failures > 0]

    # Look back as far as the most demanding consecutive-failures rule needs.
    # With only concurrent-failures rules, the latest build of each builder
    # is all that matters.
    history_depth = max(
        (rule.consecutive_failures for rule in consecutive_rules), default=1
    )
    # Over-fetch, because some builds may be dropped for matching one of
    # `summary_regexp_ignore`.
    fetch_count = max(5, history_depth * 2)

    # Builder name -> latest considered build, for builders whose latest
    # considered build did not succeed.
    latest_failure_by_builder = {}

    # TODO(olivernewman): Combine the search requests into a single batch RPC
    # request rather than doing them serially. This will likely be
    # significantly more efficient.
    for console_builder in console.builders:
        history = last_n_builds(api, console_builder, fetch_count)

        # Rules only concern failed builds; skip builders with none (this
        # also skips builders with no builds at all).
        if not any(b.status != common_pb2.SUCCESS for b in history):
            continue

        # Buildbucket should already return newest-first; sort defensively.
        history.sort(key=lambda b: b.start_time.seconds, reverse=True)

        # `properties` is a protobuf Struct, which doesn't support `get()`,
        # hence the conversion to a plain dict.
        newest_props = dict(history[0].input.properties.items())
        if newest_props.get(IGNORE_BUILDER_PROPERTY, False) is True:
            continue

        considered = []
        for b in history:
            if any(p.search(b.summary_markdown) for p in summary_regexp_ignore):
                continue
            considered.append(b)
        considered = considered[:history_depth]

        # Nothing left to judge (every recent build was ignored), or the
        # latest remaining build passed: this builder cannot close the tree.
        if not considered or considered[0].status == common_pb2.SUCCESS:
            continue

        short_name = console_builder.name.split("/")[-1]
        latest_failure_by_builder[short_name] = considered[0]

        for rule in consecutive_rules:
            watched = rule.builders_to_check
            if watched and short_name not in watched:
                # The rule is restricted to other builders.
                continue
            needed = rule.consecutive_failures
            if len(considered) < needed:
                # Not enough history yet to evaluate this rule.
                continue
            window = considered[:needed]
            # Every build in the window must have failed *and* contain a
            # failed step matching the rule's regexp.
            if all(
                b.status != common_pb2.SUCCESS
                and find_failed_step_match(b, rule.failed_step_regexp)
                for b in window
            ):
                return ReasonConsecutiveFailures(
                    builder=short_name,
                    failed_builds=window,
                )

    for rule in concurrent_rules:
        # Only the latest build of each builder counts towards concurrent
        # failures.
        matching = {
            name: build
            for name, build in latest_failure_by_builder.items()
            if find_failed_step_match(build, rule.failed_step_regexp)
        }
        if len(matching) >= rule.concurrent_failures:
            return ReasonConcurrentFailures(matching)

    return None
def find_failed_step_match(build, step_regex):
    """Returns the first non-successful step whose name matches `step_regex`.

    Steps are examined in the order they appear in `build.steps`; the name is
    matched with `re.search`. Returns None when no such step exists.
    """
    failed_matches = (
        candidate
        for candidate in build.steps
        if candidate.status != common_pb2.SUCCESS
        and re.search(step_regex, candidate.name)
    )
    return next(failed_matches, None)
def last_n_builds(api, console_builder, n):
    """Searches Buildbucket for up to `n` completed builds of a builder.

    Args:
        api: Recipe API object.
        console_builder: Milo console builder whose `name` has the form
            "buildbucket/luci.{project}.{bucket}/{builder}".
        n (int): Passed to the search as its `limit`.

    Returns:
        Whatever `api.buildbucket.search` returns for the query.
    """
    # Unpacking enforces the expected three-part shape of both the name and
    # the "luci.{project}.{bucket}" component.
    name_parts = console_builder.name.split("/")
    _, luci_bucket, builder_name = name_parts
    _, project, bucket = luci_bucket.split(".", 2)

    builder_id = builder_pb2.BuilderID(
        builder=builder_name,
        bucket=bucket,
        project=project,
    )
    predicate = builds_service_pb2.BuildPredicate(
        builder=builder_id,
        status=common_pb2.ENDED_MASK,  # Include only completed builds.
    )

    # Step names/statuses and the summary are needed on top of the defaults.
    extra_fields = {"steps.*.name", "steps.*.status", "summary_markdown"}
    fields = api.buildbucket.DEFAULT_FIELDS.union(extra_fields)

    return api.buildbucket.search(
        predicate, limit=n, step_name=builder_name, fields=fields
    )
def file_bug(api, console, closure_reason, bug_components, bug_labels):
    """Files the tracking bug for a tree closure and returns the result.

    Every tree closure needs a tracking bug linked from the tree status. The
    title and description come from `closure_reason`; the description is
    given the URL of the console repo's HEAD commit.

    Returns:
        The value returned by `api.monorail.file_bug`.
    """
    repo_head_url = get_head_url(api, console)
    title = closure_reason.bug_title()
    description = closure_reason.bug_description(repo_head_url)
    return api.monorail.file_bug(
        "create monorail bug",
        title,
        description,
        components=bug_components,
        labels=bug_labels,
    )
def can_close_tree(api, tree_hostname, grace_period):
    """Returns whether the tree is eligible to be closed right now.

    The tree is eligible only if its current status is open and the time
    elapsed between the status's `date` and the current UTC time is strictly
    greater than `grace_period`.

    Args:
        api: Recipe API object.
        tree_hostname (str): Host of the tree status app to query.
        grace_period (datetime.timedelta): Minimum age of the open status.
    """
    tree = api.tree_status.get(tree_hostname)
    if tree.open:
        return api.time.utcnow() - tree.date > grace_period
    # Already closed: nothing to do.
    return False
def los_angeles_now(api):
    """Returns the current time in Los Angeles as a datetime.

    The `time` recipe module and Python's stdlib don't have good timezone
    support, so this runs the unix `date` command with TZ set to
    "America/Los_Angeles" and parses its ISO 8601 output.
    """
    la_env = {"TZ": "America/Los_Angeles"}
    date_cmd = ["date", "--iso-8601=minutes"]
    with api.context(env=la_env):
        result = api.step(
            "get current time",
            date_cmd,
            stdout=api.raw_io.output_text(add_output_log=True),
            step_test_data=lambda: api.raw_io.test_api.stream_output_text(
                "2021-12-01T09:53"
            ),
        )
    timestamp = result.stdout.strip()
    return datetime.datetime.fromisoformat(timestamp)
def during_working_hours(now):
    """Checks if the given time is within West Coast working hours.

    Working hours are 9 AM (inclusive) to 5 PM (exclusive), Monday-Friday.
    `now` is taken as-is; no timezone conversion happens here.
    """
    is_weekday = now.weekday() in range(5)  # Monday is 0, Friday is 4.
    return is_weekday and now.hour in range(9, 17)
def get_head_url(api, console):
    """Returns a URL pointing to HEAD of the repo tracked by the Milo console."""
    repo_url = console.repo_url
    project_name = urlparse(repo_url).path.lstrip("/")
    with api.step.nest("get latest revision of %s" % project_name):
        # For simplicity, only the console's first ref is used, and it is
        # treated as an exact ref name rather than a regexp. It may still
        # carry the "regexp:" prefix (the default is
        # "regexp:refs/heads/main"), which is stripped here.
        regexp_prefix = "regexp:"
        tracked_ref = console.refs[0]
        if tracked_ref.startswith(regexp_prefix):
            tracked_ref = tracked_ref[len(regexp_prefix) :]
        commits = api.gitiles.log(repo_url, tracked_ref, limit=1, step_name="log")
        head_revision = commits[0]["id"]
    return "%s/+/%s" % (repo_url, head_revision)
def GenTests(api):
    """Yields the simulation test cases for this recipe."""

    def pseudo_random_build_id(builder_string):
        """Returns a deterministic integer build ID derived from a string."""
        # The SHA-256 digest is reduced with a float modulo so that the
        # resulting ID is below 1e8.
        return int(int(hashlib.sha256(builder_string.encode()).hexdigest(), 16) % 1e8)

    def test(
        name,
        builder_history=None,
        should_check_tree=True,
        should_check_console=True,
        should_close_tree=False,
        working_hour_rules=None,
        off_hour_rules=None,
        summary_regexp_ignore=(),
        tree_status="open",
        tree_status_age=DEFAULT_GRACE_PERIOD + datetime.timedelta(minutes=1),
        disable_on_dates=(),
    ):
        """Create a test case for running this recipe.

        Args:
            name (str): Name of the test case.
            builder_history (dict): Mapping from builder base name to a list
                of Build protos (see `success()` and `failure()` below),
                corresponding to the builder's most recent builds in reverse
                chronological order (latest builds first).
            should_check_tree (bool): Whether the recipe is expected to
                check the current tree status. If False, no tree status is
                mocked and no step expectations are added.
            should_check_console (bool): Whether the recipe is expected to
                query luci-config and Buildbucket for builders and build
                results.
            should_close_tree (bool): Whether the recipe is expected to file
                a bug and close the tree. Also selects the expected overall
                status ("failure" if True, otherwise "success").
            working_hour_rules (seq of Rule): Passed as a recipe property.
                Defaults to a single rule with concurrent_failures=2 and
                consecutive_failures=2.
            off_hour_rules (seq of Rule): Passed as a recipe property.
            summary_regexp_ignore (seq of str): Passed as a recipe property.
            tree_status (str): Mock current status for the tree.
            tree_status_age (datetime.timedelta): Mocked duration since the
                last tree status update.
            disable_on_dates (seq of str): Passed as a recipe property.
        """
        if not builder_history:
            builder_history = {}
        if working_hour_rules is None:
            working_hour_rules = [
                dict(concurrent_failures=2, consecutive_failures=2),
            ]
        project = "fuchsia"
        bucket = "global.ci"
        console_name = "global_ci"
        res = (
            api.status_check.test(
                name, status="failure" if should_close_tree else "success"
            )
            + api.buildbucket.try_build(project=project)
            + api.properties(
                console_name=console_name,
                working_hour_rules=working_hour_rules,
                off_hour_rules=off_hour_rules,
                summary_regexp_ignore=list(summary_regexp_ignore),
                tree_status_host="example.com",
                tree_status_password="pa$$word",
                bug_components=["tree-closure"],
                bug_labels=["tree-closure"],
                disable_on_dates=list(disable_on_dates),
            )
        )
        # The recipe is expected to exit before looking at the tree status,
        # so nothing else needs to be mocked.
        if not should_check_tree:
            return res

        now = 1592654400
        res += api.time.seed(now)
        status_date = datetime.datetime.utcfromtimestamp(now) - tree_status_age
        res += api.step_data(
            "get current tree status",
            stdout=api.json.output(
                {
                    "general_state": tree_status,
                    "key": 12345,
                    "date": status_date.strftime(TREE_STATUS_DATE_FORMAT),
                }
            ),
        )
        # Filing the bug and closing the tree must both happen, or neither.
        checker = post_process.MustRun if should_close_tree else post_process.DoesNotRun
        res += api.post_process(checker, "create monorail bug")
        res += api.post_process(checker, "close tree")
        if should_check_console:
            res += api.post_process(post_process.MustRun, "check console health")
        else:
            res += api.post_process(post_process.DoesNotRun, "check console health")
            return res

        # Mock the Milo console config with one entry per builder.
        console_builders = []
        for builder in builder_history:
            console_builders.append(
                milo_pb2.Builder(
                    name="buildbucket/luci.%s.%s/%s" % (project, bucket, builder)
                )
            )
        milo_cfg = milo_pb2.Project(
            consoles=[
                milo_pb2.Console(
                    id=console_name,
                    name="global integration ci",
                    repo_url="https://fuchsia.googlesource.com/integration",
                    refs=["regexp:refs/heads/main"],
                    builders=console_builders,
                )
            ]
        )
        res += api.luci_config.mock_config(project, "luci-milo.cfg", milo_cfg)
        # History is a list of builder statuses, most recent first.
        for builder, history in builder_history.items():
            search_results = []
            for i, build in enumerate(history):
                bp = build.builder
                bp.builder, bp.bucket, bp.project = builder, bucket, project
                build.id = pseudo_random_build_id(builder + str(i))
                # Each build starts one day before the previous (newer) one.
                build.start_time.seconds = int(
                    now - datetime.timedelta(days=i).total_seconds()
                )
                search_results.append(build)
            res += api.buildbucket.simulated_search_results(
                search_results,
                step_name="check console health.%s" % builder,
            )
        if should_close_tree:
            res += api.gitiles.log("get latest revision of integration.log", s="a", n=1)
        return res

    def _build(build_status, steps=None, summary_markdown="", should_ignore=False):
        """Returns a Build proto for mocking buildbucket.search requests.

        Args:
            build_status (str): A string corresponding to a common_pb2.Status
                value.
            steps (dict): A mapping from step names (with nestings indicated
                by | pipes) to status strings, also corresponding to
                common_pb2.Status values.
            summary_markdown (str): The summary to set for the build.
            should_ignore (bool): If True, set `IGNORE_BUILDER_PROPERTY` to
                True in the build's input properties.
        """
        steps = steps or {}
        b = api.buildbucket.ci_build_message(status=build_status)
        b.summary_markdown = summary_markdown
        for step_name, status in steps.items():
            step = b.steps.add()
            step.name = step_name
            step.status = common_pb2.Status.Value(status)
        if should_ignore:
            b.input.properties.update({IGNORE_BUILDER_PROPERTY: True})
        return b

    def success(**kwargs):
        """Returns a Build proto with a successful status."""
        return _build("SUCCESS", **kwargs)

    def failure(summary_markdown="5 tests failed", **kwargs):
        """Returns a Build proto with a failure status."""
        return _build("FAILURE", summary_markdown=summary_markdown, **kwargs)

    # As long as the most recent build for each builder is green, we shouldn't
    # close the tree.
    yield test(
        name="all_green",
        builder_history={
            "core.arm64-asan": [success(), failure()],
            "core.x64-asan": [success(), failure()],
        },
        disable_on_dates=["2019-05-06"],  # Non-current dates should be ignored.
    )

    # If any builder's most recent `consecutive_failures` builds have all failed,
    # close the tree.
    yield test(
        name="consecutive_failures",
        builder_history={
            "core.arm64-asan": [failure(steps={"foo": "FAILURE"}), success()],
            "core.x64-asan": [
                failure(steps={"foo": "FAILURE"}),
                failure(steps={"bar": "SUCCESS", "foo": "FAILURE"}),
            ],
        },
        working_hour_rules=[
            dict(
                consecutive_failures=2,
                failed_step_regexp="foo",
                builders_to_check=["core.x64-asan"],
            )
        ],
        should_close_tree=True,
    )

    # Even if there are consecutive failures, ignore them if we're only checking
    # the rule for a subset of builders, and the failing builder is not in that
    # subset.
    yield test(
        name="ignored_consecutive_failures",
        builder_history={
            "core.arm64-asan": [failure(steps={"foo": "FAILURE"}), success()],
            "core.x64-asan": [
                failure(steps={"foo": "FAILURE"}),
                failure(steps={"bar": "SUCCESS", "foo": "FAILURE"}),
            ],
        },
        working_hour_rules=[
            dict(
                consecutive_failures=2,
                failed_step_regexp="foo",
                builders_to_check=["core.arm64-asan"],
            )
        ],
        should_close_tree=False,
    )

    # A build whose summary matches `summary_regexp_ignore` is not counted.
    yield test(
        name="summary_regexp_ignore",
        builder_history={
            "core.x64-asan": [
                failure(
                    summary_markdown="step failed: checkout.jiri update",
                    steps={"foo": "FAILURE"},
                ),
            ],
        },
        working_hour_rules=[dict(consecutive_failures=1)],
        summary_regexp_ignore=["checkout"],
        should_close_tree=False,
    )

    # Even if a builder has failed more than `consecutive_failures` times in a
    # row, don't close the tree unless all the failed builds have a failed step
    # matching `failed_step_regexp`.
    yield test(
        name="consecutive_failures_no_matched_step",
        builder_history={
            "core.x64-asan": [
                failure(steps={"foo": "FAILURE"}),
                failure(steps={"bar": "FAILURE"}),
            ]
        },
        working_hour_rules=[dict(consecutive_failures=2, failed_step_regexp="foo")],
        should_close_tree=False,
    )

    # If `concurrent_failures` or more builders' most recent builds have all
    # failed, close the tree.
    yield test(
        name="concurrent_failures",
        builder_history={
            "bringup.arm64-asan": [success()],
            "core.arm64-asan": [failure(steps={"foo": "FAILURE"})],
            "core.x64-asan": [failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"})],
        },
        working_hour_rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=True,
    )

    # Even if `concurrent_failures` or more builders' most recent builds have
    # all failed, don't close the tree if they don't all have a failed step
    # matching `failed_step_regexp`.
    yield test(
        name="concurrent_failures_no_matched_step",
        builder_history={
            "bringup.arm64-asan": [failure()],
            "core.arm64-asan": [failure(steps={"bar": "FAILURE"})],
            # Only this builder has a failed step matching `failed_step_regexp`.
            "core.x64-asan": [failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"})],
        },
        working_hour_rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=False,
    )

    # The tree status shouldn't be changed if it was only recently opened.
    yield test(
        name="tree_recently_opened",
        tree_status_age=DEFAULT_GRACE_PERIOD - datetime.timedelta(minutes=1),
        should_check_console=False,
    )

    # The build should exit early if the tree is already closed.
    yield test(
        name="tree_already_closed", tree_status="closed", should_check_console=False
    )

    # Builders with no builds shouldn't affect the tree status.
    yield test(
        name="throttled__no_history",
        # If the tree is "throttled" then it's open, so the closer should still
        # check the console for failures.
        tree_status="throttled",
        builder_history={"core.arm64-asan": []},
        should_close_tree=False,
    )

    # Builders with fewer than consecutive_failures builds shouldn't affect the
    # tree status.
    yield test(
        name="not_enough_history",
        builder_history={"core.arm64-asan": [failure()]},
        should_close_tree=False,
    )

    # If a builder's latest build has the `IGNORE_BUILDER_PROPERTY` set, then
    # the entire builder should be ignored.
    yield test(
        name="ignored_builder",
        builder_history={"core.x64-asan": [failure(should_ignore=True), failure()]},
        working_hour_rules=[dict(concurrent_failures=1)],
        should_close_tree=False,
    )

    weekday_nighttime = "2021-12-03T23:53"  # Friday at 11:53 PM
    weekend_daytime = "2021-12-04T11:01"  # Saturday at 11:01 AM

    # Tree closer should exit early if the time is outside West Coast working
    # hours and there are no off-hour rules configured.
    yield test("weekend", should_check_tree=False) + api.step_data(
        "get current time", api.raw_io.stream_output_text(weekend_daytime)
    )
    yield test("nighttime", should_check_tree=False) + api.step_data(
        "get current time", api.raw_io.stream_output_text(weekday_nighttime)
    )

    # If `concurrent_failures` or more builders' most recent builds have all
    # failed, close the tree during off-hours if `off_hour_rules` have been set.
    yield test(
        name="nighttime_concurrent_failures",
        builder_history={
            "bringup.arm64-asan": [success()],
            "core.arm64-asan": [failure(steps={"foo": "FAILURE"})],
            "core.x64-asan": [failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"})],
        },
        working_hour_rules=[],
        off_hour_rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=True,
    ) + api.step_data(
        "get current time", api.raw_io.stream_output_text(weekday_nighttime)
    )

    # The recipe should exit early when today's date is listed in
    # `disable_on_dates`.
    yield test(
        "disable_dates", should_check_tree=False, disable_on_dates=["2021-12-01"]
    ) + api.step_data(
        "get current time", api.raw_io.stream_output_text("2021-12-01T12:00")
    )