# blob: 8390280343629cc73df5a22de91ab0183d655b3a [file] [log] [blame]
# Copyright 2020 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Recipe for closing the tree when builders in a specified Milo console fail."""
import collections
import datetime
import hashlib
import re
import attr
from recipe_engine import post_process
from PB.go.chromium.org.luci.buildbucket.proto import build as build_pb2
from PB.go.chromium.org.luci.buildbucket.proto import builder as builder_pb2
from PB.go.chromium.org.luci.buildbucket.proto import (
builds_service as builds_service_pb2,
)
from PB.go.chromium.org.luci.buildbucket.proto import common as common_pb2
from PB.go.chromium.org.luci.milo.api.config import project as milo_pb2
from PB.recipe_engine.result import RawResult
from PB.recipes.fuchsia.tree_closer import InputProperties
# Recipe modules used by this recipe; each is accessed as an attribute of
# `api` (e.g. `api.luci_config`, `api.cipd`).
DEPS = [
    "fuchsia/luci_config",
    "fuchsia/status_check",
    "recipe_engine/buildbucket",
    "recipe_engine/cipd",
    "recipe_engine/context",
    "recipe_engine/json",
    "recipe_engine/properties",
    "recipe_engine/python",
    "recipe_engine/step",
    "recipe_engine/time",
]
# Proto message type describing this recipe's input properties.
PROPERTIES = InputProperties
# Fallback used by RunSteps when `grace_period_seconds` is zero/unset: the tree
# is only closed if its current status is older than the grace period.
DEFAULT_GRACE_PERIOD = datetime.timedelta(hours=2)
# The format used for the "date" field in HTTP responses from requests to the
# tree status page.
TREE_STATUS_DATE_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
# If a builder's latest build has this property set to true, the builder will
# not be considered when deciding whether to close the tree.
IGNORE_BUILDER_PROPERTY = "_tree_closer_ignore"
def RunSteps(api, props):
    """Close the tree and file a tracking bug if the console is unhealthy.

    Returns a successful result without touching the tree when no rules are
    configured, when the tree cannot be closed right now (see
    `_can_close_tree`), or when no closure rule matches. Otherwise files a
    bug, closes the tree with a message linking to it, and returns a FAILURE
    result whose summary is the bug title.

    Args:
        api: The recipe API object.
        props (InputProperties): This recipe's input properties.

    Returns:
        A RawResult describing the outcome.
    """

    def success(summary):
        return RawResult(summary_markdown=summary, status=common_pb2.SUCCESS)

    # A zero-length timedelta is falsy, so an unset `grace_period_seconds`
    # falls through to the default.
    configured_grace = datetime.timedelta(seconds=props.grace_period_seconds)
    grace_period = configured_grace or DEFAULT_GRACE_PERIOD

    if not props.rules:  # pragma: no cover
        return success("no rules to check")

    with api.context(infra_steps=True):
        tree_closable = _can_close_tree(api, props.tree_status_host, grace_period)
        if not tree_closable:
            # The tree is already closed, or its status changed recently, so
            # it must not be set to closed again. Skip checking the builders
            # altogether and exit early.
            return success("tree is already closed or recently opened")

        console_name = props.console_name
        console = api.luci_config.get_milo_console(
            console_name, project=props.project
        )
        assert console, "no console with name %r" % console_name

        with api.step.nest("check console health"):
            ignore_patterns = [
                re.compile(pattern) for pattern in props.summary_regexp_ignore
            ]
            closure_reason = _get_tree_closure_reason(
                api, console, props.rules, ignore_patterns
            )

        if not closure_reason:
            # Builders are green enough; leave the tree open.
            return success("console is healthy")

        # Surface the bug title as a failed step so that anyone reading the
        # build results page can see why the tree is being closed.
        bug_title = closure_reason.bug_title()
        reason_step = api.step(bug_title, None)
        reason_step.presentation.status = api.step.FAILURE

        # Proto strings are unicode and Python 2 recipe placeholders don't
        # accept unicode, hence the explicit str() conversions.
        cipd_package = str(props.monorail_cipd_package)
        cipd_version = str(props.monorail_cipd_version) or "latest"
        monorail_path = api.cipd.ensure_tool(cipd_package, cipd_version)

        bug_link = _file_bug(
            api, monorail_path, closure_reason, props.bug_components, props.bug_labels
        )
        _close_tree(
            api,
            props.tree_status_host,
            props.tree_status_password,
            "Tree is closed: %s" % bug_link,
        )
        return RawResult(summary_markdown=bug_title, status=common_pb2.FAILURE)
class TreeClosureReason(object):  # pragma: no cover
    """Base interface for an explanation of why the tree should be closed.

    Subclasses supply the title and description of the bug filed for the
    closure; both methods here raise NotImplementedError.
    """

    def bug_title(self):
        """Return the bug's title. Must be overridden."""
        raise NotImplementedError()

    def bug_description(self):
        """Return the bug's description. Must be overridden."""
        raise NotImplementedError()
@attr.s
class ReasonConsecutiveFailures(TreeClosureReason):
    """Closure reason: a single builder failed several times in a row.

    Constructed with the keyword arguments `builder` and `failed_builds`.
    """

    # Name of the failing builder.
    _builder = attr.ib(type=str)
    # The failed builds, listed in the bug in this order.
    _failed_builds = attr.ib(type=[build_pb2.Build])

    def bug_title(self):
        """Return a one-line summary naming the builder and failure count."""
        failure_count = len(self._failed_builds)
        return "%s failed %d times in a row" % (self._builder, failure_count)

    def bug_description(self):
        """Return a description listing a link (and summary) per failed build."""
        intro = "The tree was closed because %s failed %d times in a row." % (
            self._builder,
            len(self._failed_builds),
        )
        lines = [intro, "", "Failed builds:", ""]
        for failed_build in self._failed_builds:
            lines.append("- %s" % _build_link(failed_build))
            # Only builds that carry a summary get a quoted summary block.
            if failed_build.summary_markdown:
                lines += _formatted_summary_lines(failed_build)
        return "\n".join(lines)
@attr.s
class ReasonConcurrentFailures(TreeClosureReason):
    """Closure reason: several builders are failing at the same time."""

    # Mapping from builder name to that builder's latest (failed) build.
    _failed_builds_by_builder = attr.ib(type={str: build_pb2.Build})

    def bug_title(self):
        """Return a one-line summary giving the number of failing builders."""
        builder_count = len(self._failed_builds_by_builder)
        return "%d builders are failing" % builder_count

    def bug_description(self):
        """Return a description listing each failing builder's latest build."""
        intro = "The tree was closed because %d builders are failing." % len(
            self._failed_builds_by_builder
        )
        lines = [intro, "", "Failing builders:", ""]
        for name, failed_build in self._failed_builds_by_builder.items():
            lines.append("- %s: %s" % (name, _build_link(failed_build)))
            # Only builds that carry a summary get a quoted summary block.
            if failed_build.summary_markdown:
                lines += _formatted_summary_lines(failed_build)
        return "\n".join(lines)
def _build_link(build):
return "https://ci.chromium.org/b/%d" % build.id
def _formatted_summary_lines(build):
lines = [""]
lines.extend(" > " + line for line in build.summary_markdown.split("\n"))
lines.append("")
return lines
def _get_tree_closure_reason(api, console, rules, summary_regexp_ignore):
    """Determines if and why the tree should be closed.

    Consecutive-failure rules are evaluated per builder while iterating over
    the console, and the first match is returned immediately. Concurrent-
    failure rules are evaluated afterwards against the latest build of every
    builder found to be failing.

    Args:
        api: The recipe API object, used to query Buildbucket.
        console: The Milo console config whose `builders` are checked.
        rules (seq of Rule proto): The conditions under which the tree should
            be closed.
        summary_regexp_ignore (seq of re.Pattern): Any build whose summary
            matches one of these regexes will not be considered for the
            purpose of closing the tree.

    Returns:
        A TreeClosureReason if the tree should be closed, otherwise None.
    """
    # Builder base name -> latest (failed) build, in console order.
    failing_builders = collections.OrderedDict()
    concurrent_failures_rules = [r for r in rules if r.concurrent_failures > 0]
    consecutive_failures_rules = [r for r in rules if r.consecutive_failures > 0]

    # Search back in history by the maximum number of builds required by any
    # rule. If no rule specifies a `consecutive_failures` value, then we only
    # need to check the most recent build of each builder for each concurrent
    # failure rule.
    if consecutive_failures_rules:
        count_to_check = max(r.consecutive_failures for r in consecutive_failures_rules)
    else:
        count_to_check = 1
    # Fetch more builds than we actually need in case some match one of
    # `summary_regexp_ignore`.
    count_to_fetch = max(5, count_to_check * 2)

    # TODO(olivernewman): Combine the search requests into a single batch RPC
    # request rather than doing them serially. This will likely be
    # significantly more efficient.
    for builder in console.builders:
        builds = _last_n_builds(api, builder, count_to_fetch)
        # Rules only apply to builds that failed, so no need to even iterate
        # over the rules if all of its builds passed. This also skips
        # builders with no builds at all, since all() of nothing is True.
        if all(build.status == common_pb2.SUCCESS for build in builds):
            continue

        # The buildbucket API is supposed to return builds newest-first, but
        # we'll sort again here just to be safe.
        builds.sort(key=lambda b: b.start_time.seconds, reverse=True)

        # `properties` is a protobuf Struct, which doesn't support `get()`. So
        # convert to a normal Python dict first.
        props = dict(builds[0].input.properties.items())
        ignore_builder = props.get(IGNORE_BUILDER_PROPERTY, False)
        if ignore_builder is True:
            continue

        builds = [
            b
            for b in builds
            if not any(r.search(b.summary_markdown) for r in summary_regexp_ignore)
        ][:count_to_check]
        if not builds:
            # All of the builder's recent builds match one of
            # summary_regexp_ignore.
            continue

        latest_build = builds[0]
        # Likewise, if a builder's most recent build passed, then there's no
        # need to close the tree for this builder.
        if latest_build.status == common_pb2.SUCCESS:
            continue

        # builder.name is a "/"-separated path; the last component is the
        # builder's base name.
        builder_name = builder.name.split("/")[-1]
        failing_builders[builder_name] = latest_build

        for rule in consecutive_failures_rules:
            if len(builds) < rule.consecutive_failures:
                # This builder doesn't yet have enough history to determine
                # whether it should be closed.
                continue
            builds_to_check = builds[: rule.consecutive_failures]
            # The rule matches only if every one of the builds failed AND has
            # a failed step matching the rule's `failed_step_regexp`.
            rule_matched = all(
                build.status != common_pb2.SUCCESS
                and _find_failed_step_match(build, rule.failed_step_regexp)
                for build in builds_to_check
            )
            if rule_matched:
                return ReasonConsecutiveFailures(
                    builder=builder_name, failed_builds=builds_to_check,
                )

    for rule in concurrent_failures_rules:
        matched_builders = {}
        # We only consider the most recent build of each builder when checking
        # for concurrently failing builders.
        for builder, latest_build in failing_builders.items():
            failed_step = _find_failed_step_match(latest_build, rule.failed_step_regexp)
            if failed_step:
                matched_builders[builder] = latest_build
        if len(matched_builders) >= rule.concurrent_failures:
            return ReasonConcurrentFailures(matched_builders)

    return None
def _find_failed_step_match(build, step_regex):
    """Return the first non-successful step of `build` whose name matches.

    Args:
        build: A build whose `steps` each have a `status` and a `name`.
        step_regex (str): Pattern searched for (via `re.search`) in each
            step's name.

    Returns:
        The first step whose status is not SUCCESS and whose name matches
        `step_regex`, or None if there is no such step.
    """
    failed_matches = (
        candidate
        for candidate in build.steps
        if candidate.status != common_pb2.SUCCESS
        and re.search(step_regex, candidate.name)
    )
    return next(failed_matches, None)
def _last_n_builds(api, console_builder, n):
    """Search Buildbucket for completed builds of a console builder.

    Args:
        api: The recipe API object.
        console_builder: A console builder entry whose `name` has the form
            "buildbucket/luci.{project}.{bucket}/{builder}".
        n (int): Passed to the search as its result limit.

    Returns:
        The builds returned by `api.buildbucket.search`, including step
        names, step statuses and the summary markdown.
    """
    # Pull the project, bucket and builder out of the console builder name;
    # the leading component of each split is discarded.
    name_parts = console_builder.name.split("/")
    _, full_bucket, builder_name = name_parts
    _, project, bucket = full_bucket.split(".", 2)

    builder_id = builder_pb2.BuilderID(
        builder=builder_name, bucket=bucket, project=project,
    )
    # ENDED_MASK restricts the search to completed builds.
    predicate = builds_service_pb2.BuildPredicate(
        builder=builder_id, status=common_pb2.ENDED_MASK,
    )
    extra_fields = {"steps.*.name", "steps.*.status", "summary_markdown"}
    return api.buildbucket.search(
        predicate,
        limit=n,
        step_name=builder_name,
        fields=api.buildbucket.DEFAULT_FIELDS.union(extra_fields),
    )
def _file_bug(api, monorail_path, closure_reason, bug_components, bug_labels):
    """File the tracking bug for a tree closure and return a link to it.

    Every tree closure needs a tracking bug linked from the tree status.

    Args:
        api: The recipe API object.
        monorail_path: Path to the monorail tool executable.
        closure_reason (TreeClosureReason): Supplies the bug's title and
            description.
        bug_components (seq of str): Each is passed as a `-component` flag.
        bug_labels (seq of str): Each is passed as a `-label` flag.

    Returns:
        The "https://fxbug.dev/..." link built from the `id` field of the
        tool's JSON output.
    """
    cmd = [monorail_path, "new-issue"]
    cmd += ["-summary", closure_reason.bug_title()]
    cmd += ["-description", closure_reason.bug_description()]
    for component in bug_components:
        cmd += ["-component", component]
    for label in bug_labels:
        cmd += ["-label", label]

    bug_step = api.step(
        "create monorail bug",
        cmd,
        stdout=api.json.output(),
        step_test_data=lambda: api.json.test_api.output_stream({"id": 605}),
    )
    bug_link = "https://fxbug.dev/%d" % bug_step.stdout["id"]
    bug_step.presentation.links["monorail link"] = bug_link
    return bug_link
def _close_tree(api, tree_hostname, password, message):
    """Run the tree_status.py resource script to set the tree status.

    Args:
        api: The recipe API object.
        tree_hostname (str): Host of the tree status page.
        password (str): Password passed to the script; the username is the
            name of the currently running builder.
        message (str): The new status message to set.
    """
    script = api.resource("tree_status.py")
    # NOTE(review): the password is passed as a command-line argument, so it
    # may be visible wherever the step's command line is shown - confirm this
    # is acceptable.
    script_args = [
        tree_hostname,
        "set",
        message,
        "--username",
        api.buildbucket.builder_name,
        "--password",
        password,
    ]
    api.python("close tree", script, script_args)
def _can_close_tree(api, tree_hostname, grace_period):
    """Return whether the tree is currently eligible to be closed.

    Fetches the current tree status with the tree_status.py resource script.

    Args:
        api: The recipe API object.
        tree_hostname (str): Host of the tree status page.
        grace_period (datetime.timedelta): How long the current status must
            have been in place before the tree may be closed.

    Returns:
        False if the tree's state is anything other than "open" or
        "throttled"; otherwise True only if the status is older than
        `grace_period`.
    """
    status_step = api.python(
        "get current tree status",
        api.resource("tree_status.py"),
        [tree_hostname, "get"],
        stdout=api.json.output(),
        step_test_data=lambda: api.json.test_api.output_stream({}),
    )
    tree_status = status_step.stdout
    general_state = tree_status["general_state"]

    presentation = status_step.presentation
    presentation.step_text = general_state
    presentation.links[tree_hostname] = "https://%s" % tree_hostname

    # A tree that is already closed doesn't need closing again.
    if general_state not in ("open", "throttled"):
        return False

    last_change = datetime.datetime.strptime(
        tree_status["date"], TREE_STATUS_DATE_FORMAT
    )
    return api.time.utcnow() - last_change > grace_period
def GenTests(api):
    """Yield the simulation test cases for this recipe.

    Args:
        api: The recipe test API object.
    """

    def pseudo_random_build_id(builder_string):
        # Derive a deterministic build ID from a hash of the given string.
        # NOTE(review): `hashlib.sha256` needs bytes on Python 3; this call
        # appears to rely on Python 2 `str` - confirm before migrating.
        return int(int(hashlib.sha256(builder_string).hexdigest(), 16) % 1e8)

    def test(
        name,
        builder_history=None,
        should_check_console=True,
        should_close_tree=False,
        rules=None,
        summary_regexp_ignore=(),
        tree_status="open",
        tree_status_age=DEFAULT_GRACE_PERIOD + datetime.timedelta(minutes=1),
    ):
        """Create a test case for running this recipe.

        Args:
            name (str): Name of the test case.
            builder_history (dict): Mapping from builder base name to a list
                of Build protos (see `success()` and `failure()`),
                corresponding to the builder's most recent builds in reverse
                chronological order (latest builds first).
            should_check_console (bool): Whether the recipe is expected to
                query luci-config and Buildbucket for builders and build
                results.
            should_close_tree (bool): Whether the recipe is expected to file
                a bug and close the tree (and so finish with a failure
                status).
            rules (seq of Rule): Passed as a recipe property. Defaults to a
                single rule with `concurrent_failures=2` and
                `consecutive_failures=2`.
            summary_regexp_ignore (seq of str): Passed as a recipe property.
            tree_status (str): Mock current status for the tree.
            tree_status_age (datetime.timedelta): Mocked duration since the
                last tree status update.
        """
        if not builder_history:
            builder_history = {}
        if rules is None:
            rules = [
                dict(concurrent_failures=2, consecutive_failures=2),
            ]
        project = "fuchsia"
        bucket = "global.ci"
        console_name = "global_ci"
        res = (
            api.status_check.test(
                name, status="failure" if should_close_tree else "success"
            )
            + api.buildbucket.try_build(project=project)
            + api.properties(
                console_name=console_name,
                rules=rules,
                summary_regexp_ignore=list(summary_regexp_ignore),
                tree_status_host="example.com",
                tree_status_password="pa$$word",
                monorail_cipd_package="fuchsia/infra/monorail/${platform}",
                bug_components=["tree-closure"],
                bug_labels=["tree-closure"],
            )
        )
        # Fix the recipe's clock so the mocked status date below has a known
        # age relative to "now".
        now = 1592654400
        res += api.time.seed(now)
        status_date = datetime.datetime.utcfromtimestamp(now) - tree_status_age
        res += api.step_data(
            "get current tree status",
            stdout=api.json.output(
                {
                    "general_state": tree_status,
                    "key": 12345,
                    "date": status_date.strftime(TREE_STATUS_DATE_FORMAT),
                }
            ),
        )
        # The bug-filing and tree-closing steps must either both run or both
        # be absent, depending on the expected outcome.
        checker = post_process.MustRun if should_close_tree else post_process.DoesNotRun
        res += api.post_process(checker, "create monorail bug")
        res += api.post_process(checker, "close tree")
        if should_check_console:
            res += api.post_process(post_process.MustRun, "check console health")
        else:
            res += api.post_process(post_process.DoesNotRun, "check console health")
            # The console is never queried in this case, so no Milo config or
            # Buildbucket search results need to be mocked.
            return res
        console_builders = []
        for builder in builder_history:
            console_builders.append(
                milo_pb2.Builder(
                    name="buildbucket/luci.%s.%s/%s" % (project, bucket, builder)
                )
            )
        milo_cfg = milo_pb2.Project(
            consoles=[
                milo_pb2.Console(
                    id=console_name,
                    name="global integration ci",
                    repo_url="https://fuchsia.googlesource.com/integration",
                    refs=["regexp:refs/heads/master"],
                    builders=console_builders,
                )
            ]
        )
        res += api.luci_config.mock_config("luci-milo.cfg", milo_cfg)
        # History is a list of builder statuses, most recent first.
        for builder, history in builder_history.items():
            search_results = []
            for i, build in enumerate(history):
                bp = build.builder
                bp.builder, bp.bucket, bp.project = builder, bucket, project
                build.id = pseudo_random_build_id(builder + str(i))
                # Each successive build in the history starts one day earlier
                # than the previous one.
                build.start_time.seconds = int(
                    now - datetime.timedelta(days=i).total_seconds()
                )
                search_results.append(build)
            res += api.buildbucket.simulated_search_results(
                search_results, step_name="check console health.%s" % builder,
            )
        return res

    def _build(build_status, steps=None, summary_markdown="", should_ignore=False):
        """Returns a Build proto for mocking buildbucket.search requests.

        Args:
            build_status (str): A string corresponding to a common_pb2.Status
                value.
            steps (dict): A mapping from step names (with nestings indicated
                by | pipes) to status strings, also corresponding to
                common_pb2.Status values.
            summary_markdown (str): The summary to set for the build.
            should_ignore (bool): If true, set `IGNORE_BUILDER_PROPERTY` to
                True in the build's input properties.
        """
        steps = steps or {}
        b = api.buildbucket.ci_build_message(status=build_status)
        b.summary_markdown = summary_markdown
        for step_name, status in steps.items():
            step = b.steps.add()
            step.name = step_name
            step.status = common_pb2.Status.Value(status)
        if should_ignore:
            b.input.properties.update({IGNORE_BUILDER_PROPERTY: True})
        return b

    def success(**kwargs):
        """Returns a Build proto with a successful status."""
        return _build("SUCCESS", **kwargs)

    def failure(summary_markdown="5 tests failed", **kwargs):
        """Returns a Build proto with a failure status."""
        return _build("FAILURE", summary_markdown=summary_markdown, **kwargs)

    # As long as the most recent build for each builder is green, we shouldn't
    # close the tree.
    yield test(
        name="all_green",
        builder_history=collections.OrderedDict(
            [
                ("core.arm64-asan", [success(), failure()]),
                ("core.x64-asan", [success(), failure()]),
            ],
        ),
    )
    # If any builder's most recent `consecutive_failures` builds have all failed,
    # close the tree.
    yield test(
        name="consecutive_failures",
        builder_history=collections.OrderedDict(
            [
                ("core.arm64-asan", [failure(steps={"foo": "FAILURE"}), success()]),
                (
                    "core.x64-asan",
                    [
                        failure(steps={"foo": "FAILURE"}),
                        failure(steps={"bar": "SUCCESS", "foo": "FAILURE"}),
                    ],
                ),
            ],
        ),
        rules=[dict(consecutive_failures=2, failed_step_regexp="foo")],
        should_close_tree=True,
    )
    # A failed build whose summary matches one of `summary_regexp_ignore` is
    # not counted, so it shouldn't cause the tree to be closed.
    yield test(
        name="summary_regexp_ignore",
        builder_history=collections.OrderedDict(
            [
                (
                    "core.x64-asan",
                    [
                        failure(
                            summary_markdown="step failed: checkout.jiri update",
                            steps={"foo": "FAILURE"},
                        ),
                    ],
                ),
            ],
        ),
        rules=[dict(consecutive_failures=1)],
        summary_regexp_ignore=["checkout"],
        should_close_tree=False,
    )
    # Even if a builder has failed more than `consecutive_failures` times in a
    # row, don't close the tree unless all the failed builds have a failed step
    # matching `failed_step_regexp`.
    yield test(
        name="consecutive_failures_no_matched_step",
        builder_history=collections.OrderedDict(
            [
                (
                    "core.x64-asan",
                    [
                        failure(steps={"foo": "FAILURE"}),
                        failure(steps={"bar": "FAILURE"}),
                    ],
                ),
            ],
        ),
        rules=[dict(consecutive_failures=2, failed_step_regexp="foo")],
        should_close_tree=False,
    )
    # If `concurrent_failures` or more builders' most recent builds have all
    # failed, close the tree.
    yield test(
        name="concurrent_failures",
        builder_history=collections.OrderedDict(
            {
                "bringup.arm64-asan": [success()],
                "core.arm64-asan": [failure(steps={"foo": "FAILURE"})],
                "core.x64-asan": [
                    failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"})
                ],
            },
        ),
        rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=True,
    )
    # Even if `concurrent_failures` or more builders' most recent builds have
    # all failed, don't close the tree if they don't all have a failed step
    # matching `failed_step_regexp`.
    yield test(
        name="concurrent_failures_no_matched_step",
        builder_history=collections.OrderedDict(
            {
                "bringup.arm64-asan": [failure()],
                "core.arm64-asan": [failure(steps={"bar": "FAILURE"})],
                # Only this builder has a failed step matching
                # `failed_step_regexp`.
                "core.x64-asan": [
                    failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"}),
                ],
            },
        ),
        rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=False,
    )
    # The tree status shouldn't be changed if it was only recently opened.
    yield test(
        name="tree_recently_opened",
        tree_status_age=DEFAULT_GRACE_PERIOD - datetime.timedelta(minutes=1),
        should_check_console=False,
    )
    # The build should exit early if the tree is already closed.
    yield test(
        name="tree_already_closed", tree_status="closed", should_check_console=False
    )
    # Builders with no builds shouldn't affect the tree status.
    yield test(
        name="throttled__no_history",
        # If the tree is "throttled" then it's open, so the closer should still
        # check the console for failures.
        tree_status="throttled",
        builder_history={"core.arm64-asan": []},
        should_close_tree=False,
    )
    # Builders with fewer than consecutive_failures builds shouldn't affect the
    # tree status.
    yield test(
        name="not_enough_history",
        builder_history={"core.arm64-asan": [failure()]},
        should_close_tree=False,
    )
    # If a builder's latest build has the `IGNORE_BUILDER_PROPERTY` set, then
    # the entire builder should be ignored.
    yield test(
        name="ignored_builder",
        builder_history={"core.x64-asan": [failure(should_ignore=True), failure()]},
        rules=[dict(concurrent_failures=1)],
        should_close_tree=False,
    )