| # Copyright 2020 The Fuchsia Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Recipe for closing the tree when builders in a specified Milo console fail.""" |
| |
| import collections |
| import datetime |
| import hashlib |
| import re |
| |
| import attr |
| |
| from recipe_engine import post_process |
| |
| from PB.go.chromium.org.luci.buildbucket.proto import build as build_pb2 |
| from PB.go.chromium.org.luci.buildbucket.proto import builder as builder_pb2 |
| from PB.go.chromium.org.luci.buildbucket.proto import ( |
| builds_service as builds_service_pb2, |
| ) |
| from PB.go.chromium.org.luci.buildbucket.proto import common as common_pb2 |
| from PB.go.chromium.org.luci.milo.api.config import project as milo_pb2 |
| from PB.recipe_engine.result import RawResult |
| from PB.recipes.fuchsia.tree_closer import InputProperties |
| |
| |
# Recipe modules this recipe depends on. Each one is used below through the
# `api` object under its base name (e.g. "fuchsia/luci_config" is used as
# `api.luci_config`).
DEPS = [
    "fuchsia/luci_config",
    "fuchsia/status_check",
    "recipe_engine/buildbucket",
    "recipe_engine/cipd",
    "recipe_engine/context",
    "recipe_engine/json",
    "recipe_engine/properties",
    "recipe_engine/python",
    "recipe_engine/step",
    "recipe_engine/time",
]

# The recipe's input properties are described by the InputProperties proto;
# RunSteps receives an instance of it as `props`.
PROPERTIES = InputProperties

# How long the current tree status must have been in place before this recipe
# is allowed to close the tree. Used when the `grace_period_seconds` property
# is zero/unset.
DEFAULT_GRACE_PERIOD = datetime.timedelta(hours=2)

# The format used for the "date" field in HTTP responses from requests to the
# tree status page.
TREE_STATUS_DATE_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

# If a builder's latest build has this input property set to true, the builder
# will not be considered when deciding whether to close the tree.
IGNORE_BUILDER_PROPERTY = "_tree_closer_ignore"
| |
| |
def RunSteps(api, props):
    """Closes the tree and files a bug when the console's builders are unhealthy.

    Exits early with a SUCCESS result when there are no rules, when the tree
    may not be closed right now (already closed or status changed too
    recently), or when no rule matches. Otherwise files a bug, closes the tree
    with a message linking to that bug, and returns a FAILURE result whose
    summary is the bug title.
    """
    # A zero-length timedelta is falsy, so an unset property falls back to the
    # default grace period.
    grace_period = datetime.timedelta(seconds=props.grace_period_seconds)
    if not grace_period:
        grace_period = DEFAULT_GRACE_PERIOD

    if not props.rules:  # pragma: no cover
        return RawResult(
            status=common_pb2.SUCCESS, summary_markdown="no rules to check",
        )

    with api.context(infra_steps=True):
        tree_closable = _can_close_tree(api, props.tree_status_host, grace_period)
        if not tree_closable:
            # Either the tree is closed already or its status changed within
            # the grace period; in both cases we must not set it to closed, so
            # skip the builder checks entirely.
            return RawResult(
                status=common_pb2.SUCCESS,
                summary_markdown="tree is already closed or recently opened",
            )

        console = api.luci_config.get_milo_console(
            props.console_name, project=props.project
        )
        assert console, "no console with name %r" % props.console_name

        with api.step.nest("check console health"):
            ignore_patterns = [
                re.compile(pattern) for pattern in props.summary_regexp_ignore
            ]
            closure_reason = _get_tree_closure_reason(
                api, console, props.rules, ignore_patterns
            )

        if not closure_reason:
            # No rule matched, so the tree stays open.
            return RawResult(
                status=common_pb2.SUCCESS, summary_markdown="console is healthy"
            )

        title = closure_reason.bug_title()

        # Emit an empty failed step named after the bug title so that anyone
        # reading the build page can see why the tree is being closed.
        title_step = api.step(title, None)
        title_step.presentation.status = api.step.FAILURE

        # Proto strings are unicode, and Python 2 recipe placeholders don't
        # like unicode, hence the explicit str() conversions.
        package = str(props.monorail_cipd_package)
        version = str(props.monorail_cipd_version) or "latest"
        monorail_path = api.cipd.ensure_tool(package, version)

        bug_link = _file_bug(
            api, monorail_path, closure_reason, props.bug_components, props.bug_labels
        )

        _close_tree(
            api,
            props.tree_status_host,
            props.tree_status_password,
            "Tree is closed: %s" % bug_link,
        )

        return RawResult(summary_markdown=title, status=common_pb2.FAILURE)
| |
| |
class TreeClosureReason(object):  # pragma: no cover
    """Interface for a reason to close the tree.

    Subclasses provide the title and description of the bug that is filed
    when the tree gets closed.
    """

    def bug_title(self):
        """Returns the bug's summary line; subclasses must override this."""
        raise NotImplementedError

    def bug_description(self):
        """Returns the bug's description; subclasses must override this."""
        raise NotImplementedError
| |
| |
@attr.s
class ReasonConsecutiveFailures(TreeClosureReason):
    """Closure reason for one builder failing several builds in a row."""

    # Short name of the failing builder.
    _builder = attr.ib(type=str)
    # The builder's consecutive failed builds.
    _failed_builds = attr.ib(type=[build_pb2.Build])

    def bug_title(self):
        """Returns a one-line summary naming the builder and failure count."""
        failure_count = len(self._failed_builds)
        return "%s failed %d times in a row" % (self._builder, failure_count)

    def bug_description(self):
        """Returns a description that links each failed build.

        Each build's summary, when it has one, is quoted under its link.
        """
        intro = "The tree was closed because %s failed %d times in a row." % (
            self._builder,
            len(self._failed_builds),
        )
        lines = [intro, "", "Failed builds:", ""]
        for failed_build in self._failed_builds:
            lines.append("- %s" % _build_link(failed_build))
            if not failed_build.summary_markdown:
                continue
            lines.extend(_formatted_summary_lines(failed_build))

        return "\n".join(lines)
| |
| |
@attr.s
class ReasonConcurrentFailures(TreeClosureReason):
    """Closure reason for several builders failing at the same time."""

    # Maps each failing builder's name to its latest (failed) build.
    _failed_builds_by_builder = attr.ib(type={str: build_pb2.Build})

    def bug_title(self):
        """Returns a one-line summary with the number of failing builders."""
        builder_count = len(self._failed_builds_by_builder)
        return "%d builders are failing" % builder_count

    def bug_description(self):
        """Returns a description that links each builder's latest build.

        Each build's summary, when it has one, is quoted under its link.
        """
        intro = "The tree was closed because %d builders are failing." % len(
            self._failed_builds_by_builder
        )
        lines = [intro, "", "Failing builders:", ""]
        for name, build in self._failed_builds_by_builder.items():
            lines.append("- %s: %s" % (name, _build_link(build)))
            if not build.summary_markdown:
                continue
            lines.extend(_formatted_summary_lines(build))
        return "\n".join(lines)
| |
| |
def _build_link(build):
    """Returns the ci.chromium.org URL for `build`, keyed by its numeric ID."""
    url_template = "https://ci.chromium.org/b/%d"
    return url_template % build.id
| |
| |
def _formatted_summary_lines(build):
    """Returns the build's summary as quoted lines padded by empty lines.

    Every line of `build.summary_markdown` is prefixed with a quote marker,
    and one empty string is placed before and after the quoted lines.
    """
    quoted = [" > " + text for text in build.summary_markdown.split("\n")]
    return [""] + quoted + [""]
| |
| |
def _get_tree_closure_reason(api, console, rules, summary_regexp_ignore):
    """Determines if and why the tree should be closed.

    Args:
      api: The recipe API object, used to query Buildbucket for builds.
      console (Console proto): The Milo console whose builders are checked.
      rules (seq of Rule proto): The conditions under which the tree should
        be closed.
      summary_regexp_ignore (seq of re.Pattern): Any build whose summary matches
        one of these regexes will not be considered for the purpose of
        closing the tree.

    Returns:
      A TreeClosureReason if the tree should be closed, otherwise None.
      Consecutive-failure rules are evaluated per builder while iterating the
      console, so they take precedence over concurrent-failure rules, which
      are evaluated only after every builder has been examined.
    """
    # Latest build of every builder whose latest considered build did not
    # succeed, in console order.
    failing_builders = collections.OrderedDict()

    concurrent_failures_rules = [r for r in rules if r.concurrent_failures > 0]
    consecutive_failures_rules = [r for r in rules if r.consecutive_failures > 0]

    # Search back in history by the maximum number of builds required by any
    # rule. If no rule specifies a `consecutive_failures` value, then we only
    # need to check the most recent build of each builder for each concurrent
    # failure rule.
    if consecutive_failures_rules:
        count_to_check = max(r.consecutive_failures for r in consecutive_failures_rules)
    else:
        count_to_check = 1

    # Fetch more builds than we actually need in case some match one of
    # `summary_regexp_ignore`.
    count_to_fetch = max(5, count_to_check * 2)

    # TODO(olivernewman): Combine the search requests into a single batch RPC
    # request rather than doing them serially. This will likely be
    # significantly more efficient.
    for builder in console.builders:
        builds = _last_n_builds(api, builder, count_to_fetch)
        # Rules only apply to builds that failed, so no need to even iterate
        # over the rules if all of its builds passed. This also skips builders
        # with no builds at all, which makes `builds[0]` below safe.
        if all(build.status == common_pb2.SUCCESS for build in builds):
            continue

        # The buildbucket API is supposed to return builds newest-first, but
        # we'll sort again here just to be safe.
        builds.sort(key=lambda b: b.start_time.seconds, reverse=True)

        # `properties` is a protobuf Struct, which doesn't support `get()`. So
        # convert to a normal Python dict first.
        props = dict(builds[0].input.properties.items())
        if props.get(IGNORE_BUILDER_PROPERTY, False) is True:
            continue

        builds = [
            b
            for b in builds
            if not any(r.search(b.summary_markdown) for r in summary_regexp_ignore)
        ][:count_to_check]
        if not builds:
            # All of the builder's recent builds match one of
            # summary_regexp_ignore.
            continue

        latest_build = builds[0]

        # If a builder's most recent build passed, then there's no need to
        # close the tree for this builder.
        if latest_build.status == common_pb2.SUCCESS:
            continue

        builder_name = builder.name.split("/")[-1]
        failing_builders[builder_name] = latest_build

        for rule in consecutive_failures_rules:
            if len(builds) < rule.consecutive_failures:
                # This builder doesn't yet have enough history to determine
                # whether it should be closed.
                continue

            builds_to_check = builds[: rule.consecutive_failures]
            # The rule matches only if every one of the builds failed AND has
            # a failed step matching the rule's regexp.
            rule_matched = all(
                build.status != common_pb2.SUCCESS
                and _find_failed_step_match(build, rule.failed_step_regexp)
                for build in builds_to_check
            )
            if rule_matched:
                return ReasonConsecutiveFailures(
                    builder=builder_name, failed_builds=builds_to_check,
                )

    for rule in concurrent_failures_rules:
        # Ordered so that the resulting bug description lists builders in a
        # deterministic (console) order on every Python version.
        matched_builders = collections.OrderedDict()
        # We only consider the most recent build of each builder when checking
        # for concurrently failing builders.
        for failing_name, latest_build in failing_builders.items():
            if _find_failed_step_match(latest_build, rule.failed_step_regexp):
                matched_builders[failing_name] = latest_build

        if len(matched_builders) >= rule.concurrent_failures:
            return ReasonConcurrentFailures(matched_builders)

    return None
| |
| |
def _find_failed_step_match(build, step_regex):
    """Returns the first non-successful step whose name matches `step_regex`.

    Steps are examined in the order they appear in `build.steps`. Returns
    None if no step both has a non-SUCCESS status and matches the regex.
    """
    failed_matches = (
        candidate
        for candidate in build.steps
        if candidate.status != common_pb2.SUCCESS
        and re.search(step_regex, candidate.name)
    )
    return next(failed_matches, None)
| |
| |
def _last_n_builds(api, console_builder, n):
    """Searches Buildbucket for up to `n` completed builds of a console builder.

    Args:
      api: The recipe API object.
      console_builder: A console builder entry whose `name` has the form
        "buildbucket/luci.{project}.{bucket}/{builder}".
      n (int): Maximum number of builds to request.

    Returns:
      Whatever `api.buildbucket.search` returns for the query, with step
      names, step statuses and the summary included in each build.
    """
    name_parts = console_builder.name.split("/")
    _, full_bucket, builder_name = name_parts
    # The bucket itself may contain dots, so split at most twice.
    _, project, bucket = full_bucket.split(".", 2)

    builder_id = builder_pb2.BuilderID(
        project=project, bucket=bucket, builder=builder_name,
    )
    predicate = builds_service_pb2.BuildPredicate(
        builder=builder_id,
        status=common_pb2.ENDED_MASK,  # Include only completed builds.
    )

    extra_fields = {"steps.*.name", "steps.*.status", "summary_markdown"}
    return api.buildbucket.search(
        predicate,
        limit=n,
        step_name=builder_name,
        fields=api.buildbucket.DEFAULT_FIELDS.union(extra_fields),
    )
| |
| |
def _file_bug(api, monorail_path, closure_reason, bug_components, bug_labels):
    """Files a tracking bug for the tree closure and returns a link to it.

    Every tree closure needs a tracking bug linked from the tree status.

    Args:
      api: The recipe API object.
      monorail_path: Path to the monorail tool to invoke.
      closure_reason (TreeClosureReason): Supplies the bug's title and
        description.
      bug_components (seq of str): Each is passed as a `-component` flag.
      bug_labels (seq of str): Each is passed as a `-label` flag.

    Returns:
      The fxbug.dev URL of the bug, built from the "id" field of the tool's
      JSON output.
    """
    cmd = [
        monorail_path,
        "new-issue",
        "-summary",
        closure_reason.bug_title(),
        "-description",
        closure_reason.bug_description(),
    ]
    repeated_flags = (("-component", bug_components), ("-label", bug_labels))
    for flag, values in repeated_flags:
        for value in values:
            cmd += [flag, value]

    bug_step = api.step(
        "create monorail bug",
        cmd,
        stdout=api.json.output(),
        step_test_data=lambda: api.json.test_api.output_stream({"id": 605}),
    )
    link = "https://fxbug.dev/%d" % bug_step.stdout["id"]
    bug_step.presentation.links["monorail link"] = link
    return link
| |
| |
def _close_tree(api, tree_hostname, password, message):
    """Runs the tree_status.py resource script to set the tree status.

    The status is set to `message` on `tree_hostname`, using the current
    builder's name as the username together with `password`.
    """
    script = api.resource("tree_status.py")
    script_args = [tree_hostname, "set", message]
    script_args += ["--username", api.buildbucket.builder_name]
    script_args += ["--password", password]
    api.python("close tree", script, script_args)
| |
| |
def _can_close_tree(api, tree_hostname, grace_period):
    """Returns whether the tree is currently eligible to be closed.

    Fetches the current tree status with the tree_status.py resource script.
    The tree is eligible only if its state is "open" or "throttled" and that
    status has been in place for longer than `grace_period`.

    Args:
      api: The recipe API object.
      tree_hostname (str): Host of the tree status page.
      grace_period (datetime.timedelta): Minimum age of the current status.
    """
    status_step = api.python(
        "get current tree status",
        api.resource("tree_status.py"),
        [tree_hostname, "get"],
        stdout=api.json.output(),
        step_test_data=lambda: api.json.test_api.output_stream({}),
    )
    tree_status = status_step.stdout
    general_state = tree_status["general_state"]

    presentation = status_step.presentation
    presentation.step_text = general_state
    presentation.links[tree_hostname] = "https://%s" % tree_hostname

    # No need to close the tree if it's already closed.
    if general_state not in ("open", "throttled"):
        return False

    last_changed = datetime.datetime.strptime(
        tree_status["date"], TREE_STATUS_DATE_FORMAT
    )
    return api.time.utcnow() - last_changed > grace_period
| |
| |
def GenTests(api):
    """Yields the simulation test cases for this recipe."""

    def pseudo_random_build_id(builder_string):
        """Returns a deterministic build ID derived from `builder_string`."""
        # hashlib requires bytes on Python 3; encoding an ASCII str on
        # Python 2 yields the same bytes, so the resulting IDs are unchanged.
        digest = hashlib.sha256(builder_string.encode("utf-8")).hexdigest()
        return int(int(digest, 16) % 1e8)

    def test(
        name,
        builder_history=None,
        should_check_console=True,
        should_close_tree=False,
        rules=None,
        summary_regexp_ignore=(),
        tree_status="open",
        tree_status_age=DEFAULT_GRACE_PERIOD + datetime.timedelta(minutes=1),
    ):
        """Create a test case for running this recipe.

        Args:
          name (str): Name of the test case.
          builder_history (dict): Mapping from builder base name to a list
            of Build protos (see `success()` and `failure()`), corresponding
            to the builder's most recent builds in reverse chronological
            order (latest builds first).
          should_check_console (bool): Whether the recipe is expected to
            query luci-config and Buildbucket for builders and build results.
          should_close_tree (bool): Whether the recipe is expected to close
            the tree.
          rules (seq of dict): Passed as the `rules` recipe property. Defaults
            to a single rule with concurrent_failures=2 and
            consecutive_failures=2.
          summary_regexp_ignore (seq of str): Passed as a recipe property.
          tree_status (str): Mock current status for the tree.
          tree_status_age (datetime.timedelta): Mocked duration since the last
            tree status update.
        """
        if not builder_history:
            builder_history = {}

        if rules is None:
            rules = [
                dict(concurrent_failures=2, consecutive_failures=2),
            ]

        project = "fuchsia"
        bucket = "global.ci"
        console_name = "global_ci"

        res = (
            api.status_check.test(
                name, status="failure" if should_close_tree else "success"
            )
            + api.buildbucket.try_build(project=project)
            + api.properties(
                console_name=console_name,
                rules=rules,
                summary_regexp_ignore=list(summary_regexp_ignore),
                tree_status_host="example.com",
                tree_status_password="pa$$word",
                monorail_cipd_package="fuchsia/infra/monorail/${platform}",
                bug_components=["tree-closure"],
                bug_labels=["tree-closure"],
            )
        )

        now = 1592654400
        res += api.time.seed(now)

        status_date = datetime.datetime.utcfromtimestamp(now) - tree_status_age
        res += api.step_data(
            "get current tree status",
            stdout=api.json.output(
                {
                    "general_state": tree_status,
                    "key": 12345,
                    "date": status_date.strftime(TREE_STATUS_DATE_FORMAT),
                }
            ),
        )

        checker = post_process.MustRun if should_close_tree else post_process.DoesNotRun
        res += api.post_process(checker, "create monorail bug")
        res += api.post_process(checker, "close tree")

        if should_check_console:
            res += api.post_process(post_process.MustRun, "check console health")
        else:
            res += api.post_process(post_process.DoesNotRun, "check console health")
            # The recipe exits before touching the console, so no console or
            # Buildbucket mocks are needed.
            return res

        console_builders = []
        for builder in builder_history:
            console_builders.append(
                milo_pb2.Builder(
                    name="buildbucket/luci.%s.%s/%s" % (project, bucket, builder)
                )
            )

        milo_cfg = milo_pb2.Project(
            consoles=[
                milo_pb2.Console(
                    id=console_name,
                    name="global integration ci",
                    repo_url="https://fuchsia.googlesource.com/integration",
                    refs=["regexp:refs/heads/master"],
                    builders=console_builders,
                )
            ]
        )
        res += api.luci_config.mock_config("luci-milo.cfg", milo_cfg)

        # History is a list of builds, most recent first.
        for builder, history in builder_history.items():
            search_results = []
            for i, build in enumerate(history):
                bp = build.builder
                bp.builder, bp.bucket, bp.project = builder, bucket, project
                build.id = pseudo_random_build_id(builder + str(i))
                # Space the builds one day apart, newest first.
                build.start_time.seconds = int(
                    now - datetime.timedelta(days=i).total_seconds()
                )
                search_results.append(build)
            res += api.buildbucket.simulated_search_results(
                search_results, step_name="check console health.%s" % builder,
            )

        return res

    def _build(build_status, steps=None, summary_markdown="", should_ignore=False):
        """Returns a Build proto for mocking buildbucket.search requests.

        Args:
          build_status (str): A string corresponding to a common_pb2.Status
            value.
          steps (dict): A mapping from step names (with nestings indicated
            by | pipes) to status strings, also corresponding to
            common_pb2.Status values.
          summary_markdown (str): The summary to set for the build.
          should_ignore (bool): Whether to set `IGNORE_BUILDER_PROPERTY` to
            true in the build's input properties.
        """
        steps = steps or {}
        b = api.buildbucket.ci_build_message(status=build_status)
        b.summary_markdown = summary_markdown
        for step_name, status in steps.items():
            step = b.steps.add()
            step.name = step_name
            step.status = common_pb2.Status.Value(status)
        if should_ignore:
            b.input.properties.update({IGNORE_BUILDER_PROPERTY: True})
        return b

    def success(**kwargs):
        """Returns a Build proto with a successful status."""
        return _build("SUCCESS", **kwargs)

    def failure(summary_markdown="5 tests failed", **kwargs):
        """Returns a Build proto with a failure status."""
        return _build("FAILURE", summary_markdown=summary_markdown, **kwargs)

    # As long as the most recent build for each builder is green, we shouldn't
    # close the tree.
    yield test(
        name="all_green",
        builder_history=collections.OrderedDict(
            [
                ("core.arm64-asan", [success(), failure()]),
                ("core.x64-asan", [success(), failure()]),
            ],
        ),
    )

    # If any builder's most recent `consecutive_failures` builds have all failed,
    # close the tree.
    yield test(
        name="consecutive_failures",
        builder_history=collections.OrderedDict(
            [
                ("core.arm64-asan", [failure(steps={"foo": "FAILURE"}), success()]),
                (
                    "core.x64-asan",
                    [
                        failure(steps={"foo": "FAILURE"}),
                        failure(steps={"bar": "SUCCESS", "foo": "FAILURE"}),
                    ],
                ),
            ],
        ),
        rules=[dict(consecutive_failures=2, failed_step_regexp="foo")],
        should_close_tree=True,
    )

    # Builds whose summary matches `summary_regexp_ignore` are not considered,
    # so a builder whose only failure is ignored shouldn't close the tree.
    yield test(
        name="summary_regexp_ignore",
        builder_history=collections.OrderedDict(
            [
                (
                    "core.x64-asan",
                    [
                        failure(
                            summary_markdown="step failed: checkout.jiri update",
                            steps={"foo": "FAILURE"},
                        ),
                    ],
                ),
            ],
        ),
        rules=[dict(consecutive_failures=1)],
        summary_regexp_ignore=["checkout"],
        should_close_tree=False,
    )

    # Even if a builder has failed more than `consecutive_failures` times in a
    # row, don't close the tree unless all the failed builds have a failed step
    # matching `failed_step_regexp`.
    yield test(
        name="consecutive_failures_no_matched_step",
        builder_history=collections.OrderedDict(
            [
                (
                    "core.x64-asan",
                    [
                        failure(steps={"foo": "FAILURE"}),
                        failure(steps={"bar": "FAILURE"}),
                    ],
                ),
            ],
        ),
        rules=[dict(consecutive_failures=2, failed_step_regexp="foo")],
        should_close_tree=False,
    )

    # If `concurrent_failures` or more builders' most recent builds have all
    # failed, close the tree.
    yield test(
        name="concurrent_failures",
        # Built from a list of pairs: an OrderedDict built from a dict literal
        # only inherits a guaranteed order on Python 3.7+.
        builder_history=collections.OrderedDict(
            [
                ("bringup.arm64-asan", [success()]),
                ("core.arm64-asan", [failure(steps={"foo": "FAILURE"})]),
                (
                    "core.x64-asan",
                    [failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"})],
                ),
            ],
        ),
        rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=True,
    )

    # Even if `concurrent_failures` or more builders' most recent builds have
    # all failed, don't close the tree if they don't all have a failed step
    # matching `failed_step_regexp`.
    yield test(
        name="concurrent_failures_no_matched_step",
        builder_history=collections.OrderedDict(
            [
                ("bringup.arm64-asan", [failure()]),
                ("core.arm64-asan", [failure(steps={"bar": "FAILURE"})]),
                # Only this builder has a failed step matching
                # `failed_step_regexp`.
                (
                    "core.x64-asan",
                    [failure(steps={"bar": "SUCCESS", "foo|x": "FAILURE"})],
                ),
            ],
        ),
        rules=[dict(concurrent_failures=2, failed_step_regexp="foo")],
        should_close_tree=False,
    )

    # The tree status shouldn't be changed if it was only recently opened.
    yield test(
        name="tree_recently_opened",
        tree_status_age=DEFAULT_GRACE_PERIOD - datetime.timedelta(minutes=1),
        should_check_console=False,
    )

    # The build should exit early if the tree is already closed.
    yield test(
        name="tree_already_closed", tree_status="closed", should_check_console=False
    )

    # Builders with no builds shouldn't affect the tree status.
    yield test(
        name="throttled__no_history",
        # If the tree is "throttled" then it's open, so the closer should still
        # check the console for failures.
        tree_status="throttled",
        builder_history={"core.arm64-asan": []},
        should_close_tree=False,
    )

    # Builders with fewer than consecutive_failures builds shouldn't affect the
    # tree status.
    yield test(
        name="not_enough_history",
        builder_history={"core.arm64-asan": [failure()]},
        should_close_tree=False,
    )

    # If a builder's latest build has the `IGNORE_BUILDER_PROPERTY` set, then
    # the entire builder should be ignored.
    yield test(
        name="ignored_builder",
        builder_history={"core.x64-asan": [failure(should_ignore=True), failure()]},
        rules=[dict(concurrent_failures=1)],
        should_close_tree=False,
    )