# blob: 58f0ecbd043214af042346abed478367bc176610
# Copyright 2019 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Recipe for running zbi tests."""
import copy
import re
from RECIPE_MODULES.fuchsia.swarming_retry import api as swarming_retry_api
from PB.recipes.fuchsia.zbi_test import InputProperties
PYTHON_VERSION_COMPATIBILITY = "PY3"

# Recipe modules this recipe depends on, resolved by the recipe engine.
DEPS = [
    "fuchsia/artifacts",
    "fuchsia/autocorrelator",
    "fuchsia/build",
    "fuchsia/buildbucket_util",
    "fuchsia/cas_util",
    "fuchsia/checkout",
    "fuchsia/emu",
    "fuchsia/swarming_retry",
    "fuchsia/symbolize",
    "fuchsia/testing_requests",
    "fuchsia/testsharder",
    "recipe_engine/buildbucket",
    "recipe_engine/file",
    "recipe_engine/path",
    "recipe_engine/platform",
    "recipe_engine/properties",
    "recipe_engine/step",
    "recipe_engine/swarming",
]

PROPERTIES = InputProperties

# How long to wait (in seconds) before forcibly terminating the test swarming
# task if there's no output being produced.
TEST_IO_TIMEOUT_SECS = 300

# How long a pending test swarming task waits to be scheduled on a bot.
# We should never expire a test task. This is currently 5 hours, but
# should be treated as infinite.
TEST_EXPIRATION_TIMEOUT_SECS = 18000

# How long the test is allowed to run before swarming cancels it.
TEST_EXECUTION_TIMEOUT_SECS = 600

# NOTE(review): not referenced in this file view; presumably the device
# config path botanist expects on the bot — confirm before removing.
BOTANIST_DEVICE_CONFIG = "/etc/botanist/config.json"

# Name of the image manifest written into each task's input tree.
IMAGES_JSON = "images.json"

# NOTE(review): not referenced in this file view — confirm still needed.
QEMU_KERNEL_NAME = "qemu-kernel"

# The log level to use for botanist invocations in test tasks. Can be one of
# "fatal", "error", "warning", "info", "debug", or "trace", where "trace" is
# the most verbose, and fatal is the least.
BOTANIST_LOG_LEVEL = "debug"

# Substrings in task output that indicate a known catastrophic failure, even
# if the task otherwise appeared to succeed (see match_special_failure()).
SPECIAL_FAILURE_LOG_STRINGS = (
    "ZIRCON KERNEL PANIC",
    "ZIRCON KERNEL OOPS",
    "DEVICE SUSPEND TIMED OUT",
    "ASSERT FAILED",
)

# Subset of the above that may legitimately appear in *test* output (e.g. an
# intentional assertion or death-test case), so they are skipped when the
# output being scanned came from a test.
SPECIAL_FAILURE_LOG_STRINGS_POSSIBLY_FOUND_IN_TESTS = (
    "DEVICE SUSPEND TIMED OUT",
    "ASSERT FAILED",
)

# Regex patterns treated the same way as SPECIAL_FAILURE_LOG_STRINGS.
SPECIAL_FAILURE_LOG_PATTERNS = (re.compile(r"ERROR: [A-Za-z]+Sanitizer"),)
def prebuilt_path(api, checkout_root, *path):
    """Returns the Path to the host-platform subdir under the given subdirs.

    Args:
      api: recipe API object; only `api.platform` is consulted, to derive
        the host `<os>-<arch>` directory name.
      checkout_root (Path): root of the checkout containing `prebuilt/`.
      *path: subdirectory names beneath `prebuilt/`.

    Returns:
      Path: checkout_root/prebuilt/<path...>/<os>-<arch>.
    """
    # Map the recipe engine's arch name to Fuchsia's prebuilt naming.
    host_arch = {"intel": "x64"}[api.platform.arch]
    platform_dir = "%s-%s" % (api.platform.name, host_arch)
    segments = list(path) + [platform_dir]
    return checkout_root.join("prebuilt", *segments)
def match_special_failure(output, test_output=False):
    """Scans `output` for known catastrophic failure signatures.

    Args:
      output (str): the output to search.
      test_output (bool): whether the output came from a test, in which case
        failure modes that a test may trigger intentionally (e.g. an
        assertion or death-test case) are excluded from the scan.

    Returns:
      None, or a string suitable for use as a `failure_reason`.
    """
    candidates = SPECIAL_FAILURE_LOG_STRINGS
    if test_output:
        candidates = tuple(
            s
            for s in candidates
            if s not in SPECIAL_FAILURE_LOG_STRINGS_POSSIBLY_FOUND_IN_TESTS
        )
    for candidate in candidates:
        if candidate in output:
            return candidate
    for pattern in SPECIAL_FAILURE_LOG_PATTERNS:  # pragma: nocover
        hit = pattern.search(output)
        if hit:
            return hit.group(0)
    return None
class Task(swarming_retry_api.TriggeredTask):
    """A retryable swarming task that boots one ZBI test and watches serial.

    The task's command runs `seriallistener` (under botanist), which reads
    the serial output botanist forwards and exits successfully iff the ZBI
    test's `success_string` appears before the timeout.
    """

    # TODO(fxbug.dev/50072) The task request construction logic in this method
    # should be abstracted into a helper in api.testing_requests so, for example,
    # the QEMU configuration here does not diverge from the QEMU configurations
    # elsewhere.
    def __init__(
        self,
        api,
        name,
        task_requester,
        build_results,
        zbi_test,
        qemu_kernel,
        zedboot_images,
        device_type,
        **kwargs
    ):
        # NOTE(review): self._request is reassigned at the end of this method
        # with the request built by task_requester; this assumes the parent
        # class reads the attribute rather than the value passed to
        # super().__init__ — confirm against swarming_retry.TriggeredTask.
        self._request = api.swarming.task_request().with_name(name)
        super(Task, self).__init__(self._request, api, **kwargs)
        self._checkout = build_results.checkout
        self._gn_results = build_results.gn_results
        is_emu_type = api.emu.is_emulator_type(device_type)
        # `images` holds a reference to zbi_test, so the renaming hack below
        # is reflected in the image manifest written later.
        images = [zbi_test]
        if is_emu_type:
            images.append(qemu_kernel)
            # Hack. Botanist's QEMU codepath currently expects to run something with
            # a name of "zircon-a" and a type of "zbi".
            zbi_test["name"] = "zircon-a"
            zbi_test["type"] = "zbi"
        images.extend(zedboot_images)
        # Stage the task inputs in a temp tree, mirroring the build dir's
        # position relative to the checkout root so relative image paths in
        # the manifest resolve the same way on the bot.
        task_input_tree = api.cas_util.tree(root=api.path.mkdtemp("isolate"))
        relative_cwd = api.path.relpath(
            self._gn_results.build_dir, self._checkout.root_dir
        )
        api.file.ensure_directory(
            "ensure relative cwd", task_input_tree.root.join(relative_cwd)
        )
        # Link (not copy) each image from the build dir into the input tree.
        for img in images:
            task_input_tree.register_link(
                target=self._gn_results.build_dir.join(img["path"]),
                linkname=task_input_tree.root.join(relative_cwd, img["path"]),
            )
        build_results.images = images
        image_manifest_path = task_input_tree.root.join(relative_cwd, IMAGES_JSON)
        api.file.write_json(
            "write image manifest", image_manifest_path, images, indent=2
        )
        cmd = [
            # botanist will run the following as a subprocess. seriallistener
            # is responsible for reading in the serial output that botanist
            # forwards to a socket; it will exit(0) if it sees it or else it
            # will time out and fail.
            "./seriallistener",
            "-timeout",
            "%ss" % zbi_test.get("timeout", TEST_EXECUTION_TIMEOUT_SECS),
            # If of emulator_type, then we are already redirecting serial to
            # stdout. Do not double redirect.
            "-stdout=%s" % (not is_emu_type),
            "-success-str",
            zbi_test["success_string"],
        ]
        # A single-task shard carrying only the device_type dimension.
        zbi_shard = api.testsharder.Shard(
            name, [], dimensions={"device_type": device_type}
        )
        self._request = task_requester.request(
            zbi_shard,
            build_results,
            cmd,
            task_input_tree,
            ["seriallistener"],
            IMAGES_JSON,
            BOTANIST_LOG_LEVEL,
            relative_cwd,
        )

    def process_result(self, attempt):
        """Symbolizes the attempt's logs and flags special failure modes.

        Attaches symbolized stdout (and the serial log, when present) to
        `attempt.logs`, and sets `attempt.failure_reason` if the output
        matches a known catastrophic failure signature.
        """
        assert attempt.result
        result = attempt.result
        symbolizer_tool = self._gn_results.tool("symbolizer")
        clang_dir = prebuilt_path(
            self._api, self._checkout.root_dir, "third_party", "clang"
        )
        # Symbols may live in the build's .build-id dir or in the prebuilt
        # toolchain's debug dir.
        build_id_dirs = (
            self._gn_results.build_dir.join(".build-id"),
            clang_dir.join("lib", "debug", ".build-id"),
        )
        attempt.logs[
            self._api.testing_requests.TEST_TASK_OUTPUT_FILE
        ] = self._api.symbolize(
            symbolizer_tool=symbolizer_tool,
            build_id_dirs=build_id_dirs,
            data=result.output,
            name="symbolize %s" % self._api.testing_requests.TEST_TASK_OUTPUT_FILE,
        )
        serial_log_name = self._api.testing_requests.SERIAL_LOG_NAME
        if serial_log_name in result.outputs:
            serial_log_contents = self._api.file.read_text(
                "read %s" % serial_log_name,
                result.outputs[serial_log_name],
                include_log=False,
                test_data="[00004.791] 00000.01025> CPU 1: 01:00:02:02:02:02",
            )
            attempt.logs[serial_log_name] = self._api.symbolize(
                symbolizer_tool=symbolizer_tool,
                build_id_dirs=build_id_dirs,
                data=serial_log_contents,
                name="symbolize %s" % serial_log_name,
            )
        # A kernel panic may be present in the logs even if the task timed
        # out, so check for that first.
        special_failure = match_special_failure(result.output, test_output=True)
        if special_failure is not None:
            attempt.failure_reason = special_failure

    def present_attempt(self, task_step, attempt, **kwargs):
        """Emits a step presenting one attempt's outcome, logs, and task link."""
        del task_step, kwargs  # Unused.
        name = "%s (%s)" % (attempt.name, "pass" if attempt.success else "fail")
        step = self._api.step.empty(name)
        step.presentation.step_summary_text = attempt.failure_reason
        step.presentation.links["task UI"] = attempt.task_ui_link
        # Sort for deterministic log ordering in the step presentation.
        for log, data in sorted(attempt.logs.items()):
            step.presentation.logs[log] = data
def RunSteps(api, props):
    """Builds and executes Zircon tests in QEMU on a different machine.

    Checks out and builds Fuchsia per the fint params, uploads artifacts,
    then triggers one swarming task per (zbi test, allowed device type) and
    retries/presents them. On failure in a tryjob, consults the
    autocorrelator to detect failures correlated with other builds.
    """
    assert props.manifest
    assert props.remote
    checkout = api.checkout.fuchsia_with_options(
        manifest=props.manifest,
        remote=props.remote,
    )
    build_results = api.build.with_options(
        checkout=checkout,
        fint_params_path=props.fint_params_path,
    )
    # A falsy build_results means the build was skipped as unaffected by the
    # change under test; record that and exit early.
    if not build_results:
        step = api.step.empty("build is unaffected")
        step.presentation.properties["skipped_because_unaffected"] = True
        return
    zedboot_imgs = build_results.zedboot_images
    qemu_kernels = build_results.zbi_test_qemu_kernel_images
    with api.step.nest("record affected_tests_no_work") as presentation:
        presentation.properties["affected_tests_no_work"] = build_results.no_work
    if build_results.no_work:
        return
    if props.artifact_gcs_bucket:
        api.artifacts.gcs_bucket = props.artifact_gcs_bucket
        api.artifacts.namespace = api.buildbucket_util.id
        api.artifacts.upload("upload artifacts", build_results)
    task_requester = api.testing_requests.get_task_requester(
        buildbucket_build=api.buildbucket.build,
        pave=False,
        pool=props.test_pool,
        swarming_expiration_timeout_secs=TEST_EXPIRATION_TIMEOUT_SECS,
        swarming_io_timeout_secs=TEST_IO_TIMEOUT_SECS,
        timeout_secs=TEST_EXECUTION_TIMEOUT_SECS,
        use_runtests=False,
        default_service_account="",
        targets_serial=True,
        catapult_dashboard_master="",
        catapult_dashboard_bot="",
        release_branch="",
        release_version="",
        zircon_args=[],
    )
    tasks = []
    with api.step.nest("prepare tests"):
        # Sort by name for deterministic task/step ordering.
        for zbi_test in sorted(
            build_results.gn_results.zbi_tests, key=lambda t: t["name"]
        ):
            name = zbi_test["name"]
            if zbi_test.get("disabled", False):
                continue  # pragma: no cover
            # Only schedule on device types both allowed by properties and
            # requested by the test (default: QEMU).
            allowed = set(props.allowed_device_types)
            specified = set(zbi_test.get("device_types", ["QEMU"]))
            device_types = allowed.intersection(specified)
            for device_type in sorted(device_types):
                task_name = "%s - %s" % (name, device_type)
                with api.step.nest(task_name):
                    tasks.append(
                        Task(
                            api,
                            name=task_name,
                            task_requester=task_requester,
                            build_results=build_results,
                            # Copy the image objects, as they are shared across different
                            # task requests and each may need to modify its object.
                            zbi_test=copy.deepcopy(zbi_test),
                            # Non-emulator tasks won't have a QEMU kernel
                            # image, hence get().
                            qemu_kernel=copy.deepcopy(qemu_kernels.get(name)),
                            zedboot_images=zedboot_imgs,
                            device_type=device_type,
                        )
                    )
    try:
        api.swarming_retry.run_and_present_tasks(
            tasks,
            collect_output_dir=api.path.mkdtemp("swarming"),
            max_attempts=props.max_attempts_per_test,
        )
    except api.step.StepFailure as exc:
        # In tryjobs, check whether the failure correlates with failures in
        # other try/CI builds before re-raising.
        if api.buildbucket_util.is_tryjob:
            with api.step.nest("check for correlated failures") as parent_step:
                api.autocorrelator.check_try(
                    "check try",
                    exc,
                    exc.reason,
                    ignore_failed_build=True,
                    ignore_skipped_tests=True,
                )
                api.autocorrelator.check_ci(
                    "check ci",
                    checkout.integration_revision,
                    exc,
                    exc.reason,
                )
                api.autocorrelator.set_properties(parent_step)
        raise api.autocorrelator.compose_exception(exc)
def GenTests(api):
    """Generates simulation test cases for this recipe."""

    def test(name, zbi_test=None, output=(), tryjob=False, status="success"):
        """Builds one simulation test case.

        Args:
          name (str): test case name.
          zbi_test (dict or None): zbi test manifest entry; when None, only
            checkout/build step data is mocked (no tasks triggered).
          output (seq[str]): per-attempt task output; a second element
            simulates a retry after a failed first attempt.
          tryjob (bool): whether to simulate a tryjob build.
          status (str): expected final build status.
        """
        test = api.buildbucket_util.test(name, tryjob=tryjob, status=status)
        test += api.checkout.source_info(
            [
                {
                    "name": "integration",
                    "remote": "https://fuchsia.googlesource.com/integration",
                    "revision": "a491082dc1b632bbcd60ba3618d20b503c2de738",
                    "relativePath": "integration",
                },
                {
                    "name": "fuchsia",
                    "remote": "https://fuchsia.googlesource.com/fuchsia",
                    "revision": "a491082dc1b632bbcd60ba3618d20b503c2de738",
                    "relativePath": ".",
                },
            ]
        )
        test += api.properties(
            manifest="manifest",
            remote="https://fuchsia.googlesource.com/fuchsia",
            fint_params_path="fint_params/zbi_test-arm64.textproto",
            ninja_targets=["bundles:infratools"],
            allowed_device_types=["QEMU"],
            test_pool="fuchsia.tests",
            artifact_gcs_bucket="fuchsia-artifacts",
        )
        if not zbi_test:
            return test
        device_types = zbi_test.get("device_types", ["QEMU"])
        test += api.properties(allowed_device_types=device_types)
        test += api.build.fint_set_artifacts(
            metadata=dict(
                optimize="debug",
                product="products/bringup.gni",
                target_arch=zbi_test["cpu"],
                variants=["clang"],
            )
        )
        test += api.build.fint_build_artifacts(
            zbi_test_qemu_kernel_images={
                "zbi-test-arm64": {"path": "zbi-test-arm64.bin", "type": "bin"},
            }
        )
        test += api.step_data(
            "prepare tests.read zbi test manifest",
            api.file.read_json([zbi_test]),
        )

        def get_task_data(task_name, task_id, output):
            """Mocks one collected task result, pass/fail by success string."""
            if zbi_test["success_string"] not in output:
                return api.swarming_retry.failed_task(
                    task_name, task_id=task_id, output=output
                )
            return api.swarming_retry.passed_task(
                task_name,
                task_id=task_id,
                output=output,
                outputs=[api.testing_requests.SERIAL_LOG_NAME],
            )

        # Assign sequential fake task ids across all triggered attempts.
        task_id = 123
        task_data = []
        task_retry_data = []
        failed_first_attempt = len(output) > 1
        for device in device_types:
            task_name = "%s - %s" % (zbi_test["name"], device)
            test += api.swarming_retry.trigger_data(
                name=task_name, task_id=task_id, iteration=0
            )
            task_data.append(
                get_task_data(task_name, task_id=task_id, output=output[0])
            )
            if failed_first_attempt:
                task_id += 1
                test += api.swarming_retry.trigger_data(
                    name=task_name, task_id=task_id, iteration=1
                )
                task_retry_data.append(
                    get_task_data(task_name, task_id=task_id, output=output[1])
                )
            task_id += 1
        test += api.swarming_retry.collect_data(task_data, iteration=0)
        if failed_first_attempt:
            test += api.swarming_retry.collect_data(task_retry_data, iteration=1)
        return test

    # Canonical zbi test manifest entry used by all test cases below.
    zbi_test = {
        "cpu": "arm64",
        "name": "zbi-test-arm64",
        "type": "zbi",
        "success_string": "purple monkey dishwasher",
        "path": "zbi-test-arm64.zbi",
        "device_types": ["QEMU", "AEMU", "DEVICE_TYPE"],
        "timeout": 60,
        "qemu_kernel_label": "//build/images:qemu_kernel_override",
    }

    yield test("pass", zbi_test, [zbi_test["success_string"]])
    yield (
        test(
            "fail",
            zbi_test,
            ["not success", "not success"],
            tryjob=True,
            status="failure",
        )
        + api.autocorrelator.check_try(
            "check for correlated failures.check try",
            test_data=[{"build_id": "456", "score": 0.98, "is_green": False}],
        )
        + api.autocorrelator.check_ci(
            "check for correlated failures.check ci",
            test_data={
                "build_id": "789",
                "score": 0.95,
                "is_green": False,
                "commit_dist": 0,
            },
        )
        + api.properties(**{"$fuchsia/autocorrelator": {"ci_bucket": "ci"}})
    )
    yield test("flake", zbi_test, ["not success", zbi_test["success_string"]])
    # Including success string and "ZIRCON KERNEL PANIC" in output to clarify
    # that this fails because of the kernel panic and not because the
    # success string is not present.
    output = zbi_test["success_string"] + "ZIRCON KERNEL PANIC"
    yield test("kernel_panic", zbi_test, [output, output], status="failure")
    yield (
        test("affected_tests_no_work", tryjob=True)
        + api.properties(fint_params_path="specs/zbi_test-x64.textproto")
        + api.build.fint_set_artifacts(skip_build=False)
        + api.build.fint_build_artifacts(build_not_affected=True)
    )
    yield (
        test("skip_if_unaffected", tryjob=True)
        + api.build.fint_set_artifacts(skip_build=True)
    )