| // Copyright 2018 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "watchdog.h" |
| |
| #include <assert.h> |
| #include <errno.h> |
| #include <inttypes.h> |
| #include <limits.h> |
| #include <pthread.h> |
| #include <string.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include <unittest/unittest.h> |
| #include <zircon/compiler.h> |
| |
| constexpr int WATCHDOG_ERRCODE = 5; |
| |
| constexpr uint64_t NANOSECONDS_PER_SECOND = 1000 * 1000 * 1000; |
| |
| // The watchdog thread wakes up after this many seconds to check whether |
| // a test has timed out. The lower the number this is the more accurate |
| // the watchdog is with regard to the specified timeout. But there's |
| // no point in running too frequently. The wait mechanism we use is |
| // interruptible, so this value can be high and there's no worries of waiting |
| // for the watchdog to terminate. The watchdog works this way so that we |
| // can have one watchdog thread that is continuously running instead of |
| // starting a new watchdog thread for each test. Another goal is to not |
| // require any synchronization between the watchdog thread and the test. |
| // E.g., We don't want to have to wait for the watchdog to acknowledge that |
| // a test is starting and stopping. Instead we just let it run at its own pace. |
| // Tests often complete in milliseconds, far lower than our "tick". |
| constexpr int WATCHDOG_TICK_SECONDS = 1; |
| |
| // Value stored in |active_timeout_seconds| to indicate test is not running. |
| constexpr int WATCHDOG_TIMEOUT_NOT_RUNNING = INT_MAX; |
| |
| // This can be overridden by the user by setting env var WATCHDOG_ENV_NAME. |
| static int base_timeout_seconds = DEFAULT_BASE_TIMEOUT_SECONDS; |
| |
| static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; |
| |
| // Cond used to wait on the watchdog thread starting. |
| static pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER; |
| static bool init_done = false; |
| |
| // The name of the current test. |
| // Used to report which test timed out. |
| static const char* test_name; // TA_GUARDED(mutex) |
| |
| // The current timeout in effect. |
| // When tests aren't running we set this to INT_MAX. |
| static int active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING; // TA_GUARDED(mutex) |
| |
| // The time when the test was started. |
| // This is the result of clock_gettime converted to nanoseconds. |
| static uint64_t test_start_time; // TA_GUARDED(mutex) |
| |
| // True if tests are running. |
| // Set by watchdog initialize(), reset by watchdog_terminate(). |
| static bool tests_running; // TA_GUARDED(mutex) |
| |
| static pthread_t watchdog_thread; |
| |
| // This library is used for both the host and target. |
| // For portability concerns we use pthread_cond_timedwait to get a |
| // cancelable wait. |
| static pthread_cond_t cond = PTHREAD_COND_INITIALIZER; |
| |
| static uint64_t timespec_to_nanoseconds(const struct timespec* ts) { |
| return (ts->tv_sec * NANOSECONDS_PER_SECOND) + ts->tv_nsec; |
| } |
| |
| /** |
| * Set the base timeout. |
| * |timeout| must be >= 0. |
| * A value of zero disables the timeout. |
| * The timeout must be set before calling watchdog_initialize(), and must |
| * not be changed until after watchdog_terminate() is called. |
| */ |
| void watchdog_set_base_timeout(int seconds) { |
| assert(seconds >= 0); |
| base_timeout_seconds = seconds; |
| } |
| |
| static int test_timeout_for_type(test_type_t type) { |
| int factor; |
| |
| switch (type) { |
| case TEST_SMALL: |
| factor = TEST_TIMEOUT_FACTOR_SMALL; |
| break; |
| case TEST_MEDIUM: |
| factor = TEST_TIMEOUT_FACTOR_MEDIUM; |
| break; |
| case TEST_LARGE: |
| factor = TEST_TIMEOUT_FACTOR_LARGE; |
| break; |
| case TEST_PERFORMANCE: |
| factor = TEST_TIMEOUT_FACTOR_PERFORMANCE; |
| break; |
| default: |
| __UNREACHABLE; |
| } |
| |
| int64_t timeout = base_timeout_seconds * factor; |
| if (timeout > INT_MAX) |
| timeout = INT_MAX; |
| return static_cast<int>(timeout); |
| } |
| |
| /** |
| * Return true if watchdog support is enabled for this test run. |
| */ |
| bool watchdog_is_enabled() { return base_timeout_seconds > 0; } |
| |
| static __NO_RETURN void watchdog_signal_timeout(const char* name, int timeout_seconds) { |
| unittest_printf_critical( |
| "\n\n*** WATCHDOG TIMER FIRED (%d seconds), test: %s ***\n", timeout_seconds, name); |
| exit(WATCHDOG_ERRCODE); |
| } |
| |
| static void* watchdog_thread_func(void* arg) { |
| pthread_mutex_lock(&mutex); |
| |
| init_done = true; |
| pthread_cond_signal(&init_cond); |
| |
| for (;;) { |
| // Has watchdog_terminate() been called? |
| // Test this here, before calling pthread_cond_timedwait(), so that |
| // we catch the case of all tests completing and watchdog_terminate |
| // being called before we get started. Otherwise we'll wait one tick |
| // before we notice this. |
| if (!tests_running) { |
| pthread_mutex_unlock(&mutex); |
| break; |
| } |
| |
| // Use REALTIME because that's the default clock for conds and |
| // OSX doesn't include the pthread APIs for changing it. |
| struct timespec delay; |
| clock_gettime(CLOCK_REALTIME, &delay); |
| delay.tv_sec += WATCHDOG_TICK_SECONDS; |
| // If compiled with #define NDEBUG the assert essentially goes away. |
| // Thus we need to protect |result| with __UNUSED lest the compiler |
| // complain and fail the build. |
| auto result __UNUSED = pthread_cond_timedwait(&cond, &mutex, &delay); |
| // We can time-out just as watchdog_terminate() is called, and |
| // thus we can't make any assumptions based on |result|. |
| assert(result == 0 || result == ETIMEDOUT); |
| |
| struct timespec now; |
| clock_gettime(CLOCK_MONOTONIC, &now); |
| uint64_t now_nanos = timespec_to_nanoseconds(&now); |
| assert(now_nanos >= test_start_time); |
| uint64_t elapsed_nanos = now_nanos - test_start_time; |
| |
| // Note: We skip worrying about handling the (rare) case where the |
| // test completes but before it can notify us we wake and see that |
| // the timeout has been reached. |
| uint64_t timeout_nanos = active_timeout_seconds * NANOSECONDS_PER_SECOND; |
| if (elapsed_nanos >= timeout_nanos) { |
| int timeout = active_timeout_seconds; |
| pthread_mutex_unlock(&mutex); |
| watchdog_signal_timeout(test_name, timeout); |
| /* NOTREACHED */ |
| } |
| } |
| |
| return nullptr; |
| } |
| |
| /** |
| * Start the watchdog thread. |
| * |
| * The thread begins in an idle state, waiting for watchdog_start(). |
| * This must only be called once. |
| */ |
| void watchdog_initialize() { |
| if (watchdog_is_enabled()) { |
| tests_running = true; |
| |
| // We don't want the watchdog thread to commit pages while tests are running as that |
| // muddies page usage stats. To avoid that, wait for the thread to start before running |
| // the tests. Currently, the watchdog thread always uses the unsafe stacks during |
| // initialization; if that changes, then we'll need to explicitly write to it. |
| |
| pthread_mutex_lock(&mutex); |
| int res = pthread_create(&watchdog_thread, nullptr, &watchdog_thread_func, NULL); |
| if (res == 0) { |
| while (!init_done) { |
| pthread_cond_wait(&init_cond, &mutex); |
| } |
| } |
| pthread_mutex_unlock(&mutex); |
| |
| if (res != 0) { |
| unittest_printf_critical("ERROR STARTING WATCHDOG THREAD: %d(%s)\n", res, strerror(res)); |
| exit(WATCHDOG_ERRCODE); |
| } |
| } |
| } |
| |
| /** |
| * Turn on the watchdog timer for test |name|. |
| * |
| * Storage for |name| must survive the duration of the test. |
| * |
| * If the timer goes off the process terminates. |
| * This must be called at the start of a test. |
| */ |
| void watchdog_start(test_type_t type, const char* name) { |
| if (watchdog_is_enabled()) { |
| pthread_mutex_lock(&mutex); |
| test_name = name; |
| active_timeout_seconds = test_timeout_for_type(type); |
| struct timespec now; |
| clock_gettime(CLOCK_MONOTONIC, &now); |
| test_start_time = timespec_to_nanoseconds(&now); |
| pthread_mutex_unlock(&mutex); |
| } |
| } |
| |
| /** |
| * Call this to turn off the watchdog timer. |
| * |
| * Yeah, there's a "race" if a test finishes right when we're called. |
| * We don't worry about this small window given the amount of time we wait. |
| * This must be called after watchdog_start(). |
| */ |
| void watchdog_cancel() { |
| if (watchdog_is_enabled()) { |
| pthread_mutex_lock(&mutex); |
| active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING; |
| test_name = nullptr; |
| pthread_mutex_unlock(&mutex); |
| } |
| } |
| |
| /** |
| * Terminate the watchdog thread. |
| * |
| * This must be called after all tests complete. |
| */ |
| void watchdog_terminate() { |
| // All tests must have completed. |
| assert(active_timeout_seconds == WATCHDOG_TIMEOUT_NOT_RUNNING); |
| |
| if (watchdog_is_enabled()) { |
| pthread_mutex_lock(&mutex); |
| tests_running = false; |
| __UNUSED int res = pthread_cond_signal(&cond); |
| assert(res == 0); |
| pthread_mutex_unlock(&mutex); |
| res = pthread_join(watchdog_thread, NULL); |
| assert(res == 0); |
| } |
| } |