// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "watchdog.h"

#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <unittest/unittest.h>
#include <zircon/compiler.h>

// Process exit code used when the watchdog fires, and also when the
// watchdog thread cannot be started.
constexpr int WATCHDOG_ERRCODE = 5;

// Number of nanoseconds in one second; used to convert timespecs and
// second-based timeouts to nanoseconds.
constexpr uint64_t NANOSECONDS_PER_SECOND = 1000 * 1000 * 1000;

// The watchdog thread wakes up after this many seconds to check whether
// a test has timed out. The lower this number is, the more accurately the
// watchdog tracks the specified timeout, but there's no point in running
// too frequently. The wait mechanism we use is interruptible, so this value
// can be high without making us wait for the watchdog to terminate.
// The watchdog works this way so that we can have one watchdog thread that
// is continuously running instead of starting a new watchdog thread for
// each test. Another goal is to not require any synchronization between
// the watchdog thread and the test.
// E.g., we don't want to have to wait for the watchdog to acknowledge that
// a test is starting and stopping. Instead we just let it run at its own pace.
// Tests often complete in milliseconds, far lower than our "tick".
constexpr int WATCHDOG_TICK_SECONDS = 1;

// Value stored in |active_timeout_seconds| to indicate that no test is running.
constexpr int WATCHDOG_TIMEOUT_NOT_RUNNING = INT_MAX;

// Base timeout, in seconds, that per-test-type factors are applied to.
// This can be overridden by the user by setting env var WATCHDOG_ENV_NAME.
// A value of zero means the watchdog is disabled (see watchdog_is_enabled()).
static int base_timeout_seconds = DEFAULT_BASE_TIMEOUT_SECONDS;

// Protects the variables marked TA_GUARDED(mutex) below. It is also the
// mutex paired with both |init_cond| and |cond|.
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

// Cond used to wait on the watchdog thread starting.
static pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER;
// Set to true by the watchdog thread once it is running; this is the
// predicate watchdog_initialize() waits on with |init_cond|.
static bool init_done = false;

// The name of the current test.
// Used to report which test timed out.
static const char* test_name;  // TA_GUARDED(mutex)

// The current timeout in effect.
// When tests aren't running we set this to WATCHDOG_TIMEOUT_NOT_RUNNING
// (INT_MAX).
static int active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING;  // TA_GUARDED(mutex)

// The time when the test was started.
// This is the result of clock_gettime(CLOCK_MONOTONIC) converted to
// nanoseconds.
static uint64_t test_start_time;  // TA_GUARDED(mutex)

// True if tests are running.
// Set by watchdog_initialize(), reset by watchdog_terminate().
static bool tests_running;  // TA_GUARDED(mutex)

// Handle of the watchdog thread: created by watchdog_initialize() and
// joined by watchdog_terminate().
static pthread_t watchdog_thread;

// This library is used for both the host and target.
// For portability concerns we use pthread_cond_timedwait to get a
// cancelable wait. watchdog_terminate() signals this cond to wake the
// watchdog thread before its tick expires.
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

// Flattens |ts| into a single nanosecond count: the whole seconds scaled
// by NANOSECONDS_PER_SECOND, plus the sub-second nanosecond field.
static uint64_t timespec_to_nanoseconds(const struct timespec* ts) {
  const uint64_t whole_seconds_nanos = ts->tv_sec * NANOSECONDS_PER_SECOND;
  return whole_seconds_nanos + ts->tv_nsec;
}

/**
 * Set the base timeout, in seconds.
 * |seconds| must be >= 0; this is only enforced by assert(), so it is not
 * checked when compiled with NDEBUG.
 * A value of zero disables the watchdog (see watchdog_is_enabled()).
 * The timeout must be set before calling watchdog_initialize(), and must
 * not be changed until after watchdog_terminate() is called.
 */
void watchdog_set_base_timeout(int seconds) {
  assert(seconds >= 0);
  base_timeout_seconds = seconds;
}

// Returns the timeout, in seconds, for a test of the given |type|: the base
// timeout multiplied by the per-type factor, saturated at INT_MAX.
// |type| must be one of the known test types; any other value is treated
// as unreachable.
static int test_timeout_for_type(test_type_t type) {
  int factor;

  switch (type) {
    case TEST_SMALL:
      factor = TEST_TIMEOUT_FACTOR_SMALL;
      break;
    case TEST_MEDIUM:
      factor = TEST_TIMEOUT_FACTOR_MEDIUM;
      break;
    case TEST_LARGE:
      factor = TEST_TIMEOUT_FACTOR_LARGE;
      break;
    case TEST_PERFORMANCE:
      factor = TEST_TIMEOUT_FACTOR_PERFORMANCE;
      break;
    default:
      __UNREACHABLE;
  }

  // Widen one operand *before* multiplying. Multiplying two ints and then
  // assigning to an int64_t would overflow in int (undefined behavior) for
  // large base timeouts, and the clamp below could never take effect.
  int64_t timeout = static_cast<int64_t>(base_timeout_seconds) * factor;
  if (timeout > INT_MAX)
    timeout = INT_MAX;
  return static_cast<int>(timeout);
}

/**
 * Report whether the watchdog is active for this test run.
 * It is active exactly when the base timeout is strictly positive.
 */
bool watchdog_is_enabled() {
  const bool enabled = 0 < base_timeout_seconds;
  return enabled;
}

// Reports that the watchdog fired for test |name| after |timeout_seconds|
// seconds, then terminates the process with exit code WATCHDOG_ERRCODE.
// Never returns.
// NOTE(review): |name| is formatted with %s, so callers presumably need to
// pass a non-null string -- confirm against unittest_printf_critical.
static __NO_RETURN void watchdog_signal_timeout(const char* name, int timeout_seconds) {
  unittest_printf_critical(
    "\n\n*** WATCHDOG TIMER FIRED (%d seconds), test: %s ***\n", timeout_seconds, name);
  exit(WATCHDOG_ERRCODE);
}

// Body of the watchdog thread.
//
// Announces that it has started (via |init_done| / |init_cond|), then loops:
// each iteration waits on |cond| for up to WATCHDOG_TICK_SECONDS and checks
// whether the currently active timeout has elapsed since |test_start_time|.
// If it has, the process is terminated via watchdog_signal_timeout().
// The loop exits, and the thread returns nullptr, once |tests_running| is
// false. |arg| is unused.
//
// |mutex| is held for the whole function except while blocked in
// pthread_cond_timedwait(), which releases it for the duration of the wait.
static void* watchdog_thread_func(void* arg) {
  pthread_mutex_lock(&mutex);

  // Tell watchdog_initialize() that the thread is up and running.
  init_done = true;
  pthread_cond_signal(&init_cond);

  for (;;) {
    // Has watchdog_terminate() been called?
    // Test this here, before calling pthread_cond_timedwait(), so that
    // we catch the case of all tests completing and watchdog_terminate
    // being called before we get started. Otherwise we'll wait one tick
    // before we notice this.
    if (!tests_running) {
      pthread_mutex_unlock(&mutex);
      break;
    }

    // Use REALTIME because that's the default clock for conds and
    // OSX doesn't include the pthread APIs for changing it.
    struct timespec delay;
    clock_gettime(CLOCK_REALTIME, &delay);
    delay.tv_sec += WATCHDOG_TICK_SECONDS;
    // If compiled with #define NDEBUG the assert essentially goes away.
    // Thus we need to protect |result| with __UNUSED lest the compiler
    // complain and fail the build.
    auto result __UNUSED = pthread_cond_timedwait(&cond, &mutex, &delay);
    // We can time-out just as watchdog_terminate() is called, and
    // thus we can't make any assumptions based on |result|.
    assert(result == 0 || result == ETIMEDOUT);

    // Elapsed test time is measured on the monotonic clock, matching the
    // clock watchdog_start() uses to record |test_start_time|.
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    uint64_t now_nanos = timespec_to_nanoseconds(&now);
    assert(now_nanos >= test_start_time);
    uint64_t elapsed_nanos = now_nanos - test_start_time;

    // Note: We skip worrying about handling the (rare) case where the
    // test completes but before it can notify us we wake and see that
    // the timeout has been reached.
    uint64_t timeout_nanos = active_timeout_seconds * NANOSECONDS_PER_SECOND;
    if (elapsed_nanos >= timeout_nanos) {
      // Snapshot all guarded state while the mutex is still held. Reading
      // |test_name| after unlocking would race with watchdog_cancel(),
      // which resets it to nullptr.
      const char* name = test_name;
      int timeout = active_timeout_seconds;
      pthread_mutex_unlock(&mutex);
      watchdog_signal_timeout(name, timeout);
      /* NOTREACHED */
    }
  }

  return nullptr;
}

/**
 * Start the watchdog thread.
 *
 * Does nothing if the watchdog is disabled. Otherwise marks tests as
 * running, creates the watchdog thread and blocks until that thread has
 * reported that it started. If the thread cannot be created, an error is
 * printed and the process exits with WATCHDOG_ERRCODE.
 *
 * Until watchdog_start() is called no timeout is in effect.
 * This must only be called once.
 */
void watchdog_initialize() {
  if (!watchdog_is_enabled()) {
    return;
  }

  // We don't want the watchdog thread to commit pages while tests are running as that
  // muddies page usage stats. To avoid that, wait for the thread to start before running
  // the tests. Currently, the watchdog thread always uses the unsafe stacks during
  // initialization; if that changes, then we'll need to explicitly write to it.

  pthread_mutex_lock(&mutex);
  // |tests_running| is guarded by |mutex|, so set it with the lock held. It
  // must be true before the new thread first inspects it, which is
  // guaranteed because the thread has to acquire |mutex| first.
  tests_running = true;
  int res = pthread_create(&watchdog_thread, nullptr, &watchdog_thread_func, nullptr);
  if (res == 0) {
    // Loop on the predicate to tolerate spurious wakeups.
    while (!init_done) {
      pthread_cond_wait(&init_cond, &mutex);
    }
  }
  pthread_mutex_unlock(&mutex);

  if (res != 0) {
    unittest_printf_critical("ERROR STARTING WATCHDOG THREAD: %d(%s)\n", res, strerror(res));
    exit(WATCHDOG_ERRCODE);
  }
}

/**
 * Arm the watchdog timer for test |name| of type |type|.
 *
 * Records the test name, the timeout derived from |type|, and the current
 * monotonic time as the test's start time. Does nothing if the watchdog
 * is disabled.
 *
 * Storage for |name| must survive the duration of the test.
 *
 * If the timer goes off the process terminates.
 * This must be called at the start of a test.
 */
void watchdog_start(test_type_t type, const char* name) {
  if (!watchdog_is_enabled()) {
    return;
  }

  pthread_mutex_lock(&mutex);

  // All three pieces of per-test state are updated in one critical section
  // so the watchdog thread never observes a partially armed timer.
  struct timespec started_at;
  clock_gettime(CLOCK_MONOTONIC, &started_at);
  test_start_time = timespec_to_nanoseconds(&started_at);
  active_timeout_seconds = test_timeout_for_type(type);
  test_name = name;

  pthread_mutex_unlock(&mutex);
}

/**
 * Disarm the watchdog timer.
 *
 * Resets the active timeout to "not running" and clears the recorded test
 * name. Does nothing if the watchdog is disabled.
 *
 * There is a small "race" if the timeout expires right as the test
 * finishes and calls us; given how long the timeouts are, that window is
 * not worth worrying about.
 * This must be called after watchdog_start().
 */
void watchdog_cancel() {
  if (!watchdog_is_enabled()) {
    return;
  }

  pthread_mutex_lock(&mutex);
  test_name = nullptr;
  active_timeout_seconds = WATCHDOG_TIMEOUT_NOT_RUNNING;
  pthread_mutex_unlock(&mutex);
}

/**
 * Terminate the watchdog thread.
 *
 * This must be called after all tests complete.
 */
void watchdog_terminate() {
  // All tests must have completed.
  assert(active_timeout_seconds == WATCHDOG_TIMEOUT_NOT_RUNNING);

  if (watchdog_is_enabled()) {
    pthread_mutex_lock(&mutex);
    tests_running = false;
    __UNUSED int res = pthread_cond_signal(&cond);
    assert(res == 0);
    pthread_mutex_unlock(&mutex);
    res = pthread_join(watchdog_thread, NULL);
    assert(res == 0);
  }
}
