// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <bits.h>
#include <lib/boot-options/boot-options.h>
#include <lib/console.h>
#include <pow2.h>
#include <trace.h>

#include <arch/arch_ops.h>
#include <arch/mp.h>
#include <arch/x86/feature.h>
#include <arch/x86/platform_access.h>
#include <kernel/percpu.h>
#include <kernel/timer.h>
#include <platform/pc/acpi.h>

#include "system_priv.h"

#define LOCAL_TRACE 0

namespace {

// Largest raw value that fits in the 15-bit "Power Limit 1" field (bits 14:0) of
// MSR_PKG_POWER_LIMIT.
static constexpr uint64_t kMaxLongTermPowerLimit = (uint64_t{1} << 15) - 1;

// Default PL1 time-window field. Intel recommends a window of 28s; encoded as Z = 3 (upper two
// bits) and Y = 14 (lower five bits), i.e. 2^14 * (1 + 3/4) time units.
static constexpr uint64_t kDefaultTimeWindow = (uint64_t{3} << 5) | 14;

// Enable and clamp control bits of MSR_PKG_POWER_LIMIT for both power limits.
// Intel Volume 3 Section 14.9.3.
static constexpr uint64_t kPowerLimitPl1Enable = uint64_t{1} << 15;
static constexpr uint64_t kPowerLimitPl1Clamp = uint64_t{1} << 16;
static constexpr uint64_t kPowerLimitPl2Enable = uint64_t{1} << 47;
static constexpr uint64_t kPowerLimitPl2Clamp = uint64_t{1} << 48;

// Human-readable names for the status bits of the performance-limit-reasons MSR that this file
// reads as X86_MSR_PERF_LIMIT_REASONS. Bits without an entry here are not reported.
// See Intel Volume 4 Table 2-39.
static constexpr struct {
  uint64_t bit;
  const char* str;
} kLimitReasons[] = {
    {1ull << 0, "PROCHOT"},
    {1ull << 1, "Thermal event"},
    {1ull << 4, "Residency state regulation limit"},
    {1ull << 5, "Running average thermal limit"},
    {1ull << 6, "Voltage regulator (VR) thermal alert"},
    {1ull << 7, "Voltage regulator (VR) thermal design current limit"},
    {1ull << 8, "Other"},
    {1ull << 10, "Package/platform-Level PL1"},
    {1ull << 11, "Package/platform-Level PL2"},
    {1ull << 12, "Max turbo limit"},
    {1ull << 13, "Turbo transition attenuation"},
};

// Human-readable names for the status bits of MSR_GRAPHICS_PERF_LIMIT_REASONS (Intel Volume 4
// Table 2-39). Bits without an entry here are not reported.
static constexpr struct {
  uint64_t bit;
  const char* str;
} kLimitReasonsGfx[] = {
    {1ull << 0, "PROCHOT"},
    {1ull << 1, "Thermal event"},
    {1ull << 5, "Running average thermal limit"},
    {1ull << 6, "Voltage regulator (VR) thermal alert"},
    {1ull << 7, "Voltage regulator (VR) thermal design current limit"},
    {1ull << 8, "Other"},
    {1ull << 10, "Package/platform-Level PL1"},
    {1ull << 11, "Package/platform-Level PL2"},
    {1ull << 12, "Inefficient operation"},
};

// Right-shift that moves the latched "log" bits (29:16) of a limit-reasons MSR down onto the
// positions of the corresponding status bits (13:0).
constexpr uint64_t kLimitReasonsLogShift = 16;

// Scaling factors decoded from MSR_RAPL_POWER_UNIT. Each member is the size of one raw RAPL unit,
// expressed in a finer-grained integer unit to keep precision.
struct rapl_units {
  // Milliwatts per raw power unit.
  uint32_t power_mw;
  // Microseconds per raw time unit.
  uint32_t time_us;
  // Microjoules per raw energy unit.
  uint32_t energy_uj;
};

// Reads MSR_RAPL_POWER_UNIT through |msr| and converts its three exponent fields into integer
// scaling factors shared by all RAPL domains.
//
// Each field holds an unsigned exponent E, and one raw unit equals 1/2^E of the base unit:
//   Power Units  [3:0]   -> watts
//   Energy Units [12:8]  -> joules
//   Time Units   [19:16] -> seconds
//
// Based on Intel Software Manual vol 3, chapter 14.9.
//
// For better precision the result is reported in milliwatts, microjoules and microseconds.
rapl_units GetUnits(MsrAccess* msr) {
  const uint64_t unit_reg = msr->read_msr(X86_MSR_RAPL_POWER_UNIT);

  const auto power_exp = BITS_SHIFT(unit_reg, 3, 0);
  const auto time_exp = BITS_SHIFT(unit_reg, 19, 16);
  const auto energy_exp = BITS_SHIFT(unit_reg, 12, 8);

  return rapl_units{
      .power_mw = 1000u / (1 << power_exp),
      .time_us = 1000000u / (1 << time_exp),
      .energy_uj = 1000000u / (1 << energy_exp),
  };
}

// Programs package power limit 1 (PL1) in MSR_PKG_POWER_LIMIT from |arg->x86_power_limit|.
//
// |arg| carries the limit in mW and the averaging window in us; both are converted to raw RAPL
// units before being written. A power limit of 0 selects the thermal spec power reported by
// MSR_PKG_POWER_INFO, and a time window of 0 selects kDefaultTimeWindow.
//
// Returns ZX_ERR_NOT_SUPPORTED on microarchitectures other than those listed below,
// ZX_ERR_INVALID_ARGS if the converted power limit exceeds kMaxLongTermPowerLimit (nothing is
// written in that case), and ZX_OK once the MSR has been written.
//
// Layout of the PL1 half of MSR_PKG_POWER_LIMIT (Intel Software Manual vol 3, chapter 14.9):
//   Power limit 1 [14:0]:  average power limit over time window 1, in MSR_RAPL_POWER_UNIT units
//   Enable        [15]:    0 = disabled, 1 = enabled
//   Clamp         [16]:    allow going below the OS requested p/t states
//   Time window   [23:17]: 2^Y * (1.0 + Z/4.0) * Time_Unit, Y = bits [21:17], Z = bits [23:22]
zx_status_t SetPkgPl1(const zx_system_powerctl_arg_t* arg, MsrAccess* msr) {
  const auto microarch = x86_get_microarch_config()->x86_microarch;
  const bool supported = (microarch == X86_MICROARCH_INTEL_SANDY_BRIDGE) ||
                         (microarch == X86_MICROARCH_INTEL_SILVERMONT) ||
                         (microarch == X86_MICROARCH_INTEL_BROADWELL) ||
                         (microarch == X86_MICROARCH_INTEL_HASWELL) ||
                         (microarch == X86_MICROARCH_INTEL_SKYLAKE);
  if (!supported) {
    return ZX_ERR_NOT_SUPPORTED;
  }

  const uint32_t requested_mw = arg->x86_power_limit.power_limit;
  const uint32_t requested_us = arg->x86_power_limit.time_window;
  const uint8_t want_clamp = arg->x86_power_limit.clamp;
  const uint8_t want_enable = arg->x86_power_limit.enable;

  // The request is expressed in mW and us; |units| gives the size of one raw unit in each.
  const rapl_units units = GetUnits(msr);

  uint64_t limit_reg = msr->read_msr(X86_MSR_PKG_POWER_LIMIT);

  // Replace the power limit field.
  limit_reg &= ~BITMAP_LAST_WORD_MASK(15);
  if (requested_mw == 0) {
    // MSR_PKG_POWER_INFO is a read-only MSR that reports the package power range for RAPL. Its
    // Thermal Spec Power field [14:0] is used as the limit when the caller passes 0.
    limit_reg |= BITS_SHIFT(msr->read_msr(X86_MSR_PKG_POWER_INFO), 15, 0);
  } else {
    const uint64_t raw_limit = requested_mw / units.power_mw;
    if (raw_limit > kMaxLongTermPowerLimit) {
      return ZX_ERR_INVALID_ARGS;
    }
    limit_reg |= BITS(raw_limit, 15, 0);
  }

  // Replace the time window field (bits 23:17), starting from the default encoding.
  limit_reg &= ~0xFE0000;
  uint64_t window_field = kDefaultTimeWindow;
  if (requested_us != 0) {
    // Time limit = 2^Y * (1.0 + Z/4.0) * Time_Unit: Y is the floor of log2 of the window in raw
    // time units, Z is the number of additional quarters of 2^Y.
    const uint64_t ticks = requested_us / units.time_us;
    const uint64_t y_exp = log2_ulong_floor(ticks);
    const uint64_t z_quarters = (((4 * ticks)) / (1 << y_exp)) - 4;
    window_field = (y_exp & 0x1F) | ((z_quarters & 0x3) << 5);
  }
  limit_reg |= window_field << 17;

  limit_reg = want_clamp ? (limit_reg | kPowerLimitPl1Clamp) : (limit_reg & ~kPowerLimitPl1Clamp);
  limit_reg =
      want_enable ? (limit_reg | kPowerLimitPl1Enable) : (limit_reg & ~kPowerLimitPl1Enable);

  msr->write_msr(X86_MSR_PKG_POWER_LIMIT, limit_reg);
  return ZX_OK;
}

void print_limits() {
  MsrAccess msr;
  rapl_units units = GetUnits(&msr);

  uint64_t rapl = msr.read_msr(X86_MSR_PKG_POWER_LIMIT);

  // Based on Intel Software Manual vol 3, chapter 14.9,
  // Time limit = 2^Y * (1.0 + Z/4.0) * Time_Unit
  auto y = static_cast<uint32_t>(BITS_SHIFT(rapl, 21, 17));
  auto z = static_cast<uint32_t>(BITS_SHIFT(rapl, 23, 22));
  uint32_t time_window = (1 << y) * (4 + z) * units.time_us / 4;
  auto power_limit = static_cast<uint32_t>(BITS_SHIFT(rapl, 14, 0));

  printf("PL1 limit: %umW\n", power_limit * units.power_mw);
  printf("PL1 window: %uus\n", time_window);
  printf("PL1 %sabled, clamping %sabled\n", rapl & kPowerLimitPl1Enable ? "en" : "dis",
         rapl & kPowerLimitPl1Clamp ? "en" : "dis");

  // Repeat for PL2
  y = BITS_SHIFT(rapl, 53, 49);
  z = BITS_SHIFT(rapl, 55, 54);
  time_window = (1 << y) * (4 + z) * units.time_us / 4;
  power_limit = BITS_SHIFT(rapl, 46, 32);

  printf("PL2 limit: %umW\n", power_limit * units.power_mw);
  printf("PL2 window: %uus\n", time_window);
  printf("PL2 %sabled, clamping %sabled\n", rapl & kPowerLimitPl2Enable ? "en" : "dis",
         rapl & kPowerLimitPl2Clamp ? "en" : "dis");
}

void clear_limit_reason_log() {
  // Limit reason MSR is supported on Intel Core generations 6 through 11, Intel Xeon generations
  // 1 through 3, Intel Core i3 8th generation, and Intel Xeon E processors. See Intel Volume 4
  // Table 2-39.
  auto x86_microarch = x86_get_microarch_config()->x86_microarch;
  if ((x86_microarch != X86_MICROARCH_INTEL_SKYLAKE) &&
      (x86_microarch != X86_MICROARCH_INTEL_CANNONLAKE) &&
      (x86_microarch != X86_MICROARCH_INTEL_TIGERLAKE)) {
    printf("Limit reasons msr not supported\n");
    return;
  }

  // The limit reason log is stored in bits 29:16 and can be cleared by writing zeros.
  MsrAccess msr;
  msr.write_msr(X86_MSR_PERF_LIMIT_REASONS, 0);
  msr.write_msr(X86_MSR_GFX_PERF_LIMIT_REASONS, 0);
}

void print_limit_reasons(bool use_log) {
  // Limit reason MSR is supported on Intel Core generations 6 through 11, Intel Xeon generations
  // 1 through 3, Intel Core i3 8th generation, and Intel Xeon E processors. See Intel Volume 4
  // Table 2-39.
  auto x86_microarch = x86_get_microarch_config()->x86_microarch;
  if ((x86_microarch != X86_MICROARCH_INTEL_SKYLAKE) &&
      (x86_microarch != X86_MICROARCH_INTEL_CANNONLAKE) &&
      (x86_microarch != X86_MICROARCH_INTEL_TIGERLAKE)) {
    printf("Limit reasons msr not supported\n");
    return;
  }

  MsrAccess msr;
  uint64_t limit_reasons = msr.read_msr(X86_MSR_PERF_LIMIT_REASONS);

  // The log bits (29:16) are latched versions of the status bits (13:0). If we're printing the log
  // shift the register value down.
  if (use_log) {
    limit_reasons = limit_reasons >> kLimitReasonsLogShift;
  }

  bool is_limited = false;
  printf("perf limit reasons:\n");
  for (auto reason : kLimitReasons) {
    if (!(limit_reasons & reason.bit)) {
      continue;
    }
    printf("\t%s\n", reason.str);
    is_limited = true;
  }
  if (!is_limited) {
    printf("\tnone\n");
  }

  limit_reasons = msr.read_msr(X86_MSR_GFX_PERF_LIMIT_REASONS);
  if (use_log) {
    limit_reasons = limit_reasons >> kLimitReasonsLogShift;
  }

  printf("gfx perf limit reasons:\n");
  is_limited = false;
  for (auto reason : kLimitReasonsGfx) {
    if (!(limit_reasons & reason.bit)) {
      continue;
    }
    printf("\t%s\n", reason.str);
    is_limited = true;
  }
  if (!is_limited) {
    printf("\tnone\n");
  }
}

// Status callback toggled by the `power status` console command. On each invocation it prints,
// for the package, PP0 and PP1 RAPL domains, the energy consumed since the previous invocation
// and the running counter total, followed by the current performance limit reasons.
RecurringCallback g_status_callback([]() {
  // The RAPL energy status counters occupy bits 31:0 of their MSRs (the upper half is reserved)
  // and wrap around. Deltas are therefore taken modulo 2^32 so that a wrap between two samples
  // does not show up as an enormous energy value.
  constexpr uint64_t kEnergyStatusMask = 0xFFFFFFFFull;

  MsrAccess msr;
  rapl_units units = GetUnits(&msr);

  printf("energy consumed:\n");
  {
    static uint64_t last_energy_status = 0;

    const uint64_t energy_status = msr.read_msr(X86_MSR_PKG_ENERGY_STATUS) & kEnergyStatusMask;
    const uint64_t uj =
        ((energy_status - last_energy_status) & kEnergyStatusMask) * units.energy_uj;

    printf("\tpkg: %lu uJ (%lu J) (total: %lu uJ)\n", uj, uj / 1000000,
           energy_status * units.energy_uj);

    last_energy_status = energy_status;
  }

  {
    // PP0 usually is the core
    static uint64_t last_pp0_energy_status = 0;

    const uint64_t pp0_energy_status =
        msr.read_msr(X86_MSR_PP0_ENERGY_STATUS) & kEnergyStatusMask;
    const uint64_t uj =
        ((pp0_energy_status - last_pp0_energy_status) & kEnergyStatusMask) * units.energy_uj;

    printf("\tpp0: %lu uJ (%lu J) (total: %lu uJ)\n", uj, uj / 1000000,
           pp0_energy_status * units.energy_uj);

    last_pp0_energy_status = pp0_energy_status;
  }

  {
    // PP1 usually is graphics
    static uint64_t last_pp1_energy_status = 0;

    const uint64_t pp1_energy_status =
        msr.read_msr(X86_MSR_PP1_ENERGY_STATUS) & kEnergyStatusMask;
    const uint64_t uj =
        ((pp1_energy_status - last_pp1_energy_status) & kEnergyStatusMask) * units.energy_uj;

    printf("\tpp1: %lu uJ (%lu J) (total: %lu uJ)\n", uj, uj / 1000000,
           pp1_energy_status * units.energy_uj);

    last_pp1_energy_status = pp1_energy_status;
  }

  print_limit_reasons(/*use_log=*/false);
});

// Prints the list of `power` console subcommands together with a one-line description of each.
void print_command_usage() {
  struct SubcommandHelp {
    const char* cmd_str;
    const char* help_str;
  };
  static const SubcommandHelp kHelpTable[] = {
      {"status", "toggle status display"},
      {"limitreason clear", "clear the cpu limit reason log"},
      {"limitreason log", "print all cpu limit reasons since last clear"},
      {"limits", "print package power limits"},
  };

  printf("usage:\n");
  for (const SubcommandHelp& entry : kHelpTable) {
    printf("\tpower %-32s: %s\n", entry.cmd_str, entry.help_str);
  }
}

// Entry point of the thread that carries out suspend/resume.
//
// The work runs on a dedicated thread instead of the invoking one so that the context switch code
// path persists all of the usermode thread state that a plain mode switch does not save.
//
// |raw_arg| points at the caller's zx_system_powerctl_arg_t; the return value is whatever
// PlatformSuspend() reports.
zx_status_t suspend_thread(void* raw_arg) {
  const auto* powerctl_arg = reinterpret_cast<const zx_system_powerctl_arg_t*>(raw_arg);
  const auto& transition = powerctl_arg->acpi_transition_s_state;

  const uint8_t s_state = transition.target_s_state;
  const uint8_t slp_typ_a = transition.sleep_type_a;
  const uint8_t slp_typ_b = transition.sleep_type_b;

  return PlatformSuspend(s_state, slp_typ_a, slp_typ_b);
}

// Validates the requested ACPI S-state transition in |arg| and, for S3, performs the suspend on
// a dedicated thread and waits for it to finish.
//
// Returns ZX_ERR_INVALID_ARGS for a state outside S1..S5, ZX_ERR_BAD_STATE if a non-shutdown
// state is requested while any CPU other than CPU 0 is online, ZX_ERR_NOT_SUPPORTED for any
// state other than S3, ZX_ERR_NO_MEMORY if the suspend thread cannot be created, and otherwise
// the suspend thread's own return code.
zx_status_t acpi_transition_s_state(const zx_system_powerctl_arg_t* arg) {
  const uint8_t s_state = arg->acpi_transition_s_state.target_s_state;

  const bool in_range = (s_state >= 1) && (s_state <= 5);
  if (!in_range) {
    TRACEF("Bad S-state: S%u\n", s_state);
    return ZX_ERR_INVALID_ARGS;
  }

  // Everything except a shutdown (S5) requires CPU 0 to be the only cpu left running.
  const bool is_shutdown = (s_state == 5);
  if (!is_shutdown && mp_get_online_mask() != cpu_num_to_mask(0)) {
    TRACEF("Too many CPUs running for state S%u\n", s_state);
    return ZX_ERR_BAD_STATE;
  }

  // Currently only transitioning to the S3 state is supported.
  if (s_state != 3) {
    return ZX_ERR_NOT_SUPPORTED;
  }

  // Prepare a resume path and execute the suspend on a separate thread (see comment on
  // |suspend_thread()| for explanation).
  Thread* suspender = Thread::Create("suspend-thread", suspend_thread,
                                     const_cast<zx_system_powerctl_arg_t*>(arg), HIGHEST_PRIORITY);
  if (suspender == nullptr) {
    return ZX_ERR_NO_MEMORY;
  }

  suspender->Resume();

  // Block until the suspend thread has finished and hand its result back to the caller.
  zx_status_t retcode;
  zx_status_t status = suspender->Join(&retcode, ZX_TIME_INFINITE);
  ASSERT(status == ZX_OK);

  return retcode;
}

}  // namespace

// x86 dispatcher for the power-control commands handled in this file.
//
// ZX_SYSTEM_POWERCTL_X86_SET_PKG_PL1 is forwarded to SetPkgPl1() using |msr| for register access.
// ZX_SYSTEM_POWERCTL_ACPI_TRANSITION_S_STATE is forwarded to acpi_transition_s_state(), but only
// when the x86_enable_suspend boot option is set. Every other case yields ZX_ERR_NOT_SUPPORTED.
zx_status_t arch_system_powerctl(uint32_t cmd, const zx_system_powerctl_arg_t* arg,
                                 MsrAccess* msr) {
  if (cmd == ZX_SYSTEM_POWERCTL_X86_SET_PKG_PL1) {
    return SetPkgPl1(arg, msr);
  }

  if (cmd == ZX_SYSTEM_POWERCTL_ACPI_TRANSITION_S_STATE && gBootOptions->x86_enable_suspend) {
    return acpi_transition_s_state(arg);
  }

  return ZX_ERR_NOT_SUPPORTED;
}

// Handler for the `power` console command.
//
// Recognized forms are `power status`, `power limits`, `power limitreason log` and
// `power limitreason clear`. Anything else prints the usage text and returns
// ZX_ERR_INVALID_ARGS. |flags| is not used.
static zx_status_t cmd_power(int argc, const cmd_args* argv, uint32_t flags) {
  const auto usage_error = []() -> zx_status_t {
    print_command_usage();
    return ZX_ERR_INVALID_ARGS;
  };

  if (argc < 2) {
    return usage_error();
  }

  const char* subcommand = argv[1].str;

  if (strcmp(subcommand, "status") == 0) {
    g_status_callback.Toggle();
    return ZX_OK;
  }

  if (strcmp(subcommand, "limits") == 0) {
    print_limits();
    return ZX_OK;
  }

  // `limitreason` needs a further action word; without a recognized one fall through to usage.
  if (strcmp(subcommand, "limitreason") == 0 && argc >= 3) {
    const char* action = argv[2].str;
    if (strcmp(action, "log") == 0) {
      print_limit_reasons(/*use_log=*/true);
      return ZX_OK;
    }
    if (strcmp(action, "clear") == 0) {
      clear_limit_reason_log();
      return ZX_OK;
    }
  }

  return usage_error();
}

// Registers the "power" console command, with cmd_power() as its handler.
STATIC_COMMAND_START
STATIC_COMMAND("power", "power limiting debug commands (for x86 only)", &cmd_power)
STATIC_COMMAND_END(cpu)
