// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <bits.h>
#include <lib/affine/ratio.h>
#include <lib/arch/cache.h>
#include <platform.h>
#include <trace.h>
#include <zircon/syscalls/hypervisor.h>
#include <zircon/syscalls/port.h>
#include <arch/arm64/hypervisor/el2_state.h>
#include <arch/hypervisor.h>
#include <dev/interrupt/arm_gic_hw_interface.h>
#include <dev/psci.h>
#include <dev/timer/arm_generic.h>
#include <hypervisor/ktrace.h>
#include <kernel/percpu.h>
#include <kernel/stats.h>
#include <vm/fault.h>
#include <vm/physmap.h>
#include "vmexit_priv.h"
#define LOCAL_TRACE 0
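// Writes |reg| to the named guest system-state field, logs the new value,
// advances the guest PC past the trapped instruction, and evaluates to ZX_OK.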
#define SET_SYSREG(sysreg) \
({ \
guest_state->system_state.sysreg = reg; \
LTRACEF("guest " #sysreg ": %#lx\n", guest_state->system_state.sysreg); \
next_pc(guest_state); \
ZX_OK; \
})
namespace {
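// log2(sizeof(pte_t)); each page-table level resolves
// MMU_GUEST_PAGE_SIZE_SHIFT - kPageTableLevelShift bits of address.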
constexpr size_t kPageTableLevelShift = 3;
constexpr uint32_t kPsciMajorVersion = 0;
constexpr uint32_t kPsciMinorVersion = 2;
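// PSCI calls are made with an SMC immediate of zero, per the SMC Calling
// Convention.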
constexpr uint16_t kSmcPsci = 0;
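// Bits of the guest's virtual timer control register, CNTV_CTL_EL0.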
enum TimerControl : uint32_t {
ENABLE = 1u << 0,
IMASK = 1u << 1,
ISTATUS = 1u << 2,
};
// Note: This function assumes that the timer being used by the host is the
// virtual view of the ARM system timer, or equivalent (e.g., the physical
// timer with CNTVOFF_EL2 set to zero). This is _currently_ true, as the
// Fuchsia EL2 code seems to always set CNTVOFF_EL2 to zero and then leave it
// there. If this ever changes, this code will need to be updated to account
// for the difference between the physical and virtual views of the system
// timer.
zx_ticks_t convert_raw_ticks_to_ticks(zx_ticks_t raw_ticks) {
return raw_ticks + platform_get_raw_ticks_to_ticks_offset();
}
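// Advances the guest PC past the 4-byte instruction that trapped.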
void next_pc(GuestState* guest_state) { guest_state->system_state.elr_el2 += 4; }
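// Returns true if the guest's virtual timer is enabled and its interrupt is
// unmasked.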
bool timer_enabled(GuestState* guest_state) {
bool enabled = guest_state->cntv_ctl_el0 & TimerControl::ENABLE;
bool masked = guest_state->cntv_ctl_el0 & TimerControl::IMASK;
return enabled && !masked;
}
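// Handles a guest WFI/WFE trap. WFE exits back to the guest immediately; WFI
// blocks the VCPU until an interrupt is pending or, if the guest's virtual
// timer is armed, until the timer deadline passes.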
zx_status_t handle_wfi_wfe_instruction(uint32_t iss, GuestState* guest_state,
GichState* gich_state) {
next_pc(guest_state);
const WaitInstruction wi(iss);
if (wi.is_wfe) {
ktrace_vcpu_exit(VCPU_WFE_INSTRUCTION, guest_state->system_state.elr_el2);
return ZX_OK;
}
ktrace_vcpu_exit(VCPU_WFI_INSTRUCTION, guest_state->system_state.elr_el2);
// If a list register is in use, then we have an active interrupt.
if (gich_state->IsUsingListRegister()) {
return ZX_OK;
}
zx_time_t deadline = ZX_TIME_INFINITE;
if (timer_enabled(guest_state)) {
zx_ticks_t guest_ticks_deadline = convert_raw_ticks_to_ticks(guest_state->cntv_cval_el0);
if (current_ticks() >= guest_ticks_deadline) {
return ZX_OK;
}
deadline = platform_get_ticks_to_time_ratio().Scale(guest_ticks_deadline);
}
return gich_state->Wait(deadline);
}
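// Handles a guest SMC trap. Only the PSCI subset below is implemented:
// unknown PSCI function IDs get PSCI_NOT_SUPPORTED, and non-PSCI SMCs get the
// Unknown SMC Function Identifier (-1).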
zx_status_t handle_smc_instruction(uint32_t iss, GuestState* guest_state,
zx_port_packet_t* packet) {
const SmcInstruction si(iss);
if (si.imm != kSmcPsci) {
dprintf(CRITICAL, "hypervisor: Unhandled guest SMC instruction %#lx\n", guest_state->x[0]);
// From ARM DEN 0028B, Section 5.2: The Unknown SMC Function Identifier is a sign-extended
// value of (-1) that is returned in R0, W0 or X0 register.
guest_state->x[0] = ~0ul;
next_pc(guest_state);
return ZX_OK;
}
next_pc(guest_state);
switch (guest_state->x[0]) {
case PSCI64_PSCI_VERSION:
// See ARM PSCI Platform Design Document, Section 5.1.1.
guest_state->x[0] = (kPsciMajorVersion << 16) | kPsciMinorVersion;
return ZX_OK;
case PSCI64_CPU_ON:
memset(packet, 0, sizeof(*packet));
packet->type = ZX_PKT_TYPE_GUEST_VCPU;
packet->guest_vcpu.type = ZX_PKT_GUEST_VCPU_STARTUP;
packet->guest_vcpu.startup.id = guest_state->x[1];
packet->guest_vcpu.startup.entry = guest_state->x[2];
guest_state->x[0] = PSCI_SUCCESS;
return ZX_ERR_NEXT;
case PSCI64_CPU_OFF:
return ZX_ERR_STOP;
case PSCI64_SYSTEM_OFF:
return ZX_ERR_UNAVAILABLE;
default:
dprintf(CRITICAL, "hypervisor: Unhandled guest SMC PSCI instruction %#lx\n",
guest_state->x[0]);
guest_state->x[0] = PSCI_NOT_SUPPORTED;
return ZX_OK;
}
}
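// Recursively walks the stage-2 page table rooted at |table|, cleaning and
// invalidating the data cache for every mapped page or block, then
// invalidates the instruction cache.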
void clean_invalidate_cache(zx_paddr_t table, size_t index_shift) {
// TODO(abdulla): Make this understand concatenated page tables.
auto* pte = static_cast<pte_t*>(paddr_to_physmap(table));
pte_t page = index_shift > MMU_GUEST_PAGE_SIZE_SHIFT ? MMU_PTE_L012_DESCRIPTOR_BLOCK
: MMU_PTE_L3_DESCRIPTOR_PAGE;
for (size_t i = 0; i < PAGE_SIZE / sizeof(pte_t); i++) {
pte_t desc = pte[i] & MMU_PTE_DESCRIPTOR_MASK;
pte_t paddr = pte[i] & MMU_PTE_OUTPUT_ADDR_MASK;
if (desc == page) {
zx_vaddr_t vaddr = reinterpret_cast<zx_vaddr_t>(paddr_to_physmap(paddr));
arch_clean_invalidate_cache_range(vaddr, 1lu << index_shift);
} else if (desc != MMU_PTE_DESCRIPTOR_INVALID) {
size_t adjust_shift = MMU_GUEST_PAGE_SIZE_SHIFT - kPageTableLevelShift;
clean_invalidate_cache(paddr, index_shift - adjust_shift);
}
}
// Invalidate the guest i-cache.
arch::InvalidateGlobalInstructionCache();
}
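// Handles a trapped guest system-register access (MSR/MRS) or cache
// maintenance by set/way operation; may also adjust the HCR_EL2 trap bits in
// |hcr|.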
zx_status_t handle_system_instruction(uint32_t iss, uint64_t& hcr, GuestState* guest_state,
hypervisor::GuestPhysicalAddressSpace* gpas,
zx_port_packet_t* packet) {
const SystemInstruction si(iss);
const uint64_t reg = guest_state->x[si.xt];
switch (si.sysreg) {
case SystemRegister::MAIR_EL1:
return SET_SYSREG(mair_el1);
case SystemRegister::SCTLR_EL1: {
if (si.read) {
return ZX_ERR_NOT_SUPPORTED;
}
uint32_t sctlr_el1 = reg & UINT32_MAX;
// If the MMU is being enabled and caches are on, invalidate the caches.
//
// At this point the guest may reasonably assume that the caches are
// clear, but accesses by the host (either directly or even by just
// a speculative CPU load) may have led to them containing data. If this
// has happened, a guest's write to raw memory may be hidden by a stale
// cache entry.
//
// Invalidating the caches removes all stale data from cache. It's not
// a problem if a cache line is brought back into the cache after we
// invalidate: it will correctly contain the guest's data.
bool mmu_enabled = (sctlr_el1 & SCTLR_ELX_M) != 0;
bool dcaches_enabled = (sctlr_el1 & SCTLR_ELX_C) != 0;
if (mmu_enabled && dcaches_enabled) {
// Clean/invalidate the pages. We don't strictly need the clean, but it
// doesn't hurt.
clean_invalidate_cache(gpas->arch_table_phys(), MMU_GUEST_TOP_SHIFT);
// Stop trapping MMU register accesses to improve performance.
//
// We'll start monitoring again if the guest does a set/way cache
// operation.
hcr &= ~HCR_EL2_TVM;
}
LTRACEF("guest sctlr_el1: %#x\n", sctlr_el1);
LTRACEF("guest hcr_el2: %#lx\n", hcr);
guest_state->system_state.sctlr_el1 = sctlr_el1;
next_pc(guest_state);
return ZX_OK;
}
case SystemRegister::TCR_EL1:
return SET_SYSREG(tcr_el1);
case SystemRegister::TTBR0_EL1:
return SET_SYSREG(ttbr0_el1);
case SystemRegister::TTBR1_EL1:
return SET_SYSREG(ttbr1_el1);
case SystemRegister::OSLAR_EL1:
case SystemRegister::OSLSR_EL1:
case SystemRegister::OSDLR_EL1:
case SystemRegister::DBGPRCR_EL1:
next_pc(guest_state);
// These registers are RAZ/WI. Their state is dictated by the host.
if (si.read) {
guest_state->x[si.xt] = 0;
}
return ZX_OK;
case SystemRegister::ICC_SGI1R_EL1: {
if (si.read) {
// ICC_SGI1R_EL1 is write-only.
return ZX_ERR_INVALID_ARGS;
}
SgiRegister sgi(reg);
if (sgi.aff3 != 0 || sgi.aff2 != 0 || sgi.aff1 != 0 || sgi.rs != 0) {
return ZX_ERR_NOT_SUPPORTED;
}
memset(packet, 0, sizeof(*packet));
packet->type = ZX_PKT_TYPE_GUEST_VCPU;
packet->guest_vcpu.type = ZX_PKT_GUEST_VCPU_INTERRUPT;
if (sgi.all_but_local) {
auto vpid = BITS(guest_state->vmpidr_el2, 16, 0);
packet->guest_vcpu.interrupt.mask = ~(1ul << vpid);
} else {
packet->guest_vcpu.interrupt.mask = sgi.target_list;
}
packet->guest_vcpu.interrupt.vector = sgi.int_id;
next_pc(guest_state);
return ZX_ERR_NEXT;
}
case SystemRegister::DC_ISW:
case SystemRegister::DC_CISW:
case SystemRegister::DC_CSW: {
// Clean and invalidate the cache.
//
// The guest will typically need to iterate over a large number of
// sets/ways to do a full clean/invalidate. To avoid doing several full
// cache cleans in a row, we only do a cache operation when the guest is
// operating on set/way 0.
//
// The guest can't know the mapping between set/way and physical memory,
// so it is required to iterate through every set/way. If the guest
// doesn't do this, it shouldn't be surprised if not everything has been
// cleaned.
uint64_t set_way = BITS_SHIFT(reg, 31, 4);
if (set_way == 0) {
clean_invalidate_cache(gpas->arch_table_phys(), MMU_GUEST_TOP_SHIFT);
}
// If the MMU or caches are off, start monitoring guest SCTLR register
// accesses so we can determine when the MMU/caches are turned on again.
//
// When the MMU or caches are turned off and the guest has just cleared
// caches, the guest can reasonably assume that the caches will remain
// clear, and that they won't need to invalidate them again prior to the
// MMU being turned on again.
//
// We (the host) can't guarantee that we won't inadvertently cause the
// cache lines to load again (e.g., through speculative CPU accesses).
// Instead, we start monitoring for when the guest turns on the MMU again,
// and clean/invalidate caches then. This ensures that any writes done by
// the guest while caches are disabled won't be hidden by stale cache
// lines.
uint32_t sctlr_el1 = guest_state->system_state.sctlr_el1;
bool mmu_enabled = (sctlr_el1 & SCTLR_ELX_M) != 0;
bool dcaches_enabled = (sctlr_el1 & SCTLR_ELX_C) != 0;
if (!mmu_enabled || !dcaches_enabled) {
hcr |= HCR_EL2_TVM;
}
next_pc(guest_state);
return ZX_OK;
}
default:
dprintf(CRITICAL, "hypervisor: Unhandled guest system register %#x access\n",
static_cast<uint16_t>(si.sysreg));
return ZX_ERR_NOT_SUPPORTED;
}
}
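// Handles a guest instruction abort by faulting in the page at the faulting
// guest-physical address.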
zx_status_t handle_instruction_abort(GuestState* guest_state,
hypervisor::GuestPhysicalAddressSpace* gpas) {
const zx_vaddr_t guest_paddr = guest_state->hpfar_el2;
if (auto result = gpas->PageFault(guest_paddr); result.is_error()) {
dprintf(CRITICAL, "hypervisor: Unhandled guest instruction abort %#lx\n", guest_paddr);
return result.status_value();
}
return ZX_OK;
}
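// Handles a guest data abort: either resolves it as a stage-2 page fault, or
// turns it into a bell or memory packet for a registered trap.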
zx_status_t handle_data_abort(uint32_t iss, GuestState* guest_state,
hypervisor::GuestPhysicalAddressSpace* gpas,
hypervisor::TrapMap* traps, zx_port_packet_t* packet) {
zx_vaddr_t guest_paddr = guest_state->hpfar_el2;
zx::status<hypervisor::Trap*> trap = traps->FindTrap(ZX_GUEST_TRAP_BELL, guest_paddr);
switch (trap.status_value()) {
case ZX_ERR_NOT_FOUND:
if (auto result = gpas->PageFault(guest_paddr); result.is_error()) {
dprintf(CRITICAL, "hypervisor: Unhandled guest data abort %#lx\n", guest_paddr);
return result.status_value();
}
return ZX_OK;
case ZX_OK:
break;
default:
return trap.status_value();
}
next_pc(guest_state);
// Combine the lower bits of FAR_EL2 with HPFAR_EL2 to get the exact IPA.
guest_paddr |= guest_state->far_el2 & (PAGE_SIZE - 1);
LTRACEF("guest far_el2: %#lx\n", guest_state->far_el2);
const DataAbort data_abort(iss);
switch ((*trap)->kind()) {
case ZX_GUEST_TRAP_BELL:
if (data_abort.read) {
return ZX_ERR_NOT_SUPPORTED;
}
packet->key = (*trap)->key();
packet->type = ZX_PKT_TYPE_GUEST_BELL;
packet->guest_bell.addr = guest_paddr;
if (!(*trap)->HasPort()) {
return ZX_ERR_BAD_STATE;
}
return (*trap)->Queue(*packet).status_value();
case ZX_GUEST_TRAP_MEM:
if (!data_abort.valid) {
return ZX_ERR_IO_DATA_INTEGRITY;
}
packet->key = (*trap)->key();
packet->type = ZX_PKT_TYPE_GUEST_MEM;
packet->guest_mem.addr = guest_paddr;
packet->guest_mem.access_size = data_abort.access_size;
packet->guest_mem.sign_extend = data_abort.sign_extend;
packet->guest_mem.xt = data_abort.xt;
packet->guest_mem.read = data_abort.read;
if (!data_abort.read) {
packet->guest_mem.data = guest_state->x[data_abort.xt];
}
return ZX_ERR_NEXT;
default:
return ZX_ERR_BAD_STATE;
}
}
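// Maps an SError AET (error type) field to a human-readable name.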
std::string_view ErrorTypeToString(SError::ErrorType type) {
switch (type) {
case SError::ErrorType::kUncontainable:
return "Uncontainable";
case SError::ErrorType::kUnrecoverableState:
return "Unrecoverable State";
case SError::ErrorType::kRestartableState:
return "Restartable State";
case SError::ErrorType::kRecoverableState:
return "Recoverable State";
case SError::ErrorType::kCorrected:
return "Corrected";
default:
return "Unknown";
}
}
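// Maps an SError DFSC (data fault status code) field to a human-readable
// name.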
std::string_view DataFaultStatusCodeToString(SError::DataFaultStatusCode code) {
switch (code) {
case SError::DataFaultStatusCode::kUncategorized:
return "Uncategorized";
case SError::DataFaultStatusCode::kAsyncSError:
return "Async SError";
default:
return "Unknown";
}
}
zx_status_t handle_serror_interrupt(GuestState* guest_state, uint32_t iss) {
// We received a system error (SError) exception.
//
// This isn't necessarily the guest's fault. It might be that the host
// (kernel or userspace) triggered the SError, but it wasn't reported until
// the guest happened to be running.
//
// Print out a log and continue.
const SError serror(iss);
std::string_view aet_string = ErrorTypeToString(serror.aet());
std::string_view dfsc_string = DataFaultStatusCodeToString(serror.dfsc());
dprintf(CRITICAL,
"hypervisor: Received SError while running guest. Ignoring. "
"(Guest at EL%u, PC=%#lx. "
"CPU: %u, Syndrome: ISS=%#x [IDS=%u; IESB=%u; AET=%#x (%*s); EA=%u; DFSC=%#x (%*s)])\n",
guest_state->el(), guest_state->system_state.elr_el2, arch_curr_cpu_num(), serror.iss,
serror.ids(), serror.iesb(), static_cast<uint32_t>(serror.aet()),
static_cast<int>(aet_string.size()), aet_string.data(), serror.ea(),
static_cast<uint32_t>(serror.dfsc()), static_cast<int>(dfsc_string.size()),
dfsc_string.data());
return ZX_OK;
}
} // namespace
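// Decodes ESR_EL2: the exception class (EC) is in bits [31:26] and the
// instruction-specific syndrome (ISS) in bits [24:0].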
ExceptionSyndrome::ExceptionSyndrome(uint32_t esr) {
ec = static_cast<ExceptionClass>(BITS_SHIFT(esr, 31, 26));
iss = BITS(esr, 24, 0);
}
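// For WFx traps, ISS bit 0 is 0 for WFI and 1 for WFE.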
WaitInstruction::WaitInstruction(uint32_t iss) { is_wfe = BIT(iss, 0); }
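// For SMC traps, ISS bits [15:0] hold the instruction's imm16.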
SmcInstruction::SmcInstruction(uint32_t iss) { imm = static_cast<uint16_t>(BITS(iss, 15, 0)); }
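// For MSR/MRS traps, pack the Op0, Op2, Op1, CRn, and CRm fields (ISS bits
// [21:10] and [4:1]) into a single SystemRegister value; Rt is in bits [9:5]
// and bit 0 gives the access direction (1 = read).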
SystemInstruction::SystemInstruction(uint32_t iss) {
sysreg = static_cast<SystemRegister>(BITS(iss, 21, 10) >> 6 | BITS_SHIFT(iss, 4, 1));
xt = static_cast<uint8_t>(BITS_SHIFT(iss, 9, 5));
read = BIT(iss, 0);
}
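// Decodes a write to ICC_SGI1R_EL1 per the GICv3 architecture.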
SgiRegister::SgiRegister(uint64_t sgir) {
aff3 = static_cast<uint8_t>(BITS_SHIFT(sgir, 55, 48));
aff2 = static_cast<uint8_t>(BITS_SHIFT(sgir, 39, 32));
aff1 = static_cast<uint8_t>(BITS_SHIFT(sgir, 23, 16));
rs = static_cast<uint8_t>(BITS_SHIFT(sgir, 47, 44));
target_list = static_cast<uint8_t>(BITS_SHIFT(sgir, 15, 0));
int_id = static_cast<uint8_t>(BITS_SHIFT(sgir, 27, 24));
all_but_local = BIT(sgir, 40);
}
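// For data aborts, ISS bit 24 is ISV (syndrome valid), bits [23:22] give the
// access size, bit 21 is sign extension, bits [20:16] give Rt, and bit 6 is
// WnR (0 = read).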
DataAbort::DataAbort(uint32_t iss) {
valid = BIT_SHIFT(iss, 24);
access_size = static_cast<uint8_t>(1u << BITS_SHIFT(iss, 23, 22));
sign_extend = BIT(iss, 21);
xt = static_cast<uint8_t>(BITS_SHIFT(iss, 20, 16));
read = !BIT(iss, 6);
}
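// If the guest's virtual timer is enabled and its deadline has passed, track
// a physical timer interrupt for injection into the guest.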
void timer_maybe_interrupt(GuestState* guest_state, GichState* gich_state) {
if (timer_enabled(guest_state)) {
zx_ticks_t guest_ticks_deadline = convert_raw_ticks_to_ticks(guest_state->cntv_cval_el0);
if (current_ticks() >= guest_ticks_deadline) {
gich_state->Track(kTimerVector, hypervisor::InterruptType::PHYSICAL);
}
}
}
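// Top-level VM exit dispatcher: decodes ESR_EL2 and routes each exception
// class to its handler.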
zx_status_t vmexit_handler(uint64_t* hcr, GuestState* guest_state, GichState* gich_state,
hypervisor::GuestPhysicalAddressSpace* gpas, hypervisor::TrapMap* traps,
zx_port_packet_t* packet) {
LTRACEF("guest esr_el1: %#x\n", guest_state->system_state.esr_el1);
LTRACEF("guest esr_el2: %#x\n", guest_state->esr_el2);
LTRACEF("guest elr_el2: %#lx\n", guest_state->system_state.elr_el2);
LTRACEF("guest spsr_el2: %#x\n", guest_state->system_state.spsr_el2);
ExceptionSyndrome syndrome(guest_state->esr_el2);
zx_status_t status;
switch (syndrome.ec) {
case ExceptionClass::WFI_WFE_INSTRUCTION:
LTRACEF("handling wfi/wfe instruction, iss %#x\n", syndrome.iss);
GUEST_STATS_INC(wfi_wfe_instructions);
status = handle_wfi_wfe_instruction(syndrome.iss, guest_state, gich_state);
break;
case ExceptionClass::SMC_INSTRUCTION:
LTRACEF("handling smc instruction, iss %#x func %#lx\n", syndrome.iss, guest_state->x[0]);
GUEST_STATS_INC(smc_instructions);
ktrace_vcpu_exit(VCPU_SMC_INSTRUCTION, guest_state->system_state.elr_el2);
status = handle_smc_instruction(syndrome.iss, guest_state, packet);
break;
case ExceptionClass::SYSTEM_INSTRUCTION:
LTRACEF("handling system instruction\n");
GUEST_STATS_INC(system_instructions);
ktrace_vcpu_exit(VCPU_SYSTEM_INSTRUCTION, guest_state->system_state.elr_el2);
status = handle_system_instruction(syndrome.iss, *hcr, guest_state, gpas, packet);
break;
case ExceptionClass::INSTRUCTION_ABORT:
LTRACEF("handling instruction abort at %#lx\n", guest_state->hpfar_el2);
GUEST_STATS_INC(instruction_aborts);
ktrace_vcpu_exit(VCPU_INSTRUCTION_ABORT, guest_state->system_state.elr_el2);
status = handle_instruction_abort(guest_state, gpas);
break;
case ExceptionClass::DATA_ABORT:
LTRACEF("handling data abort at %#lx\n", guest_state->hpfar_el2);
GUEST_STATS_INC(data_aborts);
ktrace_vcpu_exit(VCPU_DATA_ABORT, guest_state->system_state.elr_el2);
status = handle_data_abort(syndrome.iss, guest_state, gpas, traps, packet);
break;
case ExceptionClass::SERROR_INTERRUPT:
LTRACEF("handling serror interrupt at %#lx\n", guest_state->hpfar_el2);
ktrace_vcpu_exit(VCPU_SERROR_INTERRUPT, guest_state->system_state.elr_el2);
status = handle_serror_interrupt(guest_state, syndrome.iss);
break;
default:
LTRACEF("unhandled exception syndrome, ec %#x iss %#x\n", static_cast<uint32_t>(syndrome.ec),
syndrome.iss);
ktrace_vcpu_exit(VCPU_NOT_SUPPORTED, guest_state->system_state.elr_el2);
status = ZX_ERR_NOT_SUPPORTED;
break;
}
switch (status) {
case ZX_OK:
case ZX_ERR_NEXT:
case ZX_ERR_STOP:
case ZX_ERR_UNAVAILABLE:
case ZX_ERR_INTERNAL_INTR_RETRY:
case ZX_ERR_INTERNAL_INTR_KILLED:
break;
default:
dprintf(CRITICAL, "hypervisor: VM exit handler for %u (%s) in EL%u at %#lx returned %d\n",
static_cast<uint32_t>(syndrome.ec), exception_class_name(syndrome.ec),
guest_state->el(), guest_state->system_state.elr_el2, status);
break;
}
return status;
}