blob: 05de55849fe9a1a7cf65339e3e38babe6c31f528 [file]
// Copyright 2025 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
#ifndef EXECUTOR_COMMON_KVM_AMD64_SYZOS_H
#define EXECUTOR_COMMON_KVM_AMD64_SYZOS_H
// This file provides guest code running inside the AMD64 KVM.
#include <linux/kvm.h>
#include <stdbool.h>
#include "common_kvm_syzos.h"
#include "kvm.h"
// Identifiers of the SYZOS API commands that guest_main() dispatches on.
// There are no particular rules to assign numbers here, but changing them will
// result in losing some existing reproducers. Therefore, we try to leave spaces
// between unrelated IDs.
// Remember these constants must match those in sys/linux/dev_kvm_amd64.txt.
typedef enum {
SYZOS_API_UEXIT = 0,
SYZOS_API_CODE = 10,
SYZOS_API_CPUID = 100,
SYZOS_API_WRMSR = 101,
SYZOS_API_RDMSR = 102,
SYZOS_API_WR_CRN = 103,
SYZOS_API_WR_DRN = 104,
SYZOS_API_IN_DX = 105,
SYZOS_API_OUT_DX = 106,
SYZOS_API_SET_IRQ_HANDLER = 200,
SYZOS_API_ENABLE_NESTED = 300,
SYZOS_API_NESTED_CREATE_VM = 301,
SYZOS_API_NESTED_LOAD_CODE = 302,
SYZOS_API_NESTED_VMLAUNCH = 303,
SYZOS_API_NESTED_VMRESUME = 304,
SYZOS_API_NESTED_LOAD_SYZOS = 310,
SYZOS_API_NESTED_INTEL_VMWRITE_MASK = 340,
SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK = 380,
SYZOS_API_NESTED_AMD_INVLPGA = 381,
SYZOS_API_NESTED_AMD_STGI = 382,
SYZOS_API_NESTED_AMD_CLGI = 383,
SYZOS_API_NESTED_AMD_INJECT_EVENT = 384,
SYZOS_API_NESTED_AMD_SET_INTERCEPT = 385,
SYZOS_API_NESTED_AMD_VMLOAD = 386,
SYZOS_API_NESTED_AMD_VMSAVE = 387,
// guest_main() rejects every call ID that is >= SYZOS_API_STOP.
SYZOS_API_STOP, // Must be the last one
} syzos_api_id;
// SYZOS_API_UEXIT command: guest_main() passes exit_code to guest_uexit().
struct api_call_uexit {
struct api_call_header header;
uint64 exit_code;
};
// SYZOS_API_CODE command: the header is followed by a blob of machine instructions
// that guest_main() hands over to guest_execute_code().
struct api_call_code {
struct api_call_header header;
// Variable-length payload; its length is the command size minus the header size.
uint8 insns[];
};
// SYZOS_API_NESTED_LOAD_CODE command, consumed by guest_handle_nested_load_code().
struct api_call_nested_load_code {
struct api_call_header header;
// Presumably the index of the nested VM that receives the code - confirm against the handler.
uint64 vm_id;
// Variable-length instruction blob following the fixed fields.
uint8 insns[];
};
// SYZOS_API_NESTED_LOAD_SYZOS command, consumed by guest_handle_nested_load_syzos().
struct api_call_nested_load_syzos {
struct api_call_header header;
// Presumably the index of the nested VM to load SYZOS into - confirm against the handler.
uint64 vm_id;
// NOTE(review): looks like the page count forwarded to setup_l2_page_tables() - confirm.
uint64 unused_pages;
// Variable-length program following the fixed fields.
uint8 program[];
};
// SYZOS_API_CPUID command: guest_main() passes eax and ecx to guest_handle_cpuid(),
// which loads them into EAX and ECX before executing CPUID.
struct api_call_cpuid {
struct api_call_header header;
uint32 eax;
uint32 ecx;
};
// General-purpose registers of an L2 guest as saved on VM exit.
// This struct must match the push/pop order in nested_vm_exit_handler_intel_asm().
// That handler pushes R15 first and RAX last, so RAX lands at offset 0.
struct l2_guest_regs {
uint64 rax, rbx, rcx, rdx, rsi, rdi, rbp;
uint64 r8, r9, r10, r11, r12, r13, r14, r15;
};
// Flags for mem_region
// Note that bit 4 is not assigned to any flag here.
#define MEM_REGION_FLAG_USER_CODE (1 << 0)
#define MEM_REGION_FLAG_DIRTY_LOG (1 << 1)
#define MEM_REGION_FLAG_READONLY (1 << 2)
#define MEM_REGION_FLAG_EXECUTOR_CODE (1 << 3)
#define MEM_REGION_FLAG_GPA0 (1 << 5)
// Regions with this flag are skipped by setup_l2_page_tables(), so L2 accesses to them fault.
#define MEM_REGION_FLAG_NO_HOST_MEM (1 << 6)
// For regions with this flag setup_l2_page_tables() overrides the page count.
#define MEM_REGION_FLAG_REMAINING (1 << 7)
// Description of one guest memory region, as found in syzos_boot_args.
struct mem_region {
// Guest physical address of the first page.
uint64 gpa;
// Number of pages in the region (multiplied by KVM_PAGE_SIZE by the users).
int pages;
// Combination of MEM_REGION_FLAG_* bits.
uint32 flags;
};
// Memory layout description that the guest reads from X86_SYZOS_ADDR_BOOT_ARGS.
struct syzos_boot_args {
// Number of valid entries in regions[].
uint32 region_count;
uint32 reserved;
struct mem_region regions[];
};
// Mutable guest state located at X86_SYZOS_ADDR_GLOBALS.
struct syzos_globals {
// Bump-allocator cursor: byte offset of the next free page, advanced by guest_alloc_page().
uint64 alloc_offset;
// Size in bytes of the allocator's region; lazily initialized by guest_alloc_page().
uint64 total_size;
// Per-vCPU size in bytes of the command stream processed by guest_main().
uint64 text_sizes[KVM_MAX_VCPU];
// Saved L2 registers, indexed by vCPU and nested VM; written by the nested VM exit handler.
struct l2_guest_regs l2_ctx[KVM_MAX_VCPU][KVM_MAX_L2_VMS];
// Per-vCPU ID of the nested VM whose exit is being handled (read by the exit handler).
uint64 active_vm_id[KVM_MAX_VCPU];
};
// Forward declarations of the guest functions.
// guest_uexit() and nested_vm_exit_handler_intel() are called by name from inline assembly
// in this file, so they get C linkage when the file is compiled as C++.
// NOTE(review): nested_vm_exit_handler_amd() is presumably called from assembly as well - confirm.
#ifdef __cplusplus
extern "C" {
#endif
GUEST_CODE static void guest_uexit(uint64 exit_code);
GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs);
GUEST_CODE static void nested_vm_exit_handler_amd(uint64 exit_reason, struct l2_guest_regs* regs);
#ifdef __cplusplus
}
#endif
// Handlers of the individual SYZOS API commands, invoked from guest_main().
GUEST_CODE static void guest_execute_code(uint8* insns, uint64 size);
GUEST_CODE static void guest_handle_cpuid(uint32 eax, uint32 ecx);
GUEST_CODE static void guest_handle_wrmsr(uint64 reg, uint64 val);
GUEST_CODE static void guest_handle_rdmsr(uint64 reg);
GUEST_CODE static void guest_handle_wr_crn(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_wr_drn(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_in_dx(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_out_dx(struct api_call_3* cmd);
GUEST_CODE static void guest_handle_set_irq_handler(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_load_syzos(struct api_call_nested_load_syzos* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_amd_vmcb_write_mask(struct api_call_5* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_amd_invlpga(struct api_call_2* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_amd_stgi();
GUEST_CODE static void guest_handle_nested_amd_clgi();
GUEST_CODE static void guest_handle_nested_amd_inject_event(struct api_call_5* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_amd_set_intercept(struct api_call_5* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_amd_vmload(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_amd_vmsave(struct api_call_1* cmd, uint64 cpu_id);
// Exit codes that the guest itself passes to guest_uexit().
typedef enum {
// Reported by guest_main() after the whole command stream has been processed.
UEXIT_END = (uint64)-1,
// Reported by uexit_irq_handler() (which hardcodes the value -2 in assembly).
UEXIT_IRQ = (uint64)-2,
// Reported when an internal check fails (e.g. a failed VMWRITE or an unknown CPU vendor).
UEXIT_ASSERT = (uint64)-3,
// Reported by guest_main() for a command with an invalid call ID or size.
UEXIT_INVALID_MAIN = (uint64)-4,
} uexit_code;
// CPU vendor as detected by get_cpu_vendor(); selects between the VMX and SVM code paths.
typedef enum {
CPU_VENDOR_INTEL,
CPU_VENDOR_AMD,
} cpu_vendor_id;
// Interrupt handler that does nothing but return from the interrupt.
// Installed by guest_handle_set_irq_handler() for handler type 1.
// The function is naked, so its body consists of the IRETQ instruction only.
__attribute__((naked)) GUEST_CODE static void dummy_null_handler()
{
asm("iretq");
}
// Interrupt handler that reports UEXIT_IRQ to the host and then returns from the interrupt.
// Installed by guest_handle_set_irq_handler() for handler type 2.
// The constant -2 loaded into RDI must stay in sync with UEXIT_IRQ.
__attribute__((naked)) GUEST_CODE static void uexit_irq_handler()
{
asm volatile(R"(
// Call guest_uexit(UEXIT_IRQ).
movq $-2, %rdi
call guest_uexit
iretq
)");
}
// Main guest function that performs necessary setup and passes the control to the user-provided
// payload.
// The payload of vCPU `cpu` is a stream of variable-sized commands, each starting with
// struct api_call_header. It is located at X86_SYZOS_ADDR_USER_CODE + cpu * KVM_PAGE_SIZE and is
// globals->text_sizes[cpu] bytes long.
// A command with an unknown call ID, or with a size that is bigger than the rest of the stream or
// smaller than the header itself, stops the processing with UEXIT_INVALID_MAIN. Rejecting
// undersized commands guarantees forward progress (a zero size would otherwise spin forever) and
// keeps the instruction blob size computed for SYZOS_API_CODE from underflowing.
// Once the stream is exhausted, UEXIT_END is reported.
// The inner loop uses a complex if-statement, because Clang is eager to insert a jump table into
// a switch statement.
// TODO(glider): executor/style_test.go insists that single-line compound statements should not
// be used e.g. in the following case:
//   if (call == SYZOS_API_UEXIT) {
//     struct api_call_uexit* ucmd = (struct api_call_uexit*)cmd;
//     guest_uexit(ucmd->exit_code);
//   } else if (call == SYZOS_API_WR_CRN) {
//     guest_handle_wr_crn((struct api_call_2*)cmd); // Style check fails here
//   }
// , i.e. when the braces are consistent with the rest of the code, even though this violates the
// Google C++ style guide.
// We add single-line comments to justify having the compound statements below.
__attribute__((used))
GUEST_CODE static void
guest_main(uint64 cpu)
{
	volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
	uint64 size = globals->text_sizes[cpu];
	uint64 addr = X86_SYZOS_ADDR_USER_CODE + cpu * KVM_PAGE_SIZE;
	while (size >= sizeof(struct api_call_header)) {
		struct api_call_header* cmd = (struct api_call_header*)addr;
		volatile uint64 call = cmd->call;
		// Snapshot the size once, so that the value we validate is the value we advance by.
		uint64 cmd_size = cmd->size;
		if ((call >= SYZOS_API_STOP) || (cmd_size > size) ||
		    (cmd_size < sizeof(struct api_call_header))) {
			guest_uexit(UEXIT_INVALID_MAIN);
			return;
		}
		if (call == SYZOS_API_UEXIT) {
			// Issue a user exit.
			struct api_call_uexit* ucmd = (struct api_call_uexit*)cmd;
			guest_uexit(ucmd->exit_code);
		} else if (call == SYZOS_API_CODE) {
			// Execute an instruction blob.
			struct api_call_code* ccmd = (struct api_call_code*)cmd;
			guest_execute_code(ccmd->insns, cmd_size - sizeof(struct api_call_header));
		} else if (call == SYZOS_API_CPUID) {
			// Issue CPUID.
			struct api_call_cpuid* ccmd = (struct api_call_cpuid*)cmd;
			guest_handle_cpuid(ccmd->eax, ccmd->ecx);
		} else if (call == SYZOS_API_WRMSR) {
			// Write an MSR register.
			struct api_call_2* ccmd = (struct api_call_2*)cmd;
			guest_handle_wrmsr(ccmd->args[0], ccmd->args[1]);
		} else if (call == SYZOS_API_RDMSR) {
			// Read an MSR register.
			struct api_call_1* ccmd = (struct api_call_1*)cmd;
			guest_handle_rdmsr(ccmd->arg);
		} else if (call == SYZOS_API_WR_CRN) {
			// Write value to a control register.
			guest_handle_wr_crn((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_WR_DRN) {
			// Write value to a debug register.
			guest_handle_wr_drn((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_IN_DX) {
			// Read data from an I/O port.
			guest_handle_in_dx((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_OUT_DX) {
			// Write data to an I/O port.
			guest_handle_out_dx((struct api_call_3*)cmd);
		} else if (call == SYZOS_API_SET_IRQ_HANDLER) {
			// Set the handler for a particular IRQ.
			guest_handle_set_irq_handler((struct api_call_2*)cmd);
		} else if (call == SYZOS_API_ENABLE_NESTED) {
			// Enable nested virtualization.
			guest_handle_enable_nested((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_CREATE_VM) {
			// Create a nested VM.
			guest_handle_nested_create_vm((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_LOAD_CODE) {
			// Load code into the nested VM.
			guest_handle_nested_load_code((struct api_call_nested_load_code*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_LOAD_SYZOS) {
			// Load SYZOS into the nested VM.
			guest_handle_nested_load_syzos((struct api_call_nested_load_syzos*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_VMLAUNCH) {
			// Launch the nested VM.
			guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_VMRESUME) {
			// Resume a nested VM.
			guest_handle_nested_vmresume((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_INTEL_VMWRITE_MASK) {
			// Write to a VMCS field using masks.
			guest_handle_nested_intel_vmwrite_mask((struct api_call_5*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK) {
			// Write to a VMCB field using masks.
			guest_handle_nested_amd_vmcb_write_mask((struct api_call_5*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_INVLPGA) {
			// Invalidate TLB mappings for the specified address/ASID.
			guest_handle_nested_amd_invlpga((struct api_call_2*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_STGI) {
			// Set Global Interrupt Flag (Enable Interrupts).
			guest_handle_nested_amd_stgi();
		} else if (call == SYZOS_API_NESTED_AMD_CLGI) {
			// Clear Global Interrupt Flag (Disable Interrupts, including NMI).
			guest_handle_nested_amd_clgi();
		} else if (call == SYZOS_API_NESTED_AMD_INJECT_EVENT) {
			// Inject an event (IRQ/Exception) into the L2 guest via VMCB.
			guest_handle_nested_amd_inject_event((struct api_call_5*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_SET_INTERCEPT) {
			// Set/Clear specific intercept bits in the VMCB.
			guest_handle_nested_amd_set_intercept((struct api_call_5*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_VMLOAD) {
			// Execute VMLOAD to load state from VMCB.
			guest_handle_nested_amd_vmload((struct api_call_1*)cmd, cpu);
		} else if (call == SYZOS_API_NESTED_AMD_VMSAVE) {
			// Execute VMSAVE to save state to VMCB.
			guest_handle_nested_amd_vmsave((struct api_call_1*)cmd, cpu);
		}
		// Call IDs below SYZOS_API_STOP that match no branch above are silently skipped.
		addr += cmd_size;
		size -= cmd_size;
	}
	guest_uexit(UEXIT_END);
}
// Execute a blob of machine code by calling it as a function without arguments.
// insns: address of the first instruction; the blob is expected to return to the caller.
// size: length of the blob in bytes; currently unused.
GUEST_CODE static noinline void guest_execute_code(uint8* insns, uint64 size)
{
volatile void (*fn)() = (volatile void (*)())insns;
fn();
}
// Perform a userspace exit that can be handled by the host.
// The host returns from ioctl(KVM_RUN) with kvm_run.exit_reason=KVM_EXIT_MMIO,
// and can handle the call depending on the data passed as exit code.
// Make sure the compiler does not optimize this function away, it is called from
// assembly.
// exit_code: value stored to X86_SYZOS_ADDR_UEXIT.
__attribute__((used))
GUEST_CODE static noinline void
guest_uexit(uint64 exit_code)
{
// Force exit_code into RAX using inline asm constraints ("a").
// We write to X86_SYZOS_ADDR_UEXIT (0x40100).
// This allows the L1 hypervisor to reliably read RAX during an EPT violation.
volatile uint64* ptr = (volatile uint64*)X86_SYZOS_ADDR_UEXIT;
asm volatile("movq %0, (%1)" ::"a"(exit_code), "r"(ptr) : "memory");
}
// Execute CPUID for the given leaf (eax) and sub-leaf (ecx). The results are discarded.
GUEST_CODE static noinline void guest_handle_cpuid(uint32 eax, uint32 ecx)
{
	// CPUID overwrites EAX, EBX, ECX and EDX. EAX and ECX carry the inputs, so they are
	// declared as read-write operands: an asm statement must not modify input-only operands.
	asm volatile(
	    "cpuid\n"
	    : "+a"(eax), "+c"(ecx) // Outputs are currently ignored
	    : // No input-only operands
	    : "rbx", "rdx");
}
// Write the 64-bit value val into the MSR with index reg.
// WRMSR takes the MSR index in ECX and the value split into EDX (upper half) and EAX (lower half).
GUEST_CODE static noinline void wrmsr(uint64 reg, uint64 val)
{
asm volatile(
"wrmsr"
:
: "c"(reg),
"a"((uint32)val),
"d"((uint32)(val >> 32))
: "memory");
}
// Write val into an MSR register reg.
// Handler of SYZOS_API_WRMSR; simply forwards to wrmsr().
GUEST_CODE static noinline void guest_handle_wrmsr(uint64 reg, uint64 val)
{
wrmsr(reg, val);
}
// Read the MSR with index msr_id and return its 64-bit value.
GUEST_CODE static noinline uint64 rdmsr(uint64 msr_id)
{
uint32 low = 0, high = 0; // nolint
// The RDMSR instruction takes the MSR address in ecx.
// It puts the lower 32 bits of the MSR value into eax, and the upper
// 32 bits of the MSR value into edx.
asm volatile("rdmsr" : "=a"(low), "=d"(high) : "c"(msr_id));
return ((uint64)high << 32) | low;
}
// Read an MSR register, ignore the result.
// Handler of SYZOS_API_RDMSR; only the side effects of executing RDMSR matter here.
GUEST_CODE static noinline void guest_handle_rdmsr(uint64 reg)
{
(void)rdmsr(reg);
}
// Write to CRn control register.
// cmd->args[0] selects the register (0, 2, 3, 4 or 8), cmd->args[1] is the value to load.
// Any other register index is ignored.
GUEST_CODE static noinline void guest_handle_wr_crn(struct api_call_2* cmd)
{
	uint64 new_value = cmd->args[1];
	// Prevent the compiler from generating a switch table.
	volatile uint64 index = cmd->args[0];
	if (index == 0) {
		// Move value to CR0.
		asm volatile("movq %0, %%cr0" ::"r"(new_value) : "memory");
	} else if (index == 2) {
		// Move value to CR2.
		asm volatile("movq %0, %%cr2" ::"r"(new_value) : "memory");
	} else if (index == 3) {
		// Move value to CR3.
		asm volatile("movq %0, %%cr3" ::"r"(new_value) : "memory");
	} else if (index == 4) {
		// Move value to CR4.
		asm volatile("movq %0, %%cr4" ::"r"(new_value) : "memory");
	} else if (index == 8) {
		// Move value to CR8 (TPR - Task Priority Register).
		asm volatile("movq %0, %%cr8" ::"r"(new_value) : "memory");
	}
}
// Write to DRn debug register.
// cmd->args[0] selects the register (0..7), cmd->args[1] is the value to load.
// Any other register index is ignored.
GUEST_CODE static noinline void guest_handle_wr_drn(struct api_call_2* cmd)
{
	uint64 new_value = cmd->args[1];
	// The index is volatile, same as in guest_handle_wr_crn().
	volatile uint64 index = cmd->args[0];
	if (index == 0) {
		// Move value to DR0.
		asm volatile("movq %0, %%dr0" ::"r"(new_value) : "memory");
	} else if (index == 1) {
		// Move value to DR1.
		asm volatile("movq %0, %%dr1" ::"r"(new_value) : "memory");
	} else if (index == 2) {
		// Move value to DR2.
		asm volatile("movq %0, %%dr2" ::"r"(new_value) : "memory");
	} else if (index == 3) {
		// Move value to DR3.
		asm volatile("movq %0, %%dr3" ::"r"(new_value) : "memory");
	} else if (index == 4) {
		// Move value to DR4.
		asm volatile("movq %0, %%dr4" ::"r"(new_value) : "memory");
	} else if (index == 5) {
		// Move value to DR5.
		asm volatile("movq %0, %%dr5" ::"r"(new_value) : "memory");
	} else if (index == 6) {
		// Move value to DR6.
		asm volatile("movq %0, %%dr6" ::"r"(new_value) : "memory");
	} else if (index == 7) {
		// Move value to DR7.
		asm volatile("movq %0, %%dr7" ::"r"(new_value) : "memory");
	}
}
// Read data from an I/O port, should result in KVM_EXIT_IO.
// cmd->args[0] is the port number (truncated to 16 bits), cmd->args[1] is the access width
// in bytes (1, 2 or 4). Other widths are ignored, and the value read is discarded.
GUEST_CODE static noinline void guest_handle_in_dx(struct api_call_2* cmd)
{
	uint16 io_port = cmd->args[0];
	volatile int width = cmd->args[1];
	if (width == 1) {
		// Reads 1 byte from the port in DX into AL.
		uint8 discarded;
		asm volatile("inb %1, %0" : "=a"(discarded) : "d"(io_port));
	} else if (width == 2) {
		// Reads 2 bytes from the port in DX into AX.
		uint16 discarded;
		asm volatile("inw %1, %0" : "=a"(discarded) : "d"(io_port));
	} else if (width == 4) {
		// Reads 4 bytes from the port in DX into EAX.
		uint32 discarded;
		asm volatile("inl %1, %0" : "=a"(discarded) : "d"(io_port));
	}
}
// Write data to an I/O port, should result in KVM_EXIT_IO.
// cmd->args[0] is the port number (truncated to 16 bits), cmd->args[1] is the access width
// in bytes (1, 2 or 4) and cmd->args[2] is the data (truncated to 32 bits).
// Other widths are ignored.
GUEST_CODE static noinline void guest_handle_out_dx(struct api_call_3* cmd)
{
	uint16 io_port = cmd->args[0];
	volatile int width = cmd->args[1];
	uint32 payload = (uint32)cmd->args[2];
	if (width == 1) {
		// Writes 1 byte from AL to the port in DX.
		asm volatile("outb %b0, %w1" ::"a"(payload), "d"(io_port));
	} else if (width == 2) {
		// Writes 2 bytes from AX to the port in DX.
		asm volatile("outw %w0, %w1" ::"a"(payload), "d"(io_port));
	} else if (width == 4) {
		// Writes 4 bytes from EAX to the port in DX.
		asm volatile("outl %k0, %w1" ::"a"(payload), "d"(io_port));
	}
}
// 16-byte gate descriptor of the 64-bit Interrupt Descriptor Table.
// The 64-bit handler address is split across offset_low, offset_mid and offset_high.
// See https://wiki.osdev.org/Interrupt_Descriptor_Table#Gate_Descriptor_2.
struct idt_entry_64 {
uint16 offset_low;
uint16 selector;
// Interrupt Stack Table offset in bits 0..2
uint8 ist;
// Gate Type, P and DPL.
uint8 type_attr;
uint16 offset_mid;
uint32 offset_high;
uint32 reserved;
} __attribute__((packed));
// Point the IDT gate number `vector` at `handler`.
// The IDT lives at X86_SYZOS_ADDR_VAR_IDT. The gate uses the X86_SYZOS_SEL_CODE selector,
// type/attribute byte 0x8E and no interrupt stack table.
// IDT gate setup should be similar to syzos_setup_idt() in the host code.
GUEST_CODE static void set_idt_gate(uint8 vector, uint64 handler)
{
	volatile struct idt_entry_64* gate =
	    (volatile struct idt_entry_64*)(X86_SYZOS_ADDR_VAR_IDT) + vector;
	// Split the 64-bit handler address into its three descriptor fields.
	gate->offset_low = (uint16)handler;
	gate->offset_mid = (uint16)(handler >> 16);
	gate->offset_high = (uint32)(handler >> 32);
	gate->selector = X86_SYZOS_SEL_CODE;
	gate->type_attr = 0x8E;
	gate->ist = 0;
	gate->reserved = 0;
}
// Handler of SYZOS_API_SET_IRQ_HANDLER: install one of the predefined interrupt handlers.
// cmd->args[0] is the interrupt vector (truncated to 8 bits), cmd->args[1] is the handler type:
// 1 selects dummy_null_handler(), 2 selects uexit_irq_handler().
// For any other type the gate is written with a zero handler address.
GUEST_CODE static noinline void guest_handle_set_irq_handler(struct api_call_2* cmd)
{
uint8 vector = (uint8)cmd->args[0];
uint64 type = cmd->args[1];
volatile uint64 handler_addr = 0;
if (type == 1)
handler_addr = executor_fn_guest_addr(dummy_null_handler);
else if (type == 2)
handler_addr = executor_fn_guest_addr(uexit_irq_handler);
set_idt_gate(vector, handler_addr);
}
// Detect the CPU vendor from the EBX output of CPUID leaf 0.
// Reports UEXIT_ASSERT and falls back to CPU_VENDOR_INTEL if the vendor is not recognized.
GUEST_CODE static cpu_vendor_id get_cpu_vendor(void)
{
	uint32 leaf = 0;
	uint32 vendor_head;
	asm volatile(
	    "cpuid"
	    : "+a"(leaf), "=b"(vendor_head)
	    : // No explicit inputs, EAX is handled by +a.
	    : "ecx", "edx");
	// "Genu[ineIntel]".
	if (vendor_head == 0x756e6547)
		return CPU_VENDOR_INTEL;
	// "Auth[enticAMD]".
	if (vendor_head == 0x68747541)
		return CPU_VENDOR_AMD;
	// Should not happen on AMD64, but for completeness.
	guest_uexit(UEXIT_ASSERT);
	// Default to Intel if unknown.
	return CPU_VENDOR_INTEL;
}
// Return the current value of the CR0 control register.
GUEST_CODE static inline uint64 read_cr0(void)
{
uint64 val;
asm volatile("mov %%cr0, %0" : "=r"(val));
return val;
}
// Return the current value of the CR3 control register.
GUEST_CODE static inline uint64 read_cr3(void)
{
uint64 val;
asm volatile("mov %%cr3, %0" : "=r"(val));
return val;
}
// Return the current value of the CR4 control register.
GUEST_CODE static inline uint64 read_cr4(void)
{
uint64 val;
asm volatile("mov %%cr4, %0" : "=r"(val));
return val;
}
// Load val into the CR4 control register.
GUEST_CODE static inline void write_cr4(uint64 val)
{
asm volatile("mov %0, %%cr4" : : "r"(val));
}
// Write value into the VMCS field with the encoding `field` using VMWRITE.
// Reports UEXIT_ASSERT if the instruction signals a failure.
GUEST_CODE static noinline void vmwrite(uint64 field, uint64 value)
{
uint8 error = 0; // nolint
// 'setna' sets the byte to 1 if CF=1 or ZF=1 (VMfail)
asm volatile("vmwrite %%rax, %%rbx; setna %0"
: "=q"(error)
: "a"(value), "b"(field)
: "cc", "memory");
if (error)
guest_uexit(UEXIT_ASSERT);
}
// Read the VMCS field with the encoding `field` using VMREAD and return its value.
// Unlike vmwrite(), the flags are not checked, so a failed VMREAD is not reported.
GUEST_CODE static noinline uint64 vmread(uint64 field)
{
uint64 value;
asm volatile("vmread %%rbx, %%rax"
: "=a"(value)
: "b"(field)
: "cc");
return value;
}
// Execute VMPTRLD on the VMCS of the given vCPU/nested VM pair.
// The VMCS address comes from X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id).
// Reports the exit code 0xE2BAD2 if the instruction signals a failure.
GUEST_CODE static inline void nested_vmptrld(uint64 cpu_id, uint64 vm_id)
{
uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
uint8 error = 0; // nolint
// VMPTRLD takes a memory operand holding the address; 'setna' captures VMfail (CF=1 or ZF=1).
asm volatile("vmptrld %1; setna %0"
: "=q"(error)
: "m"(vmcs_addr)
: "memory", "cc");
if (error)
guest_uexit(0xE2BAD2);
}
// Store the 16-bit value val at byte offset `offset` from the address vmcb.
GUEST_CODE static noinline void vmcb_write16(uint64 vmcb, uint16 offset, uint16 val)
{
	volatile uint16* field = (volatile uint16*)(vmcb + offset);
	*field = val;
}
// Store the 32-bit value val at byte offset `offset` from the address vmcb.
GUEST_CODE static noinline void vmcb_write32(uint64 vmcb, uint16 offset, uint32 val)
{
	volatile uint32* field = (volatile uint32*)(vmcb + offset);
	*field = val;
}
// Load the 32-bit value located at byte offset `offset` from the address vmcb.
GUEST_CODE static noinline uint32 vmcb_read32(uint64 vmcb, uint16 offset)
{
	volatile uint32* field = (volatile uint32*)(vmcb + offset);
	return *field;
}
// Store the 64-bit value val at byte offset `offset` from the address vmcb.
GUEST_CODE static noinline void vmcb_write64(uint64 vmcb, uint16 offset, uint64 val)
{
	volatile uint64* field = (volatile uint64*)(vmcb + offset);
	*field = val;
}
// Load the 64-bit value located at byte offset `offset` from the pointer vmcb.
// Unlike the other vmcb_* helpers, this one takes the base as a byte pointer.
GUEST_CODE static noinline uint64 vmcb_read64(volatile uint8* vmcb, uint16 offset)
{
	volatile uint64* field = (volatile uint64*)(vmcb + offset);
	return *field;
}
// Fill `size` bytes starting at s with the byte c, one volatile store at a time.
// Does nothing if size is not positive.
GUEST_CODE static void guest_memset(void* s, uint8 c, int size)
{
	volatile uint8* cursor = (volatile uint8*)s;
	for (int left = size; left > 0; left--)
		*cursor++ = c;
}
// Copy `size` bytes from src to dst in ascending address order, byte by byte.
// Does nothing if size is not positive.
GUEST_CODE static void guest_memcpy(void* dst, void* src, int size)
{
	volatile uint8* out = (volatile uint8*)dst;
	volatile uint8* in = (volatile uint8*)src;
	for (int left = size; left > 0; left--)
		*out++ = *in++;
}
// Enter VMX root operation on an Intel CPU.
// cpu_id selects the per-vCPU VMXON region at X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id).
// Reports the exit code 0xE2BAD0 if VMXON fails.
GUEST_CODE static noinline void
nested_enable_vmx_intel(uint64 cpu_id)
{
	uint64 vmxon_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id);
	// Set CR4.VMXE.
	write_cr4(read_cr4() | X86_CR4_VMXE);
	uint64 feature_control = rdmsr(X86_MSR_IA32_FEATURE_CONTROL);
	// Check if Lock bit (bit 0) is clear.
	if ((feature_control & 1) == 0) {
		// If unlocked, set Lock bit (bit 0) and Enable VMX outside SMX bit (bit 2).
		feature_control |= 0b101;
		// Use the common helper, which splits the value into EDX:EAX explicitly.
		// The "A" constraint does not denote the EDX:EAX pair on x86-64.
		wrmsr(X86_MSR_IA32_FEATURE_CONTROL, feature_control);
	}
	// Store revision ID at the beginning of VMXON.
	*(uint32*)vmxon_addr = rdmsr(X86_MSR_IA32_VMX_BASIC);
	uint8 error = 0; // nolint
	// Can't use enter_vmx_operation() yet, because VMCS is not valid.
	// 'setna' sets the byte to 1 if CF=1 or ZF=1 (VMfail).
	asm volatile("vmxon %1; setna %0"
		     : "=q"(error)
		     : "m"(vmxon_addr)
		     : "memory", "cc");
	if (error)
		guest_uexit(0xE2BAD0);
}
// Enable SVM on an AMD CPU and register the per-vCPU host save area.
GUEST_CODE static noinline void
nested_enable_svm_amd(uint64 cpu_id)
{
	// Set the SVM Enable (SVME) bit in EFER. This enables SVM operations.
	wrmsr(X86_MSR_IA32_EFER, rdmsr(X86_MSR_IA32_EFER) | X86_EFER_SVME);
	// The Host Save Area (HSAVE) stores the host processor's state on VMRUN and is
	// restored on VMEXIT. Its physical address for this CPU goes to the VM_HSAVE_PA MSR,
	// which tells the CPU where to save/restore host state during VMRUN/VMEXIT.
	wrmsr(X86_MSR_VM_HSAVE_PA, X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id));
}
// Handler of SYZOS_API_ENABLE_NESTED: enable the vendor-specific virtualization extension.
// cmd is not used; cpu_id is forwarded to the vendor-specific routine.
GUEST_CODE static noinline void
guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id)
{
	cpu_vendor_id vendor = get_cpu_vendor();
	if (vendor != CPU_VENDOR_INTEL) {
		// Anything that is not Intel takes the AMD path.
		nested_enable_svm_amd(cpu_id);
		return;
	}
	nested_enable_vmx_intel(cpu_id);
}
// Calculate the size of the unused memory region from the boot arguments.
GUEST_CODE static uint64 get_unused_memory_size()
{
volatile struct syzos_boot_args* args = (volatile struct syzos_boot_args*)X86_SYZOS_ADDR_BOOT_ARGS;
for (uint32 i = 0; i < args->region_count; i++) {
if (args->regions[i].gpa == X86_SYZOS_ADDR_UNUSED)
return args->regions[i].pages * KVM_PAGE_SIZE;
}
return 0;
}
// Allocate a page from the X86_SYZOS_ADDR_UNUSED region using a non-reclaiming bump allocator.
// Returns the address of a zero-filled page. Reports UEXIT_ASSERT when the reserved offset
// is beyond the region size.
// NOTE(review): if guest_uexit() returns after UEXIT_ASSERT, the code below still zeroes and
// returns the out-of-range page - confirm that this is intended.
GUEST_CODE static uint64 guest_alloc_page()
{
volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
// Lazy initialization of total_size using CAS to prevent races.
if (globals->total_size == 0) {
uint64 size = get_unused_memory_size();
// Attempt to swap 0 with the calculated size.
// If another CPU beat us to it, this does nothing (which is fine).
__sync_val_compare_and_swap(&globals->total_size, 0, size);
}
// Atomic fetch-and-add to reserve space.
uint64 offset = __sync_fetch_and_add(&globals->alloc_offset, KVM_PAGE_SIZE);
if (offset >= globals->total_size)
guest_uexit(UEXIT_ASSERT);
uint64 ptr = X86_SYZOS_ADDR_UNUSED + offset;
guest_memset((void*)ptr, 0, KVM_PAGE_SIZE);
return ptr;
}
// Return the next-level table referenced by table[index], allocating and linking
// a fresh page first if the entry is not present.
GUEST_CODE static volatile uint64* l2_next_table(volatile uint64* table, uint64 index)
{
	if ((table[index] & X86_PDE64_PRESENT) == 0) {
		uint64 fresh = guest_alloc_page();
		table[index] = fresh | X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER;
	}
	return (volatile uint64*)(table[index] & ~0xFFF);
}
// Helper to map a page in L2's EPT/NPT.
// Walks the 4-level table rooted at X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id), creating
// the intermediate tables on demand, and makes `gpa` point to `host_pa` with `flags`.
// An already present mapping for `gpa` is left untouched.
GUEST_CODE static void l2_map_page(uint64 cpu_id, uint64 vm_id, uint64 gpa, uint64 host_pa, uint64 flags)
{
	// Page table root (PML4).
	volatile uint64* root = (volatile uint64*)X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
	// Each level is indexed by 9 bits of the guest physical address.
	volatile uint64* level3 = l2_next_table(root, (gpa >> 39) & 0x1FF);
	volatile uint64* level2 = l2_next_table(level3, (gpa >> 30) & 0x1FF);
	volatile uint64* level1 = l2_next_table(level2, (gpa >> 21) & 0x1FF);
	// Update Level 1 (PT): map if not present.
	uint64 slot = (gpa >> 12) & 0x1FF;
	if ((level1[slot] & X86_PDE64_PRESENT) == 0)
		level1[slot] = (host_pa & ~0xFFF) | flags;
}
// Populate the EPT/NPT of a nested VM by replicating the L1 memory layout from the boot args.
// vendor: selects the Intel (EPT) or AMD (NPT) flavor of the leaf entry flags.
// cpu_id, vm_id: identify the nested VM whose page tables are updated.
// unused_pages: number of pages to map from the MEM_REGION_FLAG_REMAINING region
// (at least 16 pages are mapped).
GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id, uint64 unused_pages)
{
	// Note: PML4 and MSR Bitmap must be zeroed by the caller (nested_create_vm)
	// so that this function can be called additively by nested_load_syzos.
	// Intel EPT: set Read, Write, Execute.
	// AMD NPT: set Present, Write, User.
	uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER;
	if (vendor == CPU_VENDOR_INTEL) {
		flags |= EPT_MEMTYPE_WB | EPT_ACCESSED | EPT_DIRTY;
	} else {
		flags |= X86_PDE64_ACCESSED | X86_PDE64_DIRTY;
	}
	// Replicate L1 memory layout from boot args.
	volatile struct syzos_boot_args* args = (volatile struct syzos_boot_args*)X86_SYZOS_ADDR_BOOT_ARGS;
	for (uint32 i = 0; i < args->region_count; i++) {
		struct mem_region r;
		r.gpa = args->regions[i].gpa;
		r.pages = args->regions[i].pages;
		r.flags = args->regions[i].flags;
		// Skip NO_HOST_MEM regions (like the Exit/UEXIT region).
		// This ensures that L2 accesses to these pages cause a nested page fault
		// (EPT Violation / NPT Fault), allowing L1 to intercept and modify the exit code.
		if (r.flags & MEM_REGION_FLAG_NO_HOST_MEM)
			continue;
		// Skip the huge unused heap for now, map fixed small heap if needed or handled by guest_alloc.
		// If unused_pages > 0, we map that many pages from the unused region.
		if (r.flags & MEM_REGION_FLAG_REMAINING) {
			// Map at least a few pages for the allocator overhead if 0 is passed.
			// NOTE(review): unused_pages is 64-bit and gets narrowed to int here.
			r.pages = (unused_pages < 16) ? 16 : unused_pages;
		}
		for (int p = 0; p < r.pages; p++) {
			// Widen before multiplying, so that the byte offset cannot overflow int.
			uint64 gpa = r.gpa + ((uint64)p * KVM_PAGE_SIZE);
			uint64 backing;
			if (r.gpa == X86_SYZOS_ADDR_USER_CODE && p == 0) {
				// Map start of user code to the VM's dedicated code buffer
				backing = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
			} else if (r.gpa == X86_SYZOS_ADDR_STACK_BOTTOM) {
				// Map stack to the VM's dedicated stack buffer
				backing = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
			} else {
				// Identity map all other regions to prevent L1 OOM exhaustion.
				// The L2 guest is transient and does not need duplicates of L1's GDT/IDT/TSS/Heap.
				backing = gpa;
			}
			l2_map_page(cpu_id, vm_id, gpa, backing, flags);
		}
	}
}
// Initialize the control fields of the current VMCS for the given vCPU/nested VM pair:
// pin-based, processor-based, VM-exit and VM-entry controls, the EPT pointer,
// CR0/CR4 masks and shadows, the bitmaps and miscellaneous counters.
// All values are written with vmwrite(), which reports UEXIT_ASSERT on failure.
GUEST_CODE static noinline void init_vmcs_control_fields(uint64 cpu_id, uint64 vm_id)
{
// Read and write Pin-Based controls from TRUE MSR.
uint64 vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS);
vmwrite(VMCS_PIN_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);
// Setup Secondary Processor-Based controls: enable EPT.
// RDTSCP is enabled for the guest as well.
vmx_msr = (uint32)rdmsr(X86_MSR_IA32_VMX_PROCBASED_CTLS2);
vmx_msr |= SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_ENABLE_RDTSCP;
vmwrite(VMCS_SECONDARY_VM_EXEC_CONTROL, vmx_msr);
// Read and write Primary Processor-Based controls from TRUE MSR.
// We also add the bit to enable the secondary controls.
vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
vmx_msr |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
// Exit on HLT and RDTSC.
vmx_msr |= CPU_BASED_HLT_EXITING | CPU_BASED_RDTSC_EXITING;
vmwrite(VMCS_CPU_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);
// Set up VM-Exit controls via TRUE MSR: indicate a 64-bit host.
vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS);
vmwrite(VMCS_VM_EXIT_CONTROLS, (uint32)vmx_msr | VM_EXIT_HOST_ADDR_SPACE_SIZE);
// Read and write VM-Entry controls from TRUE MSR
// We add the bit to indicate a 64-bit guest.
vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS);
vmwrite(VMCS_VM_ENTRY_CONTROLS, (uint32)vmx_msr | VM_ENTRY_IA32E_MODE);
// Set up the EPT Pointer.
// We use the L2 PML4 address, i.e. X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id).
// The EPT Pointer has:
// - Memory Type = 6 (Write-Back)
// - Page-Walk Length = 3 (meaning 4 levels: PML4, PDPT, PD, PT)
// - Address of the PML4 table
uint64 eptp = (X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id) & ~0xFFF) | (6 << 0) | (3 << 3);
vmwrite(VMCS_EPT_POINTER, eptp);
// Set CR0/CR4 masks and shadows.
// This simple setup (masks=0) means any guest CR0/CR4 write is allowed
// and won't cause a VM-Exit.
vmwrite(VMCS_CR0_GUEST_HOST_MASK, 0);
vmwrite(VMCS_CR4_GUEST_HOST_MASK, 0);
vmwrite(VMCS_CR0_READ_SHADOW, read_cr0());
vmwrite(VMCS_CR4_READ_SHADOW, read_cr4());
// Disable the bitmaps which we do not use.
vmwrite(VMCS_MSR_BITMAP, 0);
vmwrite(VMCS_VMREAD_BITMAP, 0);
vmwrite(VMCS_VMWRITE_BITMAP, 0);
// Intercept #UD (Invalid Opcode)
vmwrite(VMCS_EXCEPTION_BITMAP, (1 << 6));
// Clear unused/unsupported fields.
// TODO(glider): do we need these?
vmwrite(VMCS_VIRTUAL_PROCESSOR_ID, 0);
vmwrite(VMCS_POSTED_INTR_NV, 0);
vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MASK, 0);
vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MATCH, -1);
vmwrite(VMCS_CR3_TARGET_COUNT, 0);
vmwrite(VMCS_VM_EXIT_MSR_STORE_COUNT, 0);
vmwrite(VMCS_VM_EXIT_MSR_LOAD_COUNT, 0);
vmwrite(VMCS_VM_ENTRY_MSR_LOAD_COUNT, 0);
vmwrite(VMCS_VM_ENTRY_INTR_INFO_FIELD, 0);
vmwrite(VMCS_TPR_THRESHOLD, 0);
}
// Common L2 exit reasons for Intel and AMD.
// The vendor-specific exit codes are translated into these values by map_intel_exit_reason()
// and map_amd_exit_reason(); codes without a common counterpart map to UNKNOWN.
typedef enum {
SYZOS_NESTED_EXIT_REASON_HLT = 1,
SYZOS_NESTED_EXIT_REASON_INVD = 2,
SYZOS_NESTED_EXIT_REASON_CPUID = 3,
SYZOS_NESTED_EXIT_REASON_RDTSC = 4,
SYZOS_NESTED_EXIT_REASON_RDTSCP = 5,
SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION = 6,
SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF,
} syz_nested_exit_reason;
// Forward a uexit issued by a nested guest to the host.
// The top byte of the exit code holds the nesting level; it is incremented by one
// while the lower 56 bits are passed through unchanged.
GUEST_CODE static void handle_nested_uexit(uint64 exit_code)
{
	const uint64 payload_mask = 0x00FFFFFFFFFFFFFFULL;
	uint64 nesting = (exit_code >> 56) + 1;
	// Perform L1 uexit with the modified code.
	guest_uexit((exit_code & payload_mask) | (nesting << 56));
}
// Report an L2 VM exit to the host.
// Exits that have a common reason are reported as 0xe2e20000 | mapped_reason.
// The rest carry the raw exit_reason tagged with 0xe2110000 (Intel) or 0xe2aa0000 (AMD).
GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason mapped_reason,
				      cpu_vendor_id vendor)
{
	if (mapped_reason != SYZOS_NESTED_EXIT_REASON_UNKNOWN) {
		// Vendor-independent exit reason.
		guest_uexit(0xe2e20000 | mapped_reason);
		return;
	}
	if (vendor == CPU_VENDOR_INTEL) {
		// Raw Intel exit reason.
		guest_uexit(0xe2110000 | exit_reason);
		return;
	}
	// Raw AMD exit code.
	guest_uexit(0xe2aa0000 | exit_reason);
}
// Intel VMX basic exit reasons recognized by map_intel_exit_reason().
#define EXIT_REASON_CPUID 0xa
#define EXIT_REASON_HLT 0xc
#define EXIT_REASON_INVD 0xd
#define EXIT_REASON_EPT_VIOLATION 0x30
#define EXIT_REASON_RDTSC 0x10
#define EXIT_REASON_RDTSCP 0x33
// Translate an Intel basic exit reason into the vendor-independent syz_nested_exit_reason.
// Returns SYZOS_NESTED_EXIT_REASON_UNKNOWN for reasons that have no common counterpart.
GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reason)
{
// Disable optimizations.
volatile uint64 reason = basic_reason;
if (reason == EXIT_REASON_HLT)
return SYZOS_NESTED_EXIT_REASON_HLT;
if (reason == EXIT_REASON_INVD)
return SYZOS_NESTED_EXIT_REASON_INVD;
if (reason == EXIT_REASON_CPUID)
return SYZOS_NESTED_EXIT_REASON_CPUID;
if (reason == EXIT_REASON_RDTSC)
return SYZOS_NESTED_EXIT_REASON_RDTSC;
if (reason == EXIT_REASON_RDTSCP)
return SYZOS_NESTED_EXIT_REASON_RDTSCP;
if (reason == EXIT_REASON_EPT_VIOLATION)
return SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION;
return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
// Move the L2 guest RIP past the instruction that caused the exit.
// INVD, CPUID and RDTSC advance RIP by 2 bytes, RDTSCP by 3 bytes.
// For any other reason RIP is written back unchanged.
GUEST_CODE static void advance_l2_rip_intel(uint64 basic_reason)
{
// Disable optimizations.
volatile uint64 reason = basic_reason;
uint64 rip = vmread(VMCS_GUEST_RIP);
if ((reason == EXIT_REASON_INVD) || (reason == EXIT_REASON_CPUID) ||
(reason == EXIT_REASON_RDTSC)) {
rip += 2;
} else if (reason == EXIT_REASON_RDTSCP) {
// We insist on a single-line compound statement for else-if.
rip += 3;
}
vmwrite(VMCS_GUEST_RIP, rip);
}
// C part of the Intel VM exit handler.
// This function is called from inline assembly.
// exit_reason: value of the VMCS exit reason field; its low 16 bits are the basic reason.
// regs: L2 general-purpose registers saved on the stack by nested_vm_exit_handler_intel_asm().
// The registers are stored in globals->l2_ctx for the vCPU and its active nested VM.
// An EPT violation on the X86_SYZOS_ADDR_EXIT page is treated as a uexit from L2 and forwarded
// via handle_nested_uexit(); every other exit is reported via guest_uexit_l2(), after which
// the L2 RIP is advanced by advance_l2_rip_intel().
__attribute__((used))
GUEST_CODE static void
nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs)
{
volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
// Recover cpu_id from the stack. It was pushed before L1 registers.
// Stack: [cpu_id] [launch] [L1 GPRs x6] [L2 GPRs x15]
// Index: 22 21 15..20 0..14
// regs points to the start of L2 GPRs.
// Skipping the L2 GPRs plus 7 more slots (6 L1 GPRs and the launch flag) yields slot 22.
uint64 cpu_id = *(uint64*)((char*)regs + sizeof(struct l2_guest_regs) + 7 * 8);
uint64 vm_id = globals->active_vm_id[cpu_id];
// Persist L2 registers.
guest_memcpy((void*)&globals->l2_ctx[cpu_id][vm_id], regs, sizeof(struct l2_guest_regs));
uint64 basic_reason = exit_reason & 0xFFFF;
// Handle EPT Violation (Nested UEXIT).
if (basic_reason == EXIT_REASON_EPT_VIOLATION) {
uint64 gpa = vmread(VMCS_GUEST_PHYSICAL_ADDRESS);
// Only handle violations on the specific UEXIT page.
if ((gpa & ~0xFFF) == X86_SYZOS_ADDR_EXIT) {
// This is a uexit from L2.
// We enforced usage of RAX in guest_uexit.
// Read RAX from the saved L2 guest registers.
// Note: On Intel exit, guest registers are NOT saved to VMCS.
// They are saved to 'regs' by our asm wrapper.
handle_nested_uexit(regs->rax);
// Advance L2 RIP by 3 bytes (movq %rax, (%rdx) is 3 bytes).
// NOTE(review): guest_uexit() lets the compiler pick the address register ("r"), so the
// 3-byte length is an assumption about the generated code - confirm.
vmwrite(VMCS_GUEST_RIP, vmread(VMCS_GUEST_RIP) + 3);
return;
}
}
syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason);
guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL);
advance_l2_rip_intel(basic_reason);
}
// Label emitted (with .globl) by the inline assembly of
// guest_handle_nested_vmentry_intel(); the exit stub below jumps to it.
extern char after_vmentry_label;
// Intel VM-exit entry point: init_vmcs_host_state() stores the address of this
// function in VMCS_HOST_RIP. It runs on the stack that
// guest_handle_nested_vmentry_intel() recorded in VMCS_HOST_RSP, i.e. on top of
// [red zone gap] [cpu_id] [launch] [six L1 callee-saved registers].
// The stub pushes the 15 L2 GPRs (forming a struct l2_guest_regs), calls
// nested_vm_exit_handler_intel(exit_reason, regs), then unwinds everything that
// the entry code pushed and jumps to after_vmentry_label.
// The function is naked: it has no compiler-generated prologue/epilogue and
// never returns through a ret instruction.
__attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(void)
{
asm volatile(R"(
// Save L2's GPRs. This creates the 'struct l2_guest_regs' on the stack.
// We push in reverse order so that RAX ends up at offset 0 (Top of Stack).
push %%r15
push %%r14
push %%r13
push %%r12
push %%r11
push %%r10
push %%r9
push %%r8
push %%rbp
push %%rdi
push %%rsi
push %%rdx
push %%rcx
push %%rbx
push %%rax
// Prepare arguments for the C handler:
// arg1 (RDI) = exit_reason
// arg2 (RSI) = pointer to the saved registers
mov %%rsp, %%rsi
mov %[vm_exit_reason], %%rbx
vmread %%rbx, %%rdi
// Call the C handler.
call nested_vm_exit_handler_intel
// The C handler has processed the exit. Now, return to the L1 command
// processing loop. VMX remains enabled.
// 1. Discard L2 GPRs.
add %[l2_regs_size], %%rsp
// 2. Restore L1 callee-saved registers.
// Order must be reverse of push: r15, r14, r13, r12, rbp, rbx.
pop %%r15
pop %%r14
pop %%r13
pop %%r12
pop %%rbp
pop %%rbx
// 3. Discard launch flag and cpu_id.
add $16, %%rsp
// 4. Restore Red Zone.
add $128, %%rsp
// Jump to L1 main flow
jmp after_vmentry_label
)"
: : [l2_regs_size] "i"(sizeof(struct l2_guest_regs)),
[vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi");
}
// AMD exit codes recognised by map_amd_exit_reason() and advance_l2_rip_amd().
// They are compared against the low 16 bits of the exit code taken from the VMCB.
#define VMEXIT_RDTSC 0x6e
#define VMEXIT_CPUID 0x72
#define VMEXIT_INVD 0x76
#define VMEXIT_HLT 0x78
// Nested page fault: nested_vm_exit_handler_amd() reads the faulting GPA from
// VMCB_EXITINFO2 for this code.
#define VMEXIT_NPF 0x400
#define VMEXIT_RDTSCP 0x87
// Translate an AMD exit code into the vendor-neutral syz_nested_exit_reason.
// Codes without a dedicated mapping yield SYZOS_NESTED_EXIT_REASON_UNKNOWN.
GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	syz_nested_exit_reason mapped = SYZOS_NESTED_EXIT_REASON_UNKNOWN;
	if (reason == VMEXIT_HLT)
		mapped = SYZOS_NESTED_EXIT_REASON_HLT;
	else if (reason == VMEXIT_INVD)
		mapped = SYZOS_NESTED_EXIT_REASON_INVD;
	else if (reason == VMEXIT_CPUID)
		mapped = SYZOS_NESTED_EXIT_REASON_CPUID;
	else if (reason == VMEXIT_RDTSC)
		mapped = SYZOS_NESTED_EXIT_REASON_RDTSC;
	else if (reason == VMEXIT_RDTSCP)
		mapped = SYZOS_NESTED_EXIT_REASON_RDTSCP;
	else if (reason == VMEXIT_NPF)
		mapped = SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION;
	return mapped;
}
// Step the guest RIP stored in the VMCB of (cpu_id, vm_id) past the instruction
// that caused the exit: 2 bytes for INVD/CPUID/RDTSC, 3 bytes for RDTSCP, and
// no adjustment otherwise. The RIP field is rewritten in all cases.
GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, uint64 vm_id)
{
	// Disable optimizations.
	volatile uint64 reason = basic_reason;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint64 old_rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP);
	uint64 insn_len = 0;
	if (reason == VMEXIT_RDTSCP)
		insn_len = 3;
	else if ((reason == VMEXIT_INVD) || (reason == VMEXIT_CPUID) ||
		 (reason == VMEXIT_RDTSC))
		insn_len = 2;
	vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, old_rip + insn_len);
}
// C part of the AMD exit path; called from the inline assembly of guest_run_amd_vm().
//   exit_reason: exit code that the asm read from the VMCB.
//   regs: L2 GPRs that the asm saved on the stack (RAX at offset 0).
// Saves the L2 registers into globals->l2_ctx, then either services a nested
// uexit (nested page fault on the X86_SYZOS_ADDR_EXIT page) or reports the exit
// through guest_uexit_l2() and skips the exiting instruction.
__attribute__((used)) GUEST_CODE static void
nested_vm_exit_handler_amd(uint64 exit_reason, struct l2_guest_regs* regs)
{
	volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
	// The stack words right above the L2 register block are, in order:
	// the exit code, six L1 callee-saved registers, vmcb_addr, and cpu_id.
	// Hence cpu_id is word 8 counted from the end of *regs.
	const uint64* l1_frame = (const uint64*)((char*)regs + sizeof(struct l2_guest_regs));
	uint64 cpu_id = l1_frame[8];
	uint64 vm_id = globals->active_vm_id[cpu_id];
	// Keep the L2 register state for the next VMRUN.
	guest_memcpy((void*)&globals->l2_ctx[cpu_id][vm_id], regs, sizeof(struct l2_guest_regs));
	volatile uint64 basic_reason = exit_reason & 0xFFFF;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	// A nested page fault on the dedicated UEXIT page is a uexit issued by L2.
	bool nested_uexit = false;
	if (basic_reason == VMEXIT_NPF) {
		// EXITINFO2 contains the faulting GPA.
		uint64 fault_gpa = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_EXITINFO2);
		nested_uexit = (fault_gpa & ~0xFFF) == X86_SYZOS_ADDR_EXIT;
	}
	if (!nested_uexit) {
		syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason);
		guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD);
		advance_l2_rip_amd(basic_reason, cpu_id, vm_id);
		return;
	}
	// The exit code that L2 placed in RAX is in the saved register block.
	handle_nested_uexit(regs->rax);
	// Skip the 3-byte instruction that triggered the fault.
	uint64 old_rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP);
	vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, old_rip + 3);
}
// Fill the host-state area of the current VMCS with the state of L1 (this code):
// SYZOS segment selectors, descriptor-table and TSS bases, the live control
// registers and MSR values, and nested_vm_exit_handler_intel_asm as the
// VM-exit entry point.
// VMCS_HOST_RSP is not written here; guest_handle_nested_vmentry_intel()
// updates it right before every VMLAUNCH/VMRESUME.
GUEST_CODE static noinline void init_vmcs_host_state(void)
{
// Segment Selectors.
vmwrite(VMCS_HOST_CS_SELECTOR, X86_SYZOS_SEL_CODE);
vmwrite(VMCS_HOST_DS_SELECTOR, X86_SYZOS_SEL_DATA);
vmwrite(VMCS_HOST_ES_SELECTOR, X86_SYZOS_SEL_DATA);
vmwrite(VMCS_HOST_SS_SELECTOR, X86_SYZOS_SEL_DATA);
vmwrite(VMCS_HOST_FS_SELECTOR, X86_SYZOS_SEL_DATA);
vmwrite(VMCS_HOST_GS_SELECTOR, X86_SYZOS_SEL_DATA);
vmwrite(VMCS_HOST_TR_SELECTOR, X86_SYZOS_SEL_TSS64);
// Base addresses.
vmwrite(VMCS_HOST_TR_BASE, X86_SYZOS_ADDR_VAR_TSS);
vmwrite(VMCS_HOST_GDTR_BASE, X86_SYZOS_ADDR_GDT);
vmwrite(VMCS_HOST_IDTR_BASE, X86_SYZOS_ADDR_VAR_IDT);
vmwrite(VMCS_HOST_FS_BASE, rdmsr(X86_MSR_FS_BASE));
vmwrite(VMCS_HOST_GS_BASE, rdmsr(X86_MSR_GS_BASE));
// Exit handler in RIP.
vmwrite(VMCS_HOST_RIP, (uintptr_t)nested_vm_exit_handler_intel_asm);
// Control Registers.
vmwrite(VMCS_HOST_CR0, read_cr0());
vmwrite(VMCS_HOST_CR3, read_cr3());
vmwrite(VMCS_HOST_CR4, read_cr4());
// MSRs.
vmwrite(VMCS_HOST_IA32_PAT, rdmsr(X86_MSR_IA32_CR_PAT));
vmwrite(VMCS_HOST_IA32_EFER, rdmsr(X86_MSR_IA32_EFER));
vmwrite(VMCS_HOST_IA32_PERF_GLOBAL_CTRL, rdmsr(X86_MSR_CORE_PERF_GLOBAL_CTRL));
vmwrite(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(X86_MSR_IA32_SYSENTER_CS));
vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(X86_MSR_IA32_SYSENTER_ESP));
vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(X86_MSR_IA32_SYSENTER_EIP));
}
// Copy the value of one field of the current VMCS into another field.
#define COPY_VMCS_FIELD(GUEST_FIELD, HOST_FIELD) \
	vmwrite(GUEST_FIELD, vmread(HOST_FIELD))
// Write selector, base, limit and access rights of guest segment SEG into the
// current VMCS. Wrapped in do/while(0) so that the four writes behave as one
// statement (e.g. as the body of an unbraced if); use it with a trailing ';'.
#define SETUP_L2_SEGMENT(SEG, SELECTOR, BASE, LIMIT, AR) \
	do { \
		vmwrite(VMCS_GUEST_##SEG##_SELECTOR, SELECTOR); \
		vmwrite(VMCS_GUEST_##SEG##_BASE, BASE); \
		vmwrite(VMCS_GUEST_##SEG##_LIMIT, LIMIT); \
		vmwrite(VMCS_GUEST_##SEG##_ACCESS_RIGHTS, AR); \
	} while (0)
// Fill the guest-state area of the current VMCS for L2 VM (cpu_id, vm_id).
// Most of the state mirrors what is already stored in the host-state area, so
// this is expected to run after init_vmcs_host_state(). RIP and RSP point at
// the per-VM code and stack pages.
GUEST_CODE static noinline void init_vmcs_guest_state(uint64 cpu_id, uint64 vm_id)
{
	uint64 entry_rip = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
	uint64 stack_page = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
	// Code/data segments: host selectors, flat 4G limit.
	SETUP_L2_SEGMENT(CS, vmread(VMCS_HOST_CS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_CODE);
	SETUP_L2_SEGMENT(DS, vmread(VMCS_HOST_DS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(ES, vmread(VMCS_HOST_ES_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(SS, vmread(VMCS_HOST_SS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(FS, vmread(VMCS_HOST_FS_SELECTOR), vmread(VMCS_HOST_FS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	SETUP_L2_SEGMENT(GS, vmread(VMCS_HOST_GS_SELECTOR), vmread(VMCS_HOST_GS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK);
	// TR reuses the host TSS; LDTR is marked unusable.
	SETUP_L2_SEGMENT(TR, vmread(VMCS_HOST_TR_SELECTOR), vmread(VMCS_HOST_TR_BASE), 0x67, VMX_AR_TSS_BUSY);
	SETUP_L2_SEGMENT(LDTR, 0, 0, 0, VMX_AR_LDTR_UNUSABLE);
	// Control registers are inherited from the host area.
	COPY_VMCS_FIELD(VMCS_GUEST_CR0, VMCS_HOST_CR0);
	COPY_VMCS_FIELD(VMCS_GUEST_CR3, VMCS_HOST_CR3);
	COPY_VMCS_FIELD(VMCS_GUEST_CR4, VMCS_HOST_CR4);
	// Entry point and initial stack (last 8-byte slot of the stack page).
	vmwrite(VMCS_GUEST_RIP, entry_rip);
	vmwrite(VMCS_GUEST_RSP, stack_page + KVM_PAGE_SIZE - 8);
	vmwrite(VMCS_GUEST_RFLAGS, RFLAGS_1_BIT);
	// TODO (left unspecified by the original author).
	// NOTE(review): 0x400 looks like the architectural reset value of DR7 - confirm.
	vmwrite(VMCS_GUEST_DR7, 0x400);
	// MSR-backed fields: inherit from the host area, DEBUGCTL is cleared.
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_EFER, VMCS_HOST_IA32_EFER);
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_PAT, VMCS_HOST_IA32_PAT);
	COPY_VMCS_FIELD(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL, VMCS_HOST_IA32_PERF_GLOBAL_CTRL);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_CS, VMCS_HOST_IA32_SYSENTER_CS);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_ESP, VMCS_HOST_IA32_SYSENTER_ESP);
	COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_EIP, VMCS_HOST_IA32_SYSENTER_EIP);
	vmwrite(VMCS_GUEST_IA32_DEBUGCTL, 0);
	// Descriptor tables: host bases, maximal limits.
	COPY_VMCS_FIELD(VMCS_GUEST_GDTR_BASE, VMCS_HOST_GDTR_BASE);
	vmwrite(VMCS_GUEST_GDTR_LIMIT, 0xffff);
	COPY_VMCS_FIELD(VMCS_GUEST_IDTR_BASE, VMCS_HOST_IDTR_BASE);
	vmwrite(VMCS_GUEST_IDTR_LIMIT, 0xffff);
	// Remaining guest-state fields.
	vmwrite(VMCS_LINK_POINTER, 0xffffffffffffffff);
	// Activity state 0 = Active.
	vmwrite(VMCS_GUEST_ACTIVITY_STATE, 0);
	vmwrite(VMCS_GUEST_INTERRUPTIBILITY_INFO, 0);
	vmwrite(VMCS_GUEST_PENDING_DBG_EXCEPTIONS, 0);
	vmwrite(VMCS_VMX_PREEMPTION_TIMER_VALUE, 0);
	vmwrite(VMCS_GUEST_INTR_STATUS, 0);
	vmwrite(VMCS_GUEST_PML_INDEX, 0);
}
// Create L2 VM cmd->arg on this CPU (Intel): stamp the VMCS region with the
// revision taken from IA32_VMX_BASIC, VMCLEAR it, make it current, wipe the
// L2 page-table root and MSR bitmap pages, and initialise page tables plus
// the control, host and guest areas of the VMCS.
// Reports 0xE2BAD1 through guest_uexit() and gives up if VMCLEAR fails.
GUEST_CODE static noinline void
nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->arg;
	uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	uint64 pgtable_root = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
	uint64 msr_bitmap = X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id);
	uint8 vmclear_failed = 0; // nolint
	// The first 32 bits of the region receive the low half of IA32_VMX_BASIC.
	*(uint32*)vmcs_addr = rdmsr(X86_MSR_IA32_VMX_BASIC);
	// setna captures CF=1 or ZF=1 left by VMCLEAR.
	asm volatile("vmclear %1; setna %0"
		     : "=q"(vmclear_failed)
		     : "m"(vmcs_addr)
		     : "memory", "cc");
	if (vmclear_failed) {
		guest_uexit(0xE2BAD1);
		return;
	}
	nested_vmptrld(cpu_id, vm_id);
	// Start from zeroed paging and MSR-bitmap pages.
	guest_memset((void*)pgtable_root, 0, KVM_PAGE_SIZE);
	guest_memset((void*)msr_bitmap, 0, KVM_PAGE_SIZE);
	setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id, 0);
	init_vmcs_control_fields(cpu_id, vm_id);
	init_vmcs_host_state();
	init_vmcs_guest_state(cpu_id, vm_id);
}
// Helper for setting up a segment in the VMCB: writes selector, attributes,
// limit and base of segment SEG_NAME. Wrapped in do/while(0) so that the four
// writes behave as one statement (e.g. as the body of an unbraced if); use it
// with a trailing ';'.
#define SETUP_L2_SEGMENT_SVM(VMCB_PTR, SEG_NAME, SELECTOR, BASE, LIMIT, ATTR) \
	do { \
		vmcb_write16(VMCB_PTR, VMCB_GUEST_##SEG_NAME##_SEL, SELECTOR); \
		vmcb_write16(VMCB_PTR, VMCB_GUEST_##SEG_NAME##_ATTR, ATTR); \
		vmcb_write32(VMCB_PTR, VMCB_GUEST_##SEG_NAME##_LIM, LIMIT); \
		vmcb_write64(VMCB_PTR, VMCB_GUEST_##SEG_NAME##_BASE, BASE); \
	} while (0)
// Initialise the VMCB of L2 VM (cpu_id, vm_id): guest segments, control
// registers, RIP/RSP/RFLAGS, EFER, descriptor tables (taken from the live
// GDTR/IDTR of L1), the intercept vectors 3 and 4, nested paging and the ASID.
// The VMCB page itself is expected to be zeroed by the caller
// (nested_create_vm_amd() does that).
GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_id)
{
uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
uint64 npt_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
// Setup Guest Segment Registers.
// We copy the L1 guest's segment setup, as it's a good 64-bit environment.
SETUP_L2_SEGMENT_SVM(vmcb_addr, CS, X86_SYZOS_SEL_CODE, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_CODE);
SETUP_L2_SEGMENT_SVM(vmcb_addr, DS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
SETUP_L2_SEGMENT_SVM(vmcb_addr, ES, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
SETUP_L2_SEGMENT_SVM(vmcb_addr, SS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
SETUP_L2_SEGMENT_SVM(vmcb_addr, FS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
SETUP_L2_SEGMENT_SVM(vmcb_addr, GS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA);
// Task Register (TR). Must point to a valid, present, 64-bit TSS.
SETUP_L2_SEGMENT_SVM(vmcb_addr, TR, X86_SYZOS_SEL_TSS64, X86_SYZOS_ADDR_VAR_TSS, 0x67, SVM_ATTR_TSS_BUSY);
// LDT Register (LDTR) - Mark as unusable.
// A null selector and attribute is the correct way to disable LDTR.
SETUP_L2_SEGMENT_SVM(vmcb_addr, LDTR, 0, 0, 0, SVM_ATTR_LDTR_UNUSABLE);
// Setup Guest Control Registers & CPU State.
vmcb_write64(vmcb_addr, VMCB_GUEST_CR0, read_cr0() | X86_CR0_WP);
// L2 will use L1's page tables.
vmcb_write64(vmcb_addr, VMCB_GUEST_CR3, read_cr3());
vmcb_write64(vmcb_addr, VMCB_GUEST_CR4, read_cr4());
vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, l2_code_addr);
// Initial stack pointer: the last 8-byte slot of the per-VM stack page.
vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
vmcb_write64(vmcb_addr, VMCB_GUEST_RFLAGS, RFLAGS_1_BIT);
// Setup Guest EFER. Must have SVME, LME, and LMA for 64-bit nested.
vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, X86_EFER_LME | X86_EFER_LMA | X86_EFER_SVME);
vmcb_write64(vmcb_addr, VMCB_RAX, 0);
// Setup Guest Descriptor Tables.
// Packed layout matching what SGDT/SIDT store: 16-bit limit followed by 64-bit base.
struct {
uint16 limit;
uint64 base;
} __attribute__((packed)) gdtr, idtr;
asm volatile("sgdt %0" : "=m"(gdtr));
asm volatile("sidt %0" : "=m"(idtr));
vmcb_write64(vmcb_addr, VMCB_GUEST_GDTR_BASE, gdtr.base);
vmcb_write32(vmcb_addr, VMCB_GUEST_GDTR_LIM, gdtr.limit);
vmcb_write64(vmcb_addr, VMCB_GUEST_IDTR_BASE, idtr.base);
vmcb_write32(vmcb_addr, VMCB_GUEST_IDTR_LIM, idtr.limit);
// Setup VMCB Control Fields.
vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_VEC3_ALL);
vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC4, VMCB_CTRL_INTERCEPT_VEC4_ALL);
// Enable Nested Paging (NPT):
// Write '1' to the NPT Enable field (0x090).
vmcb_write64(vmcb_addr, VMCB_CTRL_NP_ENABLE, (1 << VMCB_CTRL_NPT_ENABLE_BIT));
// Write the NPT root address to N_CR3 (0x098).
// Unlike Intel's EPTP, AMD's N_CR3 field is *only* the
// 4K-aligned physical address of the PML4 table.
// It does not contain any control bits.
uint64 npt_pointer = (npt_pml4_addr & ~0xFFF);
vmcb_write64(vmcb_addr, VMCB_CTRL_N_CR3, npt_pointer);
// Set Guest ASID.
vmcb_write32(vmcb_addr, VMCB_CTRL_ASID, 1);
}
// Create L2 VM cmd->arg on this CPU (AMD): zero the VMCB, the per-CPU
// arch-specific page, the nested page-table root and the MSR bitmap, then
// build the nested page tables and initialise the VMCB.
GUEST_CODE static noinline void
nested_create_vm_amd(struct api_call_1* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->arg;
	// Wipe every page this VM is built from, in a fixed order.
	guest_memset((void*)X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), 0, KVM_PAGE_SIZE);
	guest_memset((void*)X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id), 0, KVM_PAGE_SIZE);
	guest_memset((void*)X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id), 0, KVM_PAGE_SIZE);
	guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE);
	// Nested page tables first, VMCB control and guest state second.
	setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id, 0);
	init_vmcb_guest_state(cpu_id, vm_id);
}
// SYZOS_API_NESTED_CREATE_VM: dispatch to the vendor-specific VM constructor.
// Any vendor other than Intel is handled by the AMD implementation.
GUEST_CODE static noinline void
guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() != CPU_VENDOR_INTEL) {
		nested_create_vm_amd(cmd, cpu_id);
		return;
	}
	nested_create_vm_intel(cmd, cpu_id);
}
// Translate an L2 guest-physical address into the L1 address backing it by
// walking the 4-level nested page table of VM (cpu_id, vm_id).
// Returns 0 if any level lacks X86_PDE64_PRESENT.
// Each entry is read exactly once, so the presence check and the address use
// see the same value, and only bits 51:12 of an entry are treated as address
// (higher bits such as NX or software-available bits are ignored).
// NOTE(review): large-page entries are not recognised; this assumes
// setup_l2_page_tables() only creates 4K mappings - confirm.
GUEST_CODE static uint64 l2_gpa_to_pa(uint64 cpu_id, uint64 vm_id, uint64 gpa)
{
	const uint64 addr_mask = 0x000FFFFFFFFFF000ULL;
	uint64 root_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
	volatile uint64* table = (volatile uint64*)root_addr;
	uint64 entry = 0;
	// Index bits per level: PML4 47:39, PDPT 38:30, PD 29:21, PT 20:12.
	for (int shift = 39; shift >= 12; shift -= 9) {
		entry = table[(gpa >> shift) & 0x1FF];
		if (!(entry & X86_PDE64_PRESENT))
			return 0;
		table = (volatile uint64*)(entry & addr_mask);
	}
	// 'entry' now holds the PT entry of the 4K page.
	return (entry & addr_mask) + (gpa & 0xFFF);
}
// SYZOS_API_NESTED_LOAD_CODE: copy the instruction bytes carried by the command
// into the page that backs X86_SYZOS_ADDR_USER_CODE in L2 VM cmd->vm_id, and
// point the VM's RIP/RSP at that code and at the top of the L2 stack page.
// At most KVM_PAGE_SIZE bytes are copied. A command whose header.size does not
// even cover the fixed fields carries no code and results in a zero-length copy
// (previously the unsigned subtraction wrapped around and a full page was
// copied from beyond the command).
// Reports 0xE2BAD4 through guest_uexit() if the user-code page is not mapped.
GUEST_CODE static noinline void
guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->vm_id;
	// Backing address in L1 for the L2 User Code (mapped at X86_SYZOS_ADDR_USER_CODE)
	uint64 l2_code_backing = l2_gpa_to_pa(cpu_id, vm_id, X86_SYZOS_ADDR_USER_CODE);
	if (!l2_code_backing) {
		guest_uexit(0xE2BAD4);
		return;
	}
	// Code size = command size - header size - vm_id size, never negative.
	const uint64 fixed_size = sizeof(struct api_call_header) + sizeof(uint64);
	uint64 l2_code_size = 0;
	if (cmd->header.size > fixed_size)
		l2_code_size = cmd->header.size - fixed_size;
	if (l2_code_size > KVM_PAGE_SIZE)
		l2_code_size = KVM_PAGE_SIZE;
	guest_memcpy((void*)l2_code_backing, (void*)cmd->insns,
		     l2_code_size);
	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
		nested_vmptrld(cpu_id, vm_id);
		// Start execution at standard User Code address
		vmwrite(VMCS_GUEST_RIP, X86_SYZOS_ADDR_USER_CODE);
		// Stack is mapped at X86_SYZOS_ADDR_STACK_BOTTOM
		vmwrite(VMCS_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8);
	} else {
		uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
		vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, X86_SYZOS_ADDR_USER_CODE);
		vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8);
	}
}
// SYZOS_API_NESTED_LOAD_SYZOS: install a SYZOS program into L2 VM cmd->vm_id.
// The payload (at most KVM_PAGE_SIZE bytes) is copied to the VM's code buffer,
// the program size is published in the L2 view of the globals page, the initial
// RDI/RAX of every possible vCPU context are set, and RIP/RSP are pointed at
// guest_main and at the top of the L2 stack page.
// A command whose header.size does not even cover the fixed fields carries no
// program and results in a zero-length copy (previously the unsigned
// subtraction wrapped around and a full page was copied from beyond the command).
// Reports 0xE2BAD3 through guest_uexit() if the globals page is not mapped in L2.
GUEST_CODE static noinline void
guest_handle_nested_load_syzos(struct api_call_nested_load_syzos* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->vm_id;
	const uint64 fixed_size = __builtin_offsetof(struct api_call_nested_load_syzos, program);
	uint64 prog_size = 0;
	if (cmd->header.size > fixed_size)
		prog_size = cmd->header.size - fixed_size;
	if (prog_size > KVM_PAGE_SIZE)
		prog_size = KVM_PAGE_SIZE;
	uint64 l2_code_backing = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
	volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
	// Copy Payload to Code buffer.
	guest_memcpy((void*)l2_code_backing, (void*)cmd->program, prog_size);
	// Populate Globals.
	uint64 globals_pa = l2_gpa_to_pa(cpu_id, vm_id, X86_SYZOS_ADDR_GLOBALS);
	if (!globals_pa) {
		guest_uexit(0xE2BAD3);
		return;
	}
	volatile struct syzos_globals* l2_globals = (volatile struct syzos_globals*)globals_pa;
	// Set initial state for ALL possible L2 VCPUs of this VM.
	for (int i = 0; i < KVM_MAX_VCPU; i++) {
		l2_globals->text_sizes[i] = prog_size;
		// RDI carries the vCPU index into the L2 entry point.
		globals->l2_ctx[i][vm_id].rdi = i;
		globals->l2_ctx[i][vm_id].rax = 0; // Default RAX
		// Note: RSP and RIP are set in the VMCB/VMCS, but they could also be in l2_ctx
		// since the shims load them if we wanted. But currently they are in VMCB/VMCS.
	}
	// Set RIP to guest_main.
	uint64 entry_rip = executor_fn_guest_addr(guest_main);
	if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
		nested_vmptrld(cpu_id, vm_id);
		vmwrite(VMCS_GUEST_RIP, entry_rip);
		vmwrite(VMCS_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8);
	} else {
		uint64 vmcb = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
		vmcb_write64(vmcb, VMCB_GUEST_RIP, entry_rip);
		vmcb_write64(vmcb, VMCB_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8);
	}
}
// Enter L2 VM vm_id on this CPU with VMLAUNCH (is_launch == true) or VMRESUME.
// The VMCS is made current, the VM is recorded in globals->active_vm_id, and
// the L2 GPRs are loaded from globals->l2_ctx before the entry instruction.
// Control comes back in one of two ways:
//  - the entry instruction fails and execution falls through to the failure
//    path, which reports 0xE2E10000 | <VM-instruction error> via guest_uexit();
//  - L2 exits, nested_vm_exit_handler_intel_asm() runs on the stack prepared
//    here and finally jumps to after_vmentry_label.
// The stack layout built by the asm ([cpu_id] [launch] [six L1 callee-saved
// registers]) is relied upon by nested_vm_exit_handler_intel() and by the exit
// stub; keep all three in sync.
GUEST_CODE static noinline void
guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch)
{
volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
struct l2_guest_regs* l2_regs = (struct l2_guest_regs*)&globals->l2_ctx[cpu_id][vm_id];
uint64 vmx_error_code = 0;
uint64 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set
nested_vmptrld(cpu_id, vm_id);
// Mark the VM as active on this CPU.
globals->active_vm_id[cpu_id] = vm_id;
// All operand registers are consumed before the L2 GPRs are loaded; RBX, RBP
// and R12-R15 are saved and restored by the asm itself (here on failure, in
// the exit stub on success), the remaining GPRs are declared as clobbered.
asm volatile(R"(
// 1. Red Zone protection.
sub $128, %%rsp
// 2. Stack Passthrough for Exit Handler.
push %[cpu_id]
push %[launch]
// 3. Save L1 callee-saved registers.
push %%rbx
push %%rbp
push %%r12
push %%r13
push %%r14
push %%r15
// 4. Update VMCS_HOST_RSP with the current stack pointer.
// This stack contains [RedZone] [cpu_id] [launch] [L1 regs].
mov %[host_rsp_field], %%r10
mov %%rsp, %%r11
vmwrite %%r11, %%r10
// 5. Load L2 GPRs from storage.
// We use RAX as a temporary base pointer.
mov %[l2_regs], %%rax
mov 8(%%rax), %%rbx
mov 16(%%rax), %%rcx
mov 24(%%rax), %%rdx
mov 32(%%rax), %%rsi
mov 40(%%rax), %%rdi
mov 48(%%rax), %%rbp
mov 56(%%rax), %%r8
mov 64(%%rax), %%r9
mov 72(%%rax), %%r10
mov 80(%%rax), %%r11
mov 88(%%rax), %%r12
mov 96(%%rax), %%r13
mov 104(%%rax), %%r14
mov 112(%%rax), %%r15
// Finally, load RAX (L2 RAX).
mov 0(%%rax), %%rax
// 6. Execute Launch or Resume.
// Check the launch flag on the stack.
// Stack offset for 'launch': [r15][r14][r13][r12][rbp][rbx] = 6*8 = 48 bytes.
cmpq $0, 48(%%rsp)
je 1f
vmlaunch
jmp 2f
1: vmresume
2: // 7. Failure path.
// Restore L1 registers to return to C.
pop %%r15
pop %%r14
pop %%r13
pop %%r12
pop %%rbp
pop %%rbx
// pop launch and cpu_id
add $16, %%rsp
// restore Red Zone
add $128, %%rsp
mov $1, %[ret]
jmp 3f
// 8. Success path (L2 Exit).
.globl after_vmentry_label
after_vmentry_label:
xor %[ret], %[ret]
3: // Final return to C.
)"
: [ret] "=&r"(fail_flag)
: [launch] "r"((uint64)is_launch),
[host_rsp_field] "i"(VMCS_HOST_RSP),
[cpu_id] "r"(cpu_id),
[l2_regs] "r"(l2_regs)
: "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11");
if (fail_flag) {
// VMLAUNCH/VMRESUME failed, so VMCS is still valid and can be read.
vmx_error_code = vmread(VMCS_VM_INSTRUCTION_ERROR);
// Only the low 32 bits of the error are OR-ed into the reported code.
guest_uexit(0xE2E10000 | (uint32)vmx_error_code);
return;
}
}
// Run L2 VM vm_id on this CPU with VMRUN and call nested_vm_exit_handler_amd()
// once the guest exits. The VM is recorded in globals->active_vm_id and the L2
// GPRs are loaded from globals->l2_ctx; L2 RAX travels through the VMCB (offset
// 0x5f8, the same slot the original code used).
//
// Stack frame built by the asm, from high to low addresses:
//   [red zone gap] [padding] [fail flag] [cpu_id] [vmcb_addr] [six L1 regs]
// and, after the guest exits, additionally:
//   [exit_code] [15 L2 GPRs]
// nested_vm_exit_handler_amd() locates cpu_id relative to the L2 register
// block, so everything from cpu_id downwards must keep this exact layout.
// The two extra 8-byte slots above cpu_id keep the total frame size a multiple
// of 16 bytes.
//
// Fixes relative to the previous version:
//  - L2 RAX was fetched through 176(%rsp) after only 15 pushes, which is the
//    cpu_id slot rather than vmcb_addr; RAX still holds vmcb_addr at that
//    point, so it is now used directly.
//  - The carry flag was stored through a compiler-chosen "=m" operand after RSP
//    had moved and all GPRs held L2 values; it now goes to a dedicated stack
//    slot and is popped into the output register at the very end.
//  - %[vmcb_addr] was read after RBX had been overwritten, which breaks if the
//    compiler places that operand in RBX; only clobbered scratch registers are
//    used now while operands are still live.
// NOTE(review): AMD documents VMRUN consistency-check failures as a #VMEXIT
// with an invalid-state exit code rather than via CF - confirm whether the
// fail_flag path can ever trigger.
GUEST_CODE static noinline void
guest_run_amd_vm(uint64 cpu_id, uint64 vm_id)
{
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
	volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
	globals->active_vm_id[cpu_id] = vm_id;
	struct l2_guest_regs* l2_regs = (struct l2_guest_regs*)&globals->l2_ctx[cpu_id][vm_id];
	uint64 fail_flag = 0;
	asm volatile(R"(
		// 1. Red Zone protection.
		sub $128, %%rsp
		// 2. Private slots: alignment padding, then the failure flag (initially 0).
		pushq $0
		pushq $0
		// 3. Stack Passthrough for Exit Handler.
		push %[cpu_id]
		// Save VMCB address for later use after VMEXIT.
		push %[vmcb_addr]
		// 4. Save L1 callee-saved registers.
		push %%rbx
		push %%rbp
		push %%r12
		push %%r13
		push %%r14
		push %%r15
		// 5. Load L2 GPRs from storage.
		// Fetch both operands before any callee-saved register is overwritten.
		mov %[l2_regs], %%rax
		mov %[vmcb_addr], %%rcx
		// Sync RAX to VMCB (guest RAX).
		mov 0(%%rax), %%rdx
		mov %%rdx, 0x5f8(%%rcx)
		mov 8(%%rax), %%rbx
		mov 16(%%rax), %%rcx
		mov 24(%%rax), %%rdx
		mov 32(%%rax), %%rsi
		mov 40(%%rax), %%rdi
		mov 48(%%rax), %%rbp
		mov 56(%%rax), %%r8
		mov 64(%%rax), %%r9
		mov 72(%%rax), %%r10
		mov 80(%%rax), %%r11
		mov 88(%%rax), %%r12
		mov 96(%%rax), %%r13
		mov 104(%%rax), %%r14
		mov 112(%%rax), %%r15
		// Note: Host State (RSP and RIP) is saved automatically by VMRUN
		// to the HSAVE area pointed to by VM_HSAVE_PA.
		// There is no need to manually write it to the VMCB.
		// 6. Execute VMRUN.
		clgi
		// VMCB address MUST be in RAX.
		// It was pushed at Index 6: 6 * 8 = 48.
		mov 48(%%rsp), %%rax
		vmrun
		// Host resumes here.
		// Restore RAX as VMRUN clobbers it. MOV leaves the flags untouched.
		mov 48(%%rsp), %%rax
		// Record CF in the failure-flag slot (Index 8: 8 * 8 = 64).
		setc 64(%%rsp)
		// 7. Save L2's GPRs.
		// exit_code (it will be at Index 15)
		pushq 0x70(%%rax)
		// Save L2 GPRs (Index 14 down to 1).
		push %%r15
		push %%r14
		push %%r13
		push %%r12
		push %%r11
		push %%r10
		push %%r9
		push %%r8
		push %%rbp
		push %%rdi
		push %%rsi
		push %%rdx
		push %%rcx
		push %%rbx
		// Save L2 RAX from VMCB (Index 0). RAX still holds vmcb_addr.
		pushq 0x5f8(%%rax)
		// 8. Call the C handler.
		// arg1 (RDI) = exit reason (at Index 15: 15 * 8 = 120 bytes)
		mov 120(%%rsp), %%rdi
		// arg2 (RSI) = pointer to the saved registers
		mov %%rsp, %%rsi
		call nested_vm_exit_handler_amd
		// 9. Restore L1 state.
		// Discard L2 GPRs (15 regs) + exit_code = 16 regs in total.
		add $128, %%rsp
		// Restore L1 callee-saved registers.
		pop %%r15
		pop %%r14
		pop %%r13
		pop %%r12
		pop %%rbp
		pop %%rbx
		// 10. Discard vmcb_addr and cpu_id.
		add $16, %%rsp
		// 11. Fetch the failure flag and drop the padding slot.
		pop %[fail_flag]
		add $8, %%rsp
		// 12. Restore Red Zone.
		add $128, %%rsp
		stgi
		after_vmentry_label_amd:
	)"
		     : [fail_flag] "=&a"(fail_flag)
		     : [cpu_id] "r"(cpu_id), [vmcb_addr] "r"(vmcb_addr), [l2_regs] "r"(l2_regs)
		     : "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11");
	if (fail_flag) {
		// VMRUN failed.
		guest_uexit(0xE2E10000 | 0xFFFF);
		return;
	}
}
// SYZOS_API_NESTED_VMLAUNCH: start L2 VM cmd->arg. Intel uses VMLAUNCH; every
// other vendor goes through guest_run_amd_vm().
GUEST_CODE static noinline void
guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->arg;
	if (get_cpu_vendor() != CPU_VENDOR_INTEL) {
		guest_run_amd_vm(cpu_id, vm_id);
		return;
	}
	guest_handle_nested_vmentry_intel(vm_id, cpu_id, true);
}
// SYZOS_API_NESTED_VMRESUME: continue L2 VM cmd->arg. Intel uses VMRESUME;
// every other vendor goes through guest_run_amd_vm(), exactly as for launch.
GUEST_CODE static noinline void
guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id)
{
	uint64 vm_id = cmd->arg;
	if (get_cpu_vendor() != CPU_VENDOR_INTEL) {
		guest_run_amd_vm(cpu_id, vm_id);
		return;
	}
	guest_handle_nested_vmentry_intel(vm_id, cpu_id, false);
}
// SYZOS_API_NESTED_INTEL_VMWRITE_MASK: read-modify-write of one VMCS field.
//   args[0] = vm_id, args[1] = field encoding,
//   args[2] = bits to set, args[3] = bits to clear, args[4] = bits to flip.
// Clearing is applied first, then setting, then flipping. No-op on non-Intel CPUs.
GUEST_CODE static noinline void
guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() != CPU_VENDOR_INTEL)
		return;
	// Make the VMCS of the requested VM current before touching its fields.
	nested_vmptrld(cpu_id, cmd->args[0]);
	uint64 field = cmd->args[1];
	uint64 to_set = cmd->args[2];
	uint64 to_clear = cmd->args[3];
	uint64 to_flip = cmd->args[4];
	uint64 old_value = vmread(field);
	vmwrite(field, ((old_value & ~to_clear) | to_set) ^ to_flip);
}
// SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK: read-modify-write of one 64-bit VMCB field.
//   args[0] = vm_id, args[1] = offset inside the VMCB,
//   args[2] = bits to set, args[3] = bits to clear, args[4] = bits to flip.
// Clearing is applied first, then setting, then flipping. No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_vmcb_write_mask(struct api_call_5* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() != CPU_VENDOR_AMD)
		return;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, cmd->args[0]);
	uint64 offset = cmd->args[1];
	uint64 to_set = cmd->args[2];
	uint64 to_clear = cmd->args[3];
	uint64 to_flip = cmd->args[4];
	uint64 old_value = vmcb_read64((volatile uint8*)vmcb_addr, offset);
	vmcb_write64(vmcb_addr, offset, ((old_value & ~to_clear) | to_set) ^ to_flip);
}
// SYZOS_API_NESTED_AMD_INVLPGA: execute INVLPGA with args[0] as the linear
// address (in RAX) and the low 32 bits of args[1] as the ASID (in ECX).
// No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_invlpga(struct api_call_2* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() == CPU_VENDOR_AMD) {
		uint64 address = cmd->args[0];
		// The ASID is passed in a 32-bit register, so the argument is truncated.
		uint32 target_asid = (uint32)cmd->args[1];
		asm volatile("invlpga" : : "a"(address), "c"(target_asid) : "memory");
	}
}
// SYZOS_API_NESTED_AMD_STGI: execute STGI. No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_stgi()
{
	if (get_cpu_vendor() == CPU_VENDOR_AMD)
		asm volatile("stgi" ::: "memory");
}
// SYZOS_API_NESTED_AMD_CLGI: execute CLGI. No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_clgi()
{
	if (get_cpu_vendor() == CPU_VENDOR_AMD)
		asm volatile("clgi" ::: "memory");
}
// SYZOS_API_NESTED_AMD_INJECT_EVENT: compose an event-injection word and store
// it at VMCB offset 0x60 of VM args[0].
//   args[1] = vector (8 bits), args[2] = type (3 bits),
//   args[3] = error code (32 bits),
//   args[4] = flags: bit 0 -> Valid (V), bit 1 -> Error Code Valid (EV).
// Resulting layout: vector 7:0, type 10:8, EV 11, V 31, error code 63:32.
// No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_inject_event(struct api_call_5* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() != CPU_VENDOR_AMD)
		return;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, cmd->args[0]);
	uint64 flags = cmd->args[4];
	uint64 valid_bit = (flags & 1) ? (1ULL << 31) : 0;
	uint64 ev_bit = (flags & 2) ? (1ULL << 11) : 0;
	uint64 event_inj = (cmd->args[1] & 0xFF) |
			   ((cmd->args[2] & 0x7) << 8) |
			   ev_bit |
			   valid_bit |
			   ((cmd->args[3] & 0xFFFFFFFF) << 32);
	// Write to VMCB Offset 0x60 (EVENTINJ)
	vmcb_write64(vmcb_addr, 0x60, event_inj);
}
// SYZOS_API_NESTED_AMD_SET_INTERCEPT: set or clear bits in a 32-bit VMCB field.
//   args[0] = vm_id, args[1] = field offset (truncated to 16 bits),
//   args[2] = bit mask (truncated to 32 bits),
//   args[3] = action: 1 sets the bits, any other value clears them.
// No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_set_intercept(struct api_call_5* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() != CPU_VENDOR_AMD)
		return;
	uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, cmd->args[0]);
	uint16 field = (uint16)cmd->args[1];
	uint32 mask = (uint32)cmd->args[2];
	bool set_bits = cmd->args[3] == 1;
	// Read 32-bit intercept field (Offsets 0x00 - 0x14 are all 32-bit vectors).
	uint32 value = vmcb_read32(vmcb_addr, field);
	value = set_bits ? (value | mask) : (value & ~mask);
	vmcb_write32(vmcb_addr, field, value);
}
// SYZOS_API_NESTED_AMD_VMLOAD: execute VMLOAD with RAX holding the VMCB address
// of VM cmd->arg. No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_vmload(struct api_call_1* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() == CPU_VENDOR_AMD) {
		uint64 vmcb_pa = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, cmd->arg);
		asm volatile("vmload %%rax" ::"a"(vmcb_pa) : "memory");
	}
}
// SYZOS_API_NESTED_AMD_VMSAVE: execute VMSAVE with RAX holding the VMCB address
// of VM cmd->arg. No-op on non-AMD CPUs.
GUEST_CODE static noinline void
guest_handle_nested_amd_vmsave(struct api_call_1* cmd, uint64 cpu_id)
{
	if (get_cpu_vendor() == CPU_VENDOR_AMD) {
		uint64 vmcb_pa = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, cmd->arg);
		asm volatile("vmsave %%rax" ::"a"(vmcb_pa) : "memory");
	}
}
#endif // EXECUTOR_COMMON_KVM_AMD64_SYZOS_H