// Copyright 2023 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <lib/arch/asm.h>
#include "register-set-asm.h"
#if defined(__aarch64__)
#define JUMP b
#elif defined(__riscv)
// If the vector extension is not already enabled, enable it at the
// assembler level.
#ifndef __riscv_v
.option arch, +v
#endif
#define JUMP j
#elif defined(__x86_64__)
#define JUMP jmp
#else
#error "what machine?"
#endif
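
// Expands the given one-argument macro once for each register index 0
// through 31.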
.macro on_32 macro
\macro 0
\macro 1
\macro 2
\macro 3
\macro 4
\macro 5
\macro 6
\macro 7
\macro 8
\macro 9
\macro 10
\macro 11
\macro 12
\macro 13
\macro 14
\macro 15
\macro 16
\macro 17
\macro 18
\macro 19
\macro 20
\macro 21
\macro 22
\macro 23
\macro 24
\macro 25
\macro 26
\macro 27
\macro 28
\macro 29
\macro 30
\macro 31
.endm
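
// An infinite loop. The spin_with_* functions below jump here after
// installing register state, presumably so the test can find the threads
// parked at a known PC.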
.function spin_address, global
JUMP spin_address
.end_function
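
// Install the general-purpose register values from the buffer passed as
// the first argument, then spin.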
.function spin_with_general_regs, global
#if defined(__x86_64__)
// Set the flags using POPF rather than SAHF, since POPF can set more flags
// than SAHF can.
pushq REGS_RFLAGS(%rdi)
.cfi_adjust_cfa_offset 8
popfq
.cfi_adjust_cfa_offset -8
// Load general purpose registers.
mov REGS_RAX(%rdi), %rax
mov REGS_RBX(%rdi), %rbx
mov REGS_RCX(%rdi), %rcx
mov REGS_RDX(%rdi), %rdx
mov REGS_RSI(%rdi), %rsi
// Skip assigning %rdi here and assign it last.
mov REGS_RBP(%rdi), %rbp
mov REGS_RSP(%rdi), %rsp
mov REGS_R8(%rdi), %r8
mov REGS_R9(%rdi), %r9
mov REGS_R10(%rdi), %r10
mov REGS_R11(%rdi), %r11
mov REGS_R12(%rdi), %r12
mov REGS_R13(%rdi), %r13
mov REGS_R14(%rdi), %r14
mov REGS_R15(%rdi), %r15
mov REGS_RDI(%rdi), %rdi
#elif defined(__aarch64__)
// Load sp via a temporary register.
ldr x1, [x0, #REGS_SP]
mov sp, x1
// Load NZCV flags, a subset of the PSTATE/CPSR register.
ldr x1, [x0, #REGS_CPSR]
msr nzcv, x1
// Load general purpose registers.
// Skip assigning x0 and x1 here and assign them last.
ldp x2, x3, [x0, REGS_X(2)]
ldp x4, x5, [x0, REGS_X(4)]
ldp x6, x7, [x0, REGS_X(6)]
ldp x8, x9, [x0, REGS_X(8)]
ldp x10, x11, [x0, REGS_X(10)]
ldp x12, x13, [x0, REGS_X(12)]
ldp x14, x15, [x0, REGS_X(14)]
ldp x16, x17, [x0, REGS_X(16)]
ldp x18, x19, [x0, REGS_X(18)]
ldp x20, x21, [x0, REGS_X(20)]
ldp x22, x23, [x0, REGS_X(22)]
ldp x24, x25, [x0, REGS_X(24)]
ldp x26, x27, [x0, REGS_X(26)]
ldp x28, x29, [x0, REGS_X(28)]
ldr x30, [x0, REGS_X(30)]
ldp x0, x1, [x0, REGS_X(0)]
#elif defined(__riscv)
ld ra, REGS_RA(a0) // x1
ld sp, REGS_SP(a0) // x2
ld gp, REGS_GP(a0) // x3
ld tp, REGS_TP(a0) // x4
ld t0, REGS_T0(a0) // x5
ld t1, REGS_T1(a0) // x6
ld t2, REGS_T2(a0) // x7
ld s0, REGS_S0(a0) // x8
ld s1, REGS_S1(a0) // x9
// The pointer is in a0, so load that last.
ld a1, REGS_A1(a0) // x11
ld a2, REGS_A2(a0) // x12
ld a3, REGS_A3(a0) // x13
ld a4, REGS_A4(a0) // x14
ld a5, REGS_A5(a0) // x15
ld a6, REGS_A6(a0) // x16
ld a7, REGS_A7(a0) // x17
ld s2, REGS_S2(a0) // x18
ld s3, REGS_S3(a0) // x19
ld s4, REGS_S4(a0) // x20
ld s5, REGS_S5(a0) // x21
ld s6, REGS_S6(a0) // x22
ld s7, REGS_S7(a0) // x23
ld s8, REGS_S8(a0) // x24
ld s9, REGS_S9(a0) // x25
ld s10, REGS_S10(a0) // x26
ld s11, REGS_S11(a0) // x27
ld t3, REGS_T3(a0) // x28
ld t4, REGS_T4(a0) // x29
ld t5, REGS_T5(a0) // x30
ld t6, REGS_T6(a0) // x31
ld a0, REGS_A0(a0) // x10
#else
#error Unsupported architecture
#endif
JUMP spin_address
.end_function
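
// Install the floating-point register values from the buffer passed as
// the first argument, then spin.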
.function spin_with_fp_regs, global
#if defined(__x86_64__)
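// Put a recognizable constant into %xmm0.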
mov $0x9999, %rax
movq %rax, %xmm0
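// Load the MMX registers; mm0-mm7 alias the low 64 bits of the x87 ST
// registers, so this fills the REGS_ST values into the FPU state.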
movq REGS_ST(0)(%rdi), %mm0
movq REGS_ST(1)(%rdi), %mm1
movq REGS_ST(2)(%rdi), %mm2
movq REGS_ST(3)(%rdi), %mm3
movq REGS_ST(4)(%rdi), %mm4
movq REGS_ST(5)(%rdi), %mm5
movq REGS_ST(6)(%rdi), %mm6
movq REGS_ST(7)(%rdi), %mm7
#elif defined(__aarch64__)
// Just spins and does nothing. ARM64 doesn't define a separate FP state,
// but keeping this function lets the rest of the code stay
// platform-independent.
#elif defined(__riscv)
.macro load_fp n
fld f\n, REGS_F(\n)(a0)
.endm
on_32 load_fp
#else
#error Unsupported architecture
#endif
JUMP spin_address
.end_function
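
// Install the vector register values from the buffer passed as the first
// argument, then spin.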
.function spin_with_vector_regs, global
#if defined(__x86_64__)
// This only loads the XMM registers, which are guaranteed to exist.
movdqu REGS_ZMM(0)(%rdi), %xmm0
movdqu REGS_ZMM(1)(%rdi), %xmm1
movdqu REGS_ZMM(2)(%rdi), %xmm2
movdqu REGS_ZMM(3)(%rdi), %xmm3
movdqu REGS_ZMM(4)(%rdi), %xmm4
movdqu REGS_ZMM(5)(%rdi), %xmm5
movdqu REGS_ZMM(6)(%rdi), %xmm6
movdqu REGS_ZMM(7)(%rdi), %xmm7
movdqu REGS_ZMM(8)(%rdi), %xmm8
movdqu REGS_ZMM(9)(%rdi), %xmm9
movdqu REGS_ZMM(10)(%rdi), %xmm10
movdqu REGS_ZMM(11)(%rdi), %xmm11
movdqu REGS_ZMM(12)(%rdi), %xmm12
movdqu REGS_ZMM(13)(%rdi), %xmm13
movdqu REGS_ZMM(14)(%rdi), %xmm14
movdqu REGS_ZMM(15)(%rdi), %xmm15
#elif defined(__aarch64__)
// FPCR and FPSR are first.
#if REGS_FPSR != REGS_FPCR + 4
#error "bad layout"
#endif
ldp w1, w2, [x0, REGS_FPCR]
msr fpcr, x1
msr fpsr, x2
// Each register is 128 bits = 16 bytes, so each pair is 32 bytes.
add x0, x0, REGS_Q(0)
ldp q0, q1, [x0], #32
ldp q2, q3, [x0], #32
ldp q4, q5, [x0], #32
ldp q6, q7, [x0], #32
ldp q8, q9, [x0], #32
ldp q10, q11, [x0], #32
ldp q12, q13, [x0], #32
ldp q14, q15, [x0], #32
ldp q16, q17, [x0], #32
ldp q18, q19, [x0], #32
ldp q20, q21, [x0], #32
ldp q22, q23, [x0], #32
ldp q24, q25, [x0], #32
ldp q26, q27, [x0], #32
ldp q28, q29, [x0], #32
ldp q30, q31, [x0]
#elif defined(__riscv)
// For indexing into the memory from which we want to load 8 vector
// registers at a time, compute 8 * VLENB.
csrr a1, vlenb
slli a1, a1, 3
addi t0, a0, RISCV64_VECTOR_STATE_V
vl8r.v v0, (t0)
add t0, t0, a1
vl8r.v v8, (t0)
add t0, t0, a1
vl8r.v v16, (t0)
add t0, t0, a1
vl8r.v v24, (t0)
lw a1, RISCV64_VECTOR_STATE_VCSR(a0)
csrw vcsr, a1
lw a1, RISCV64_VECTOR_STATE_VL(a0)
lw a2, RISCV64_VECTOR_STATE_VTYPE(a0)
vsetvl zero, a1, a2
// Any vector operation will reset `vstart`, so be sure to install this
// value last.
lw a1, RISCV64_VECTOR_STATE_VSTART(a0)
csrw vstart, a1
#else
#error Unsupported architecture
#endif
JUMP spin_address
.end_function
.function spin_with_debug_regs, global
// Do nothing. The register state will be set through syscalls, because
// setting the debug registers requires privileged instructions.
JUMP spin_address
.end_function
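
// Store the general-purpose registers into the buffer the stack pointer
// points at, then exit the thread via zx_thread_exit().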
.function save_general_regs_and_exit_thread, global
#if defined(__x86_64__)
mov %rax, REGS_RAX(%rsp)
mov %rbx, REGS_RBX(%rsp)
mov %rcx, REGS_RCX(%rsp)
mov %rdx, REGS_RDX(%rsp)
mov %rsi, REGS_RSI(%rsp)
mov %rdi, REGS_RDI(%rsp)
mov %rbp, REGS_RBP(%rsp)
mov %rsp, REGS_RSP(%rsp)
mov %r8, REGS_R8(%rsp)
mov %r9, REGS_R9(%rsp)
mov %r10, REGS_R10(%rsp)
mov %r11, REGS_R11(%rsp)
mov %r12, REGS_R12(%rsp)
mov %r13, REGS_R13(%rsp)
mov %r14, REGS_R14(%rsp)
mov %r15, REGS_R15(%rsp)
// Save the flags register.
pushfq
pop %rax
mov %rax, REGS_RFLAGS(%rsp)
// Fill out the %rip field with a known value.
lea save_general_regs_and_exit_thread(%rip), %rax
mov %rax, REGS_RIP(%rsp)
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
stp x0, x1, [sp, REGS_X(0)]
stp x2, x3, [sp, REGS_X(2)]
stp x4, x5, [sp, REGS_X(4)]
stp x6, x7, [sp, REGS_X(6)]
stp x8, x9, [sp, REGS_X(8)]
stp x10, x11, [sp, REGS_X(10)]
stp x12, x13, [sp, REGS_X(12)]
stp x14, x15, [sp, REGS_X(14)]
stp x16, x17, [sp, REGS_X(16)]
stp x18, x19, [sp, REGS_X(18)]
stp x20, x21, [sp, REGS_X(20)]
stp x22, x23, [sp, REGS_X(22)]
stp x24, x25, [sp, REGS_X(24)]
stp x26, x27, [sp, REGS_X(26)]
stp x28, x29, [sp, REGS_X(28)]
str x30, [sp, #REGS_X(30)]
// Save the sp register.
mov x0, sp
str x0, [sp, REGS_SP]
// Fill out the pc field with a known value.
adr x0, save_general_regs_and_exit_thread
str x0, [sp, REGS_PC]
// Save NZCV flags, a subset of the PSTATE/CPSR register.
mrs x0, nzcv
str x0, [sp, REGS_CPSR]
bl zx_thread_exit
brk 0
#elif defined(__riscv)
sd ra, REGS_RA(sp)
sd sp, REGS_SP(sp)
sd gp, REGS_GP(sp)
sd tp, REGS_TP(sp)
sd t0, REGS_T0(sp)
sd t1, REGS_T1(sp)
sd t2, REGS_T2(sp)
sd s0, REGS_S0(sp)
sd s1, REGS_S1(sp)
sd a0, REGS_A0(sp)
sd a1, REGS_A1(sp)
sd a2, REGS_A2(sp)
sd a3, REGS_A3(sp)
sd a4, REGS_A4(sp)
sd a5, REGS_A5(sp)
sd a6, REGS_A6(sp)
sd a7, REGS_A7(sp)
sd s2, REGS_S2(sp)
sd s3, REGS_S3(sp)
sd s4, REGS_S4(sp)
sd s5, REGS_S5(sp)
sd s6, REGS_S6(sp)
sd s7, REGS_S7(sp)
sd s8, REGS_S8(sp)
sd s9, REGS_S9(sp)
sd s10, REGS_S10(sp)
sd s11, REGS_S11(sp)
sd t3, REGS_T3(sp)
sd t4, REGS_T4(sp)
sd t5, REGS_T5(sp)
sd t6, REGS_T6(sp)
// Fill out the pc field with a known value.
lla a0, save_general_regs_and_exit_thread
sd a0, REGS_PC(sp)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function
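
// Store the floating-point registers into the buffer the stack pointer
// points at, then exit the thread.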
.function save_fp_regs_and_exit_thread, global
#if defined(__x86_64__)
movq %mm0, REGS_ST(0)(%rsp)
movq %mm1, REGS_ST(1)(%rsp)
movq %mm2, REGS_ST(2)(%rsp)
movq %mm3, REGS_ST(3)(%rsp)
movq %mm4, REGS_ST(4)(%rsp)
movq %mm5, REGS_ST(5)(%rsp)
movq %mm6, REGS_ST(6)(%rsp)
movq %mm7, REGS_ST(7)(%rsp)
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
// Does nothing (no FP values).
bl zx_thread_exit
brk 0
#elif defined(__riscv)
// Save a structure that directly matches zx_riscv64_thread_state_fp_regs_t.
li a0, -1 // Marker stored below into the upper 8 bytes of each register slot.
.macro save_fp_on_stack n
fsd f\n, REGS_F(\n)(sp)
sd a0, (REGS_F(\n)+8)(sp)
.endm
on_32 save_fp_on_stack
csrr a0, fcsr
sw a0, REGS_F(32)(sp)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function
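
// Store the vector registers into the buffer the stack pointer points at,
// then exit the thread.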
.function save_vector_regs_and_exit_thread, global
#if defined(__x86_64__)
// Each vector slot is 512 bits (64 bytes).
// We only save the first 128 bits (the XMM registers).
movdqu %xmm0, REGS_ZMM(0)(%rsp)
movdqu %xmm1, REGS_ZMM(1)(%rsp)
movdqu %xmm2, REGS_ZMM(2)(%rsp)
movdqu %xmm3, REGS_ZMM(3)(%rsp)
movdqu %xmm4, REGS_ZMM(4)(%rsp)
movdqu %xmm5, REGS_ZMM(5)(%rsp)
movdqu %xmm6, REGS_ZMM(6)(%rsp)
movdqu %xmm7, REGS_ZMM(7)(%rsp)
movdqu %xmm8, REGS_ZMM(8)(%rsp)
movdqu %xmm9, REGS_ZMM(9)(%rsp)
movdqu %xmm10, REGS_ZMM(10)(%rsp)
movdqu %xmm11, REGS_ZMM(11)(%rsp)
movdqu %xmm12, REGS_ZMM(12)(%rsp)
movdqu %xmm13, REGS_ZMM(13)(%rsp)
movdqu %xmm14, REGS_ZMM(14)(%rsp)
movdqu %xmm15, REGS_ZMM(15)(%rsp)
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
// FPCR and FPSR.
mrs x0, fpcr
mrs x1, fpsr
stp w0, w1, [sp, REGS_FPCR]
add x0, sp, REGS_Q(0)
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
stp q4, q5, [x0], #32
stp q6, q7, [x0], #32
stp q8, q9, [x0], #32
stp q10, q11, [x0], #32
stp q12, q13, [x0], #32
stp q14, q15, [x0], #32
stp q16, q17, [x0], #32
stp q18, q19, [x0], #32
stp q20, q21, [x0], #32
stp q22, q23, [x0], #32
stp q24, q25, [x0], #32
stp q26, q27, [x0], #32
stp q28, q29, [x0], #32
stp q30, q31, [x0]
bl zx_thread_exit
brk 0
#elif defined(__riscv)
// Any vector operation will reset `vstart`, so be sure to save this
// value first.
csrr a0, vstart
sd a0, RISCV64_VECTOR_STATE_VSTART(sp)
csrr a0, vcsr
sd a0, RISCV64_VECTOR_STATE_VCSR(sp)
csrr a0, vl
sd a0, RISCV64_VECTOR_STATE_VL(sp)
csrr a0, vtype
sd a0, RISCV64_VECTOR_STATE_VTYPE(sp)
// For indexing into the memory at which we want to store 8 vector registers
// at a time, compute 8 * VLENB.
csrr a1, vlenb
slli a1, a1, 3
addi a0, sp, RISCV64_VECTOR_STATE_V
vs8r.v v0, (a0)
add a0, a0, a1
vs8r.v v8, (a0)
add a0, a0, a1
vs8r.v v16, (a0)
add a0, a0, a1
vs8r.v v24, (a0)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function
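
// Copy the value stored at the thread-pointer base into the buffer the
// stack pointer points at, write a known constant back through the thread
// pointer, then exit the thread.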
.function save_thread_local_regs_and_exit_thread, global
#if defined(__x86_64__)
// Read the values stored at %fs.base and %gs.base into the output.
// The test will assert the correct values were read.
movq %fs:0, %rax
movq %rax, (%rsp)
movq %gs:0, %rax
movq %rax, 8(%rsp)
// Write constants through %fs.base and %gs.base.
// The test will assert the correct values were written.
// **NOTE:** This doesn't change the base registers themselves, only the
// memory they point to, since not all CPUs support the wrfsbase and
// wrgsbase instructions needed to change them from user code.
movq $0x12345678, %fs:0
movq $0x7890abcd, %gs:0
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
mrs x1, tpidr_el0
ldr x2, [x1]
str x2, [sp]
movlit x2, 0x12345678
str x2, [x1]
bl zx_thread_exit
brk 0
#elif defined(__riscv)
ld a0, (tp)
sd a0, (sp)
li a0, 0x12345678
sd a0, (tp)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function