// Copyright 2023 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <lib/arch/asm.h>
#include "register-set-asm.h"
#if defined(__aarch64__)
#define JUMP b
#elif defined(__riscv)
// If the vector extension is not already enabled, enable it at the
// assembler level.
#ifndef __riscv_v
.option arch, +v
#endif
#define JUMP j
#elif defined(__x86_64__)
#define JUMP jmp
#else
#error "what machine?"
#endif
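
// Expands the given one-argument macro once for each register index 0
// through 31.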
.macro on_32 macro
\macro 0
\macro 1
\macro 2
\macro 3
\macro 4
\macro 5
\macro 6
\macro 7
\macro 8
\macro 9
\macro 10
\macro 11
\macro 12
\macro 13
\macro 14
\macro 15
\macro 16
\macro 17
\macro 18
\macro 19
\macro 20
\macro 21
\macro 22
\macro 23
\macro 24
\macro 25
\macro 26
\macro 27
\macro 28
\macro 29
\macro 30
\macro 31
.endm
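
// An infinite loop. The spin_with_* functions below jump here after
// installing register state, presumably so the test can find the threads
// parked at a known PC.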
.function spin_address, global
JUMP spin_address
.end_function
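
// Install the general-purpose register values from the buffer passed as
// the first argument, then spin.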
.function spin_with_general_regs, global
#if defined(__x86_64__)
// Set the flags using POPF rather than SAHF, since POPF can set more flags
// than SAHF can.
pushq REGS_RFLAGS(%rdi)
.cfi_adjust_cfa_offset 8
popfq
.cfi_adjust_cfa_offset -8
// Load general purpose registers.
mov REGS_RAX(%rdi), %rax
mov REGS_RBX(%rdi), %rbx
mov REGS_RCX(%rdi), %rcx
mov REGS_RDX(%rdi), %rdx
mov REGS_RSI(%rdi), %rsi
// Skip assigning %rdi here and assign it last.
mov REGS_RBP(%rdi), %rbp
mov REGS_RSP(%rdi), %rsp
mov REGS_R8(%rdi), %r8
mov REGS_R9(%rdi), %r9
mov REGS_R10(%rdi), %r10
mov REGS_R11(%rdi), %r11
mov REGS_R12(%rdi), %r12
mov REGS_R13(%rdi), %r13
mov REGS_R14(%rdi), %r14
mov REGS_R15(%rdi), %r15
mov REGS_RDI(%rdi), %rdi
#elif defined(__aarch64__)
// Load sp via a temporary register.
ldr x1, [x0, #REGS_SP]
mov sp, x1
// Load NZCV flags, a subset of the PSTATE/CPSR register.
ldr x1, [x0, #REGS_CPSR]
msr nzcv, x1
// Load general purpose registers.
// Skip assigning x0 and x1 here and assign them last.
ldp x2, x3, [x0, REGS_X(2)]
ldp x4, x5, [x0, REGS_X(4)]
ldp x6, x7, [x0, REGS_X(6)]
ldp x8, x9, [x0, REGS_X(8)]
ldp x10, x11, [x0, REGS_X(10)]
ldp x12, x13, [x0, REGS_X(12)]
ldp x14, x15, [x0, REGS_X(14)]
ldp x16, x17, [x0, REGS_X(16)]
ldp x18, x19, [x0, REGS_X(18)]
ldp x20, x21, [x0, REGS_X(20)]
ldp x22, x23, [x0, REGS_X(22)]
ldp x24, x25, [x0, REGS_X(24)]
ldp x26, x27, [x0, REGS_X(26)]
ldp x28, x29, [x0, REGS_X(28)]
ldr x30, [x0, REGS_X(30)]
ldp x0, x1, [x0, REGS_X(0)]
#elif defined(__riscv)
ld ra, REGS_RA(a0) // x1
ld sp, REGS_SP(a0) // x2
ld gp, REGS_GP(a0) // x3
ld tp, REGS_TP(a0) // x4
ld t0, REGS_T0(a0) // x5
ld t1, REGS_T1(a0) // x6
ld t2, REGS_T2(a0) // x7
ld s0, REGS_S0(a0) // x8
ld s1, REGS_S1(a0) // x9
// The pointer is in a0, so load that last.
ld a1, REGS_A1(a0) // x11
ld a2, REGS_A2(a0) // x12
ld a3, REGS_A3(a0) // x13
ld a4, REGS_A4(a0) // x14
ld a5, REGS_A5(a0) // x15
ld a6, REGS_A6(a0) // x16
ld a7, REGS_A7(a0) // x17
ld s2, REGS_S2(a0) // x18
ld s3, REGS_S3(a0) // x19
ld s4, REGS_S4(a0) // x20
ld s5, REGS_S5(a0) // x21
ld s6, REGS_S6(a0) // x22
ld s7, REGS_S7(a0) // x23
ld s8, REGS_S8(a0) // x24
ld s9, REGS_S9(a0) // x25
ld s10, REGS_S10(a0) // x26
ld s11, REGS_S11(a0) // x27
ld t3, REGS_T3(a0) // x28
ld t4, REGS_T4(a0) // x29
ld t5, REGS_T5(a0) // x30
ld t6, REGS_T6(a0) // x31
ld a0, REGS_A0(a0) // x10
#else
#error Unsupported architecture
#endif
JUMP spin_address
.end_function
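
// Install the floating-point register values from the buffer passed as
// the first argument, then spin.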
.function spin_with_fp_regs, global
#if defined(__x86_64__)
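// Put a recognizable constant into %xmm0.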
mov $0x9999, %rax
movq %rax, %xmm0
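// Load the MMX registers; mm0-mm7 alias the low 64 bits of the x87 ST
// registers, so this fills the REGS_ST values into the FPU state.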
movq REGS_ST(0)(%rdi), %mm0
movq REGS_ST(1)(%rdi), %mm1
movq REGS_ST(2)(%rdi), %mm2
movq REGS_ST(3)(%rdi), %mm3
movq REGS_ST(4)(%rdi), %mm4
movq REGS_ST(5)(%rdi), %mm5
movq REGS_ST(6)(%rdi), %mm6
movq REGS_ST(7)(%rdi), %mm7
#elif defined(__aarch64__)
// Just spins and does nothing. ARM64 doesn't define a separate FP state,
// but keeping this function lets the rest of the code stay
// platform-independent.
#elif defined(__riscv)
.macro load_fp n
fld f\n, REGS_F(\n)(a0)
.endm
on_32 load_fp
#else
#error Unsupported architecture
#endif
JUMP spin_address
.end_function
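
// Install the vector register values from the buffer passed as the first
// argument, then spin.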
.function spin_with_vector_regs, global
#if defined(__x86_64__)
// This only loads the XMM registers, which are guaranteed to exist.
movdqu REGS_ZMM(0)(%rdi), %xmm0
movdqu REGS_ZMM(1)(%rdi), %xmm1
movdqu REGS_ZMM(2)(%rdi), %xmm2
movdqu REGS_ZMM(3)(%rdi), %xmm3
movdqu REGS_ZMM(4)(%rdi), %xmm4
movdqu REGS_ZMM(5)(%rdi), %xmm5
movdqu REGS_ZMM(6)(%rdi), %xmm6
movdqu REGS_ZMM(7)(%rdi), %xmm7
movdqu REGS_ZMM(8)(%rdi), %xmm8
movdqu REGS_ZMM(9)(%rdi), %xmm9
movdqu REGS_ZMM(10)(%rdi), %xmm10
movdqu REGS_ZMM(11)(%rdi), %xmm11
movdqu REGS_ZMM(12)(%rdi), %xmm12
movdqu REGS_ZMM(13)(%rdi), %xmm13
movdqu REGS_ZMM(14)(%rdi), %xmm14
movdqu REGS_ZMM(15)(%rdi), %xmm15
#elif defined(__aarch64__)
// FPCR and FPSR are first.
#if REGS_FPSR != REGS_FPCR + 4
#error "bad layout"
#endif
ldp w1, w2, [x0, REGS_FPCR]
msr fpcr, x1
msr fpsr, x2
// Each register is 128 bits = 16 bytes, so each pair is 32 bytes.
add x0, x0, REGS_Q(0)
ldp q0, q1, [x0], #32
ldp q2, q3, [x0], #32
ldp q4, q5, [x0], #32
ldp q6, q7, [x0], #32
ldp q8, q9, [x0], #32
ldp q10, q11, [x0], #32
ldp q12, q13, [x0], #32
ldp q14, q15, [x0], #32
ldp q16, q17, [x0], #32
ldp q18, q19, [x0], #32
ldp q20, q21, [x0], #32
ldp q22, q23, [x0], #32
ldp q24, q25, [x0], #32
ldp q26, q27, [x0], #32
ldp q28, q29, [x0], #32
ldp q30, q31, [x0]
#elif defined(__riscv)
// For indexing into the memory from which we want to load 8 vector
// registers at a time, compute 8 * VLENB.
csrr a1, vlenb
slli a1, a1, 3
addi t0, a0, RISCV64_VECTOR_STATE_V
vl8r.v v0, (t0)
add t0, t0, a1
vl8r.v v8, (t0)
add t0, t0, a1
vl8r.v v16, (t0)
add t0, t0, a1
vl8r.v v24, (t0)
lw a1, RISCV64_VECTOR_STATE_VCSR(a0)
csrw vcsr, a1
lw a1, RISCV64_VECTOR_STATE_VL(a0)
lw a2, RISCV64_VECTOR_STATE_VTYPE(a0)
vsetvl zero, a1, a2
// Any vector operation will reset `vstart`, so be sure to install this
// value last.
lw a1, RISCV64_VECTOR_STATE_VSTART(a0)
csrw vstart, a1
#else
#error Unsupported architecture
#endif
JUMP spin_address
.end_function
.function spin_with_debug_regs, global
// Do nothing. The register state will be set through syscalls, because
// setting the debug registers requires privileged instructions.
JUMP spin_address
.end_function
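
// Store the general-purpose registers into the buffer the stack pointer
// points at, then exit the thread via zx_thread_exit().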
.function save_general_regs_and_exit_thread, global
#if defined(__x86_64__)
mov %rax, REGS_RAX(%rsp)
mov %rbx, REGS_RBX(%rsp)
mov %rcx, REGS_RCX(%rsp)
mov %rdx, REGS_RDX(%rsp)
mov %rsi, REGS_RSI(%rsp)
mov %rdi, REGS_RDI(%rsp)
mov %rbp, REGS_RBP(%rsp)
mov %rsp, REGS_RSP(%rsp)
mov %r8, REGS_R8(%rsp)
mov %r9, REGS_R9(%rsp)
mov %r10, REGS_R10(%rsp)
mov %r11, REGS_R11(%rsp)
mov %r12, REGS_R12(%rsp)
mov %r13, REGS_R13(%rsp)
mov %r14, REGS_R14(%rsp)
mov %r15, REGS_R15(%rsp)
// Save the flags register.
pushfq
pop %rax
mov %rax, REGS_RFLAGS(%rsp)
// Fill out the %rip field with a known value.
lea save_general_regs_and_exit_thread(%rip), %rax
mov %rax, REGS_RIP(%rsp)
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
stp x0, x1, [sp, REGS_X(0)]
stp x2, x3, [sp, REGS_X(2)]
stp x4, x5, [sp, REGS_X(4)]
stp x6, x7, [sp, REGS_X(6)]
stp x8, x9, [sp, REGS_X(8)]
stp x10, x11, [sp, REGS_X(10)]
stp x12, x13, [sp, REGS_X(12)]
stp x14, x15, [sp, REGS_X(14)]
stp x16, x17, [sp, REGS_X(16)]
stp x18, x19, [sp, REGS_X(18)]
stp x20, x21, [sp, REGS_X(20)]
stp x22, x23, [sp, REGS_X(22)]
stp x24, x25, [sp, REGS_X(24)]
stp x26, x27, [sp, REGS_X(26)]
stp x28, x29, [sp, REGS_X(28)]
str x30, [sp, #REGS_X(30)]
// Save the sp register.
mov x0, sp
str x0, [sp, REGS_SP]
// Fill out the pc field with a known value.
adr x0, save_general_regs_and_exit_thread
str x0, [sp, REGS_PC]
// Save NZCV flags, a subset of the PSTATE/CPSR register.
mrs x0, nzcv
str x0, [sp, REGS_CPSR]
bl zx_thread_exit
brk 0
#elif defined(__riscv)
sd ra, REGS_RA(sp)
sd sp, REGS_SP(sp)
sd gp, REGS_GP(sp)
sd tp, REGS_TP(sp)
sd t0, REGS_T0(sp)
sd t1, REGS_T1(sp)
sd t2, REGS_T2(sp)
sd s0, REGS_S0(sp)
sd s1, REGS_S1(sp)
sd a0, REGS_A0(sp)
sd a1, REGS_A1(sp)
sd a2, REGS_A2(sp)
sd a3, REGS_A3(sp)
sd a4, REGS_A4(sp)
sd a5, REGS_A5(sp)
sd a6, REGS_A6(sp)
sd a7, REGS_A7(sp)
sd s2, REGS_S2(sp)
sd s3, REGS_S3(sp)
sd s4, REGS_S4(sp)
sd s5, REGS_S5(sp)
sd s6, REGS_S6(sp)
sd s7, REGS_S7(sp)
sd s8, REGS_S8(sp)
sd s9, REGS_S9(sp)
sd s10, REGS_S10(sp)
sd s11, REGS_S11(sp)
sd t3, REGS_T3(sp)
sd t4, REGS_T4(sp)
sd t5, REGS_T5(sp)
sd t6, REGS_T6(sp)
// Fill out the pc field with a known value.
lla a0, save_general_regs_and_exit_thread
sd a0, REGS_PC(sp)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function
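
// Store the floating-point registers into the buffer the stack pointer
// points at, then exit the thread.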
.function save_fp_regs_and_exit_thread, global
#if defined(__x86_64__)
movq %mm0, REGS_ST(0)(%rsp)
movq %mm1, REGS_ST(1)(%rsp)
movq %mm2, REGS_ST(2)(%rsp)
movq %mm3, REGS_ST(3)(%rsp)
movq %mm4, REGS_ST(4)(%rsp)
movq %mm5, REGS_ST(5)(%rsp)
movq %mm6, REGS_ST(6)(%rsp)
movq %mm7, REGS_ST(7)(%rsp)
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
// Does nothing (no FP values).
bl zx_thread_exit
brk 0
#elif defined(__riscv)
// Save a structure that directly matches zx_riscv64_thread_state_fp_regs_t.
li a0, -1 // Marker stored below into the upper 8 bytes of each register slot.
.macro save_fp_on_stack n
fsd f\n, REGS_F(\n)(sp)
sd a0, (REGS_F(\n)+8)(sp)
.endm
on_32 save_fp_on_stack
csrr a0, fcsr
sw a0, REGS_F(32)(sp)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function
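
// Store the vector registers into the buffer the stack pointer points at,
// then exit the thread.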
.function save_vector_regs_and_exit_thread, global
#if defined(__x86_64__)
// Each vector slot is 512 bits (64 bytes).
// We only save the first 128 bits (the XMM registers).
movdqu %xmm0, REGS_ZMM(0)(%rsp)
movdqu %xmm1, REGS_ZMM(1)(%rsp)
movdqu %xmm2, REGS_ZMM(2)(%rsp)
movdqu %xmm3, REGS_ZMM(3)(%rsp)
movdqu %xmm4, REGS_ZMM(4)(%rsp)
movdqu %xmm5, REGS_ZMM(5)(%rsp)
movdqu %xmm6, REGS_ZMM(6)(%rsp)
movdqu %xmm7, REGS_ZMM(7)(%rsp)
movdqu %xmm8, REGS_ZMM(8)(%rsp)
movdqu %xmm9, REGS_ZMM(9)(%rsp)
movdqu %xmm10, REGS_ZMM(10)(%rsp)
movdqu %xmm11, REGS_ZMM(11)(%rsp)
movdqu %xmm12, REGS_ZMM(12)(%rsp)
movdqu %xmm13, REGS_ZMM(13)(%rsp)
movdqu %xmm14, REGS_ZMM(14)(%rsp)
movdqu %xmm15, REGS_ZMM(15)(%rsp)
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
// FPCR and FPSR.
mrs x0, fpcr
mrs x1, fpsr
stp w0, w1, [sp, REGS_FPCR]
add x0, sp, REGS_Q(0)
stp q0, q1, [x0], #32
stp q2, q3, [x0], #32
stp q4, q5, [x0], #32
stp q6, q7, [x0], #32
stp q8, q9, [x0], #32
stp q10, q11, [x0], #32
stp q12, q13, [x0], #32
stp q14, q15, [x0], #32
stp q16, q17, [x0], #32
stp q18, q19, [x0], #32
stp q20, q21, [x0], #32
stp q22, q23, [x0], #32
stp q24, q25, [x0], #32
stp q26, q27, [x0], #32
stp q28, q29, [x0], #32
stp q30, q31, [x0]
bl zx_thread_exit
brk 0
#elif defined(__riscv)
// Any vector operation will reset `vstart`, so be sure to save this
// value first.
csrr a0, vstart
sd a0, RISCV64_VECTOR_STATE_VSTART(sp)
csrr a0, vcsr
sd a0, RISCV64_VECTOR_STATE_VCSR(sp)
csrr a0, vl
sd a0, RISCV64_VECTOR_STATE_VL(sp)
csrr a0, vtype
sd a0, RISCV64_VECTOR_STATE_VTYPE(sp)
// For indexing into the memory at which we want to store 8 vector registers
// at a time, compute 8 * VLENB.
csrr a1, vlenb
slli a1, a1, 3
addi a0, sp, RISCV64_VECTOR_STATE_V
vs8r.v v0, (a0)
add a0, a0, a1
vs8r.v v8, (a0)
add a0, a0, a1
vs8r.v v16, (a0)
add a0, a0, a1
vs8r.v v24, (a0)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function
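
// Copy the value stored at the thread-pointer base into the buffer the
// stack pointer points at, write a known constant back through the thread
// pointer, then exit the thread.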
.function save_thread_local_regs_and_exit_thread, global
#if defined(__x86_64__)
// Read the values stored at %fs.base and %gs.base into the output.
// The test will assert the correct values were read.
movq %fs:0, %rax
movq %rax, (%rsp)
movq %gs:0, %rax
movq %rax, 8(%rsp)
// Write constants through %fs.base and %gs.base.
// The test will assert the correct values were written.
// **NOTE:** This doesn't change the base registers themselves, only the
// memory they point to, since not all CPUs support the wrfsbase and
// wrgsbase instructions needed to change them from user code.
movq $0x12345678, %fs:0
movq $0x7890abcd, %gs:0
jmp zx_thread_exit@PLT
ud2
#elif defined(__aarch64__)
mrs x1, tpidr_el0
ldr x2, [x1]
str x2, [sp]
movlit x2, 0x12345678
str x2, [x1]
bl zx_thread_exit
brk 0
#elif defined(__riscv)
ld a0, (tp)
sd a0, (sp)
li a0, 0x12345678
sd a0, (tp)
call zx_thread_exit
unimp
#else
#error Unsupported architecture
#endif
.end_function