// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <assert.h>
#include <math.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <unittest/unittest.h>
#include "register-set.h"
namespace {
// Write a NaN double value to the given uint64_t (which is how most of the
// registers are stored in the structs).
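// A NaN makes a good sentinel: its bit pattern (all-ones exponent field) is
// never produced by the small integer fills used below, so a register that
// round-trips it must be preserving raw bits rather than canonicalizing values.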
void WriteNaNDouble(uint64_t* output) {
double nan_value = nan("");
memcpy(output, &nan_value, sizeof(double));
}
} // namespace
// Fill Test Values -------------------------------------------------------------------------------
void general_regs_fill_test_values(zx_thread_state_general_regs_t* regs) {
for (size_t index = 0; index < sizeof(*regs); ++index) {
reinterpret_cast<uint8_t*>(regs)[index] = static_cast<uint8_t>(index + 1);
}
// Set various flags bits that will read back the same.
#if defined(__x86_64__)
// Here we set all flag bits that are modifiable from user space or
// that are not modifiable but are expected to read back as 1, with the
// exception of the trap flag (bit 8, which would interfere with
// execution if we set it).
//
// Note that setting the direction flag (bit 10) helps test whether the
// kernel correctly handles taking an interrupt when that flag is set
// (see ZX-998).
regs->rflags =
(1 << 0) | // CF: carry flag
(1 << 1) | // Reserved, always 1
(1 << 2) | // PF: parity flag
(1 << 4) | // AF: adjust flag
(1 << 6) | // ZF: zero flag
(1 << 7) | // SF: sign flag
(1 << 9) | // IF: interrupt enable flag (set by kernel)
(1 << 10) | // DF: direction flag
(1 << 11) | // OF: overflow flag
(1 << 14) | // NT: nested task flag
(1 << 18) | // AC: alignment check flag
(1 << 21); // ID: used for testing for CPUID support
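// Altogether, this sets rflags to 0x244ed7.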
#elif defined(__aarch64__)
// Only set the 4 flag bits that are readable and writable by the
// instructions "msr nzcv, REG" and "mrs REG, nzcv".
regs->cpsr = 0xf0000000;
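// 0xf0000000 sets N (bit 31), Z (bit 30), C (bit 29), and V (bit 28).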
#endif
}
void fp_regs_fill_test_values(zx_thread_state_fp_regs_t* regs) {
memset(regs, 0, sizeof(zx_thread_state_fp_regs_t));
#if defined(__x86_64__)
for (size_t i = 0; i < 7; i++)
regs->st[i].low = i;
// Write NaN to the last value.
WriteNaNDouble(&regs->st[7].low);
#elif defined(__aarch64__)
// No FP struct on ARM (vector only).
#else
#error Unsupported architecture
#endif
}
void vector_regs_fill_test_values(zx_thread_state_vector_regs_t* regs) {
memset(regs, 0, sizeof(zx_thread_state_vector_regs_t));
#if defined(__x86_64__)
for (uint64_t i = 0; i < 16; i++) {
// Only set v[0] and v[1] (the low 128 bits, i.e. the xmm portion), since
// that's all that's guaranteed to exist.
regs->zmm[i].v[0] = i;
regs->zmm[i].v[1] = i << 8;
regs->zmm[i].v[2] = 0;
regs->zmm[i].v[3] = 0;
}
// Write NaN to the last value.
WriteNaNDouble(&regs->zmm[15].v[0]);
#elif defined(__aarch64__)
for (uint64_t i = 0; i < 32; i++) {
regs->v[i].low = i;
regs->v[i].high = i << 8;
}
// Write NaN to the last value.
WriteNaNDouble(&regs->v[31].low);
#else
#error Unsupported architecture
#endif
}
void debug_regs_fill_test_values(zx_thread_state_debug_regs_t* to_write,
zx_thread_state_debug_regs_t* expected) {
#if defined(__x86_64__)
// The kernel will validate that the addresses written to the debug registers
// are valid userspace addresses. We use values relative to this function, as
// it is guaranteed to be in the userspace range.
uint64_t base = reinterpret_cast<uint64_t>(debug_regs_fill_test_values);
to_write->dr[0] = base;
to_write->dr[1] = base + 0x4000;
to_write->dr[2] = base + 0x8000;
to_write->dr[3] = 0x0; // Zero is also valid.
to_write->dr6 = 0;
to_write->dr7 = 0x33; // Enable breakpoints 0 and 2 (local and global enable bits).
expected->dr[0] = base;
expected->dr[1] = base + 0x4000;
expected->dr[2] = base + 0x8000;
expected->dr[3] = 0x0;
expected->dr6 = 0xffff0ff0; // No breakpoint event detected.
expected->dr7 = 0x733; // The written value plus the bits the kernel forces on.
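// 0x733 is the 0x33 written above plus LE (bit 8) and GE (bit 9), which the
// kernel turns on, and DR7 bit 10, which is reserved and always reads back as 1.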
#elif defined(__aarch64__)
// TODO(donosoc): Support arm64 debug registers.
#else
#error Unsupported architecture
#endif
}
// Expect Eq Functions ----------------------------------------------------------------------------
bool general_regs_expect_eq(const zx_thread_state_general_regs_t& regs1,
const zx_thread_state_general_regs_t& regs2) {
BEGIN_HELPER;
#define CHECK_REG(FIELD) EXPECT_EQ(regs1.FIELD, regs2.FIELD, "Reg " #FIELD)
#if defined(__x86_64__)
CHECK_REG(rax);
CHECK_REG(rbx);
CHECK_REG(rcx);
CHECK_REG(rdx);
CHECK_REG(rsi);
CHECK_REG(rdi);
CHECK_REG(rbp);
CHECK_REG(rsp);
CHECK_REG(r8);
CHECK_REG(r9);
CHECK_REG(r10);
CHECK_REG(r11);
CHECK_REG(r12);
CHECK_REG(r13);
CHECK_REG(r14);
CHECK_REG(r15);
CHECK_REG(rip);
CHECK_REG(rflags);
#elif defined(__aarch64__)
for (int regnum = 0; regnum < 30; ++regnum) {
char name[10];
snprintf(name, sizeof(name), "Reg r[%d]", regnum);
EXPECT_EQ(regs1.r[regnum], regs2.r[regnum], name);
}
CHECK_REG(lr);
CHECK_REG(sp);
CHECK_REG(pc);
CHECK_REG(cpsr);
#else
#error Unsupported architecture
#endif
#undef CHECK_REG
END_HELPER;
}
bool fp_regs_expect_eq(const zx_thread_state_fp_regs_t& regs1,
const zx_thread_state_fp_regs_t& regs2) {
#if defined(__x86_64__)
BEGIN_HELPER;
// Only check the low 64 bits of each x87 register (the MMX aliases), since
// the test values only populate the low halves.
for (int i = 0; i < 8; i++) {
char name[20];
snprintf(name, sizeof(name), "Reg st[%d].low", i);
EXPECT_EQ(regs1.st[i].low, regs2.st[i].low, name);
}
END_HELPER;
#elif defined(__aarch64__)
// No FP regs on ARM (uses vector regs for FP).
(void)regs1;
(void)regs2;
return true;
#else
#error Unsupported architecture
#endif
}
bool vector_regs_expect_eq(const zx_thread_state_vector_regs_t& regs1,
const zx_thread_state_vector_regs_t& regs2) {
BEGIN_HELPER;
#if defined(__x86_64__)
// Only check the first 16 registers (guaranteed to work).
for (int reg = 0; reg < 16; reg++) {
// Only check the low 128 bits (guaranteed to work).
EXPECT_EQ(regs1.zmm[reg].v[0], regs2.zmm[reg].v[0]);
EXPECT_EQ(regs1.zmm[reg].v[1], regs2.zmm[reg].v[1]);
}
#elif defined(__aarch64__)
for (int i = 0; i < 32; i++) {
EXPECT_EQ(regs1.v[i].high, regs2.v[i].high);
EXPECT_EQ(regs1.v[i].low, regs2.v[i].low);
}
#else
#error Unsupported architecture
#endif
END_HELPER;
}
bool debug_regs_expect_eq(const char* file, int line,
const zx_thread_state_debug_regs_t& regs1,
const zx_thread_state_debug_regs_t& regs2) {
#if defined(__x86_64__)
char buf[1024];
BEGIN_HELPER;
snprintf(buf, sizeof(buf), "%s:%d: %s", file, line, "Reg DR0");
EXPECT_EQ(regs1.dr[0], regs2.dr[0], buf);
snprintf(buf, sizeof(buf), "%s:%d: %s", file, line, "Reg DR1");
EXPECT_EQ(regs1.dr[1], regs2.dr[1], buf);
snprintf(buf, sizeof(buf), "%s:%d: %s", file, line, "Reg DR2");
EXPECT_EQ(regs1.dr[2], regs2.dr[2], buf);
snprintf(buf, sizeof(buf), "%s:%d: %s", file, line, "Reg DR3");
EXPECT_EQ(regs1.dr[3], regs2.dr[3], buf);
snprintf(buf, sizeof(buf), "%s:%d: %s", file, line, "Reg DR6");
EXPECT_EQ(regs1.dr6, regs2.dr6, buf);
snprintf(buf, sizeof(buf), "%s:%d: %s", file, line, "Reg DR7");
EXPECT_EQ(regs1.dr7, regs2.dr7, buf);
END_HELPER;
#elif defined(__aarch64__)
// TODO(donosoc): Write the debug register support.
(void)regs1;
(void)regs2;
return true;
#else
#error Unsupported architecture
#endif
}
// Spin Functions --------------------------------------------------------------------------------
// spin_with_general_regs() function.
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_general_regs_t, rax) == 8 * 0, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rbx) == 8 * 1, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rcx) == 8 * 2, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rdx) == 8 * 3, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rsi) == 8 * 4, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rdi) == 8 * 5, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rbp) == 8 * 6, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rsp) == 8 * 7, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r8) == 8 * 8, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r9) == 8 * 9, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r10) == 8 * 10, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r11) == 8 * 11, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r12) == 8 * 12, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r13) == 8 * 13, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r14) == 8 * 14, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r15) == 8 * 15, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rip) == 8 * 16, "");
static_assert(offsetof(zx_thread_state_general_regs_t, rflags) == 8 * 17, "");
static_assert(sizeof(zx_thread_state_general_regs_t) == 8 * 18, "");
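// The assembly below hard-codes these offsets, so pin the layout down at
// compile time.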
__asm__(".pushsection .text, \"ax\", @progbits\n"
".global spin_with_general_regs\n"
"spin_with_general_regs:\n"
// Set flags using POPF. Note that we use POPF rather than SAHF
// because POPF is able to set more flags than SAHF.
"pushq 8*17(%rdi)\n"
"popfq\n"
// Load general purpose registers.
"movq 8*0(%rdi), %rax\n"
"movq 8*1(%rdi), %rbx\n"
"movq 8*2(%rdi), %rcx\n"
"movq 8*3(%rdi), %rdx\n"
"movq 8*4(%rdi), %rsi\n"
// Skip assigning rdi here and assign it last.
"movq 8*6(%rdi), %rbp\n"
"movq 8*7(%rdi), %rsp\n"
"movq 8*8(%rdi), %r8\n"
"movq 8*9(%rdi), %r9\n"
"movq 8*10(%rdi), %r10\n"
"movq 8*11(%rdi), %r11\n"
"movq 8*12(%rdi), %r12\n"
"movq 8*13(%rdi), %r13\n"
"movq 8*14(%rdi), %r14\n"
"movq 8*15(%rdi), %r15\n"
"movq 8*5(%rdi), %rdi\n"
".global spin_with_general_regs_spin_address\n"
"spin_with_general_regs_spin_address:\n"
"jmp spin_with_general_regs_spin_address\n"
".popsection\n");
#elif defined(__aarch64__)
static_assert(offsetof(zx_thread_state_general_regs_t, r[0]) == 8 * 0, "");
static_assert(offsetof(zx_thread_state_general_regs_t, r[1]) == 8 * 1, "");
static_assert(offsetof(zx_thread_state_general_regs_t, lr) == 8 * 30, "");
static_assert(offsetof(zx_thread_state_general_regs_t, sp) == 8 * 31, "");
static_assert(offsetof(zx_thread_state_general_regs_t, pc) == 8 * 32, "");
static_assert(offsetof(zx_thread_state_general_regs_t, cpsr) == 8 * 33, "");
static_assert(sizeof(zx_thread_state_general_regs_t) == 8 * 34, "");
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global spin_with_general_regs\n"
"spin_with_general_regs:\n"
// Load sp via a temporary register.
"ldr x1, [x0, #8*31]\n"
"mov sp, x1\n"
// Load NZCV flags, a subset of the PSTATE/CPSR register.
"ldr x1, [x0, #8*33]\n"
"msr nzcv, x1\n"
// Load general purpose registers.
// Skip assigning x0 and x1 here and assign them last.
"ldp x2, x3, [x0, #8*2]\n"
"ldp x4, x5, [x0, #8*4]\n"
"ldp x6, x7, [x0, #8*6]\n"
"ldp x8, x9, [x0, #8*8]\n"
"ldp x10, x11, [x0, #8*10]\n"
"ldp x12, x13, [x0, #8*12]\n"
"ldp x14, x15, [x0, #8*14]\n"
"ldp x16, x17, [x0, #8*16]\n"
"ldp x18, x19, [x0, #8*18]\n"
"ldp x20, x21, [x0, #8*20]\n"
"ldp x22, x23, [x0, #8*22]\n"
"ldp x24, x25, [x0, #8*24]\n"
"ldp x26, x27, [x0, #8*26]\n"
"ldp x28, x29, [x0, #8*28]\n"
"ldr x30, [x0, #8*30]\n"
"ldp x0, x1, [x0]\n"
".global spin_with_general_regs_spin_address\n"
"spin_with_general_regs_spin_address:\n"
"b spin_with_general_regs_spin_address\n"
".popsection\n");
#else
#error Unsupported architecture
#endif
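// Sketch of the expected use of these spin functions (an assumption based on
// the exported symbols, not something this file enforces): a test fills a
// zx_thread_state_general_regs_t, starts a bare thread at
// spin_with_general_regs with the struct's address as its argument, waits for
// it to reach spin_with_general_regs_spin_address, reads the live state back
// (e.g. with zx_thread_read_state() and ZX_THREAD_STATE_GENERAL_REGS), and
// compares it against the original with general_regs_expect_eq().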
// spin_with_fp_regs() function.
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_fp_regs_t, fcw) == 0, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fsw) == 2, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, ftw) == 4, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fop) == 6, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fip) == 8, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, fdp) == 16, "");
static_assert(offsetof(zx_thread_state_fp_regs_t, st) == 32, "");
__asm__(".pushsection .text, \"ax\", @progbits\n"
".global spin_with_fp_regs\n"
"spin_with_fp_regs:\n"
// Advance rdi to &regs->st[0] (offset 32; see the static_assert above).
"lea 32(%rdi), %rdi\n"
"movq $0x9999, %rax\n"
"movq %rax, %xmm0\n"
"movq 16*0(%rdi), %mm0\n"
"movq 16*1(%rdi), %mm1\n"
"movq 16*2(%rdi), %mm2\n"
"movq 16*3(%rdi), %mm3\n"
"movq 16*4(%rdi), %mm4\n"
"movq 16*5(%rdi), %mm5\n"
"movq 16*6(%rdi), %mm6\n"
"movq 16*7(%rdi), %mm7\n"
"spin_with_fp_regs_spin_address:\n"
"jmp spin_with_fp_regs_spin_address\n"
".popsection\n");
#elif defined(__aarch64__)
// Just spins and does nothing. ARM64 doesn't define a separate FP state, but doing this allows the
// rest of the code to be platform-independent.
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global spin_with_fp_regs\n"
"spin_with_fp_regs:\n"
// Do nothing.
"spin_with_fp_regs_spin_address:\n"
"b spin_with_fp_regs_spin_address\n"
".popsection\n");
#else
#error Unsupported architecture
#endif
// spin_with_vector_regs() function.
#if defined(__x86_64__)
__asm__(".pushsection .text, \"ax\", @progbits\n"
".global spin_with_vector_regs\n"
"spin_with_vector_regs:\n"
// On entry, rdi points at zmm[0]. Only the xmm registers are loaded, since
// those are all that's guaranteed to exist.
// Each zmm input is 512 bits = 64 bytes.
"movdqu 64*0(%rdi), %xmm0\n"
"movdqu 64*1(%rdi), %xmm1\n"
"movdqu 64*2(%rdi), %xmm2\n"
"movdqu 64*3(%rdi), %xmm3\n"
"movdqu 64*4(%rdi), %xmm4\n"
"movdqu 64*5(%rdi), %xmm5\n"
"movdqu 64*6(%rdi), %xmm6\n"
"movdqu 64*7(%rdi), %xmm7\n"
"movdqu 64*8(%rdi), %xmm8\n"
"movdqu 64*9(%rdi), %xmm9\n"
"movdqu 64*10(%rdi), %xmm10\n"
"movdqu 64*11(%rdi), %xmm11\n"
"movdqu 64*12(%rdi), %xmm12\n"
"movdqu 64*13(%rdi), %xmm13\n"
"movdqu 64*14(%rdi), %xmm14\n"
"movdqu 64*15(%rdi), %xmm15\n"
"spin_with_vector_regs_spin_address:\n"
"jmp spin_with_vector_regs_spin_address\n"
".popsection\n");
#elif defined(__aarch64__)
static_assert(offsetof(zx_thread_state_vector_regs_t, fpcr) == 0, "");
static_assert(offsetof(zx_thread_state_vector_regs_t, fpsr) == 4, "");
static_assert(offsetof(zx_thread_state_vector_regs_t, v) == 8, "");
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global spin_with_vector_regs\n"
"spin_with_vector_regs:\n"
// FPCR and FPSR are first.
"ldp w1, w2, [x0]\n"
"msr fpcr, x1\n"
"msr fpsr, x2\n"
// Skip to the vector registers.
"add x0, x0, 8\n"
// Each register is 128 bits = 16 bytes, so each pair is 32 bytes.
"ldp q0, q1, [x0, #(0 * 32)]\n"
"ldp q2, q3, [x0, #(1 * 32)]\n"
"ldp q4, q5, [x0, #(2 * 32)]\n"
"ldp q6, q7, [x0, #(3 * 32)]\n"
"ldp q8, q9, [x0, #(4 * 32)]\n"
"ldp q10, q11, [x0, #(5 * 32)]\n"
"ldp q12, q13, [x0, #(6 * 32)]\n"
"ldp q14, q15, [x0, #(7 * 32)]\n"
"ldp q16, q17, [x0, #(8 * 32)]\n"
"ldp q18, q19, [x0, #(9 * 32)]\n"
"ldp q20, q21, [x0, #(10 * 32)]\n"
"ldp q22, q23, [x0, #(11 * 32)]\n"
"ldp q24, q25, [x0, #(12 * 32)]\n"
"ldp q26, q27, [x0, #(13 * 32)]\n"
"ldp q28, q29, [x0, #(14 * 32)]\n"
"ldp q30, q31, [x0, #(15 * 32)]\n"
"spin_with_vector_regs_spin_address:\n"
"b spin_with_vector_regs_spin_address\n"
".popsection\n");
#else
#error Unsupported architecture
#endif
// spin_with_debug_regs() function.
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_debug_regs_t, dr) == 8 * 0, "");
static_assert(offsetof(zx_thread_state_debug_regs_t, dr6) == 8 * 4, "");
static_assert(offsetof(zx_thread_state_debug_regs_t, dr7) == 8 * 5, "");
__asm__(".pushsection .text, \"ax\", @progbits\n"
".global spin_with_debug_regs\n"
"spin_with_debug_regs:\n"
// Do nothing.
// The register state is set through syscalls, since writing the debug
// registers requires privileged instructions.
".global spin_with_debug_regs_spin_address\n"
"spin_with_debug_regs_spin_address:\n"
"jmp spin_with_debug_regs_spin_address\n"
".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global spin_with_debug_regs\n"
"spin_with_debug_regs:\n"
// Do nothing.
// The register state is set through syscalls, since writing the debug
// registers requires privileged instructions.
"spin_with_debug_regs_spin_address:\n"
"b spin_with_debug_regs_spin_address\n"
".popsection\n");
#else
#error Unsupported architecture
#endif
// Save and Exit Functions ------------------------------------------------------------------------
// save_general_regs_and_exit_thread() function.
#if defined(__x86_64__)
__asm__(".pushsection .text,\"ax\", @progbits\n"
".global save_general_regs_and_exit_thread\n"
"save_general_regs_and_exit_thread:\n"
"movq %rax, 8*0(%rsp)\n"
"movq %rbx, 8*1(%rsp)\n"
"movq %rcx, 8*2(%rsp)\n"
"movq %rdx, 8*3(%rsp)\n"
"movq %rsi, 8*4(%rsp)\n"
"movq %rdi, 8*5(%rsp)\n"
"movq %rbp, 8*6(%rsp)\n"
"movq %rsp, 8*7(%rsp)\n"
"movq %r8, 8*8(%rsp)\n"
"movq %r9, 8*9(%rsp)\n"
"movq %r10, 8*10(%rsp)\n"
"movq %r11, 8*11(%rsp)\n"
"movq %r12, 8*12(%rsp)\n"
"movq %r13, 8*13(%rsp)\n"
"movq %r14, 8*14(%rsp)\n"
"movq %r15, 8*15(%rsp)\n"
// Save the flags register.
"pushfq\n"
"popq %rax\n"
"movq %rax, 8*17(%rsp)\n"
// Fill out the rip field with a known value.
"leaq save_general_regs_and_exit_thread(%rip), %rax\n"
"movq %rax, 8*16(%rsp)\n"
"call zx_thread_exit@PLT\n"
"ud2\n"
".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global save_general_regs_and_exit_thread\n"
"save_general_regs_and_exit_thread:\n"
"stp x0, x1, [sp, #8*0]\n"
"stp x2, x3, [sp, #8*2]\n"
"stp x4, x5, [sp, #8*4]\n"
"stp x6, x7, [sp, #8*6]\n"
"stp x8, x9, [sp, #8*8]\n"
"stp x10, x11, [sp, #8*10]\n"
"stp x12, x13, [sp, #8*12]\n"
"stp x14, x15, [sp, #8*14]\n"
"stp x16, x17, [sp, #8*16]\n"
"stp x18, x19, [sp, #8*18]\n"
"stp x20, x21, [sp, #8*20]\n"
"stp x22, x23, [sp, #8*22]\n"
"stp x24, x25, [sp, #8*24]\n"
"stp x26, x27, [sp, #8*26]\n"
"stp x28, x29, [sp, #8*28]\n"
"str x30, [sp, #8*30]\n"
// Save the sp register.
"mov x0, sp\n"
"str x0, [sp, #8*31]\n"
// Fill out the pc field with a known value.
"adr x0, save_general_regs_and_exit_thread\n"
"str x0, [sp, #8*32]\n"
// Save NZCV flags, a subset of the PSTATE/CPSR register.
"mrs x0, nzcv\n"
"str x0, [sp, #8*33]\n"
"bl zx_thread_exit\n"
"brk 0\n"
".popsection\n");
#else
#error Unsupported architecture
#endif
// save_fp_regs_and_exit_thread() function.
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_fp_regs_t, st) == 32, "");
__asm__(".pushsection .text,\"ax\", @progbits\n"
".global save_fp_regs_and_exit_thread\n"
"save_fp_regs_and_exit_thread:\n"
// This only saves the low 64 bits of each register, i.e. the MMX aliases.
// Each slot in the struct is 128 bits, so we advance 16 bytes per register;
// the 32-byte base is the offset of st[0] within the struct (see the
// static_assert above).
"movq %mm0, 32 + 16*0(%rsp)\n"
"movq %mm1, 32 + 16*1(%rsp)\n"
"movq %mm2, 32 + 16*2(%rsp)\n"
"movq %mm3, 32 + 16*3(%rsp)\n"
"movq %mm4, 32 + 16*4(%rsp)\n"
"movq %mm5, 32 + 16*5(%rsp)\n"
"movq %mm6, 32 + 16*6(%rsp)\n"
"movq %mm7, 32 + 16*7(%rsp)\n"
"call zx_thread_exit@PLT\n"
"ud2\n"
".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global save_fp_regs_and_exit_thread\n"
"save_fp_regs_and_exit_thread:\n"
// Does nothing (no FP values).
"bl zx_thread_exit\n"
"brk 0\n"
".popsection\n");
#else
#error Unsupported architecture
#endif
// save_vector_regs_and_exit_thread() function.
#if defined(__x86_64__)
static_assert(offsetof(zx_thread_state_vector_regs_t, zmm) == 0, "");
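// zmm is at offset 0 (asserted above), so the stores below index directly
// off rsp.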
__asm__(".pushsection .text,\"ax\", @progbits\n"
".global save_vector_regs_and_exit_thread\n"
"save_vector_regs_and_exit_thread:\n"
// Each vector slot is 512 bits (64 bytes). We only save the low 128 bits
// (the xmm registers).
"movdqu %xmm0, 64*0(%rsp)\n"
"movdqu %xmm1, 64*1(%rsp)\n"
"movdqu %xmm2, 64*2(%rsp)\n"
"movdqu %xmm3, 64*3(%rsp)\n"
"movdqu %xmm4, 64*4(%rsp)\n"
"movdqu %xmm5, 64*5(%rsp)\n"
"movdqu %xmm6, 64*6(%rsp)\n"
"movdqu %xmm7, 64*7(%rsp)\n"
"movdqu %xmm8, 64*8(%rsp)\n"
"movdqu %xmm9, 64*9(%rsp)\n"
"movdqu %xmm10, 64*10(%rsp)\n"
"movdqu %xmm11, 64*11(%rsp)\n"
"movdqu %xmm12, 64*12(%rsp)\n"
"movdqu %xmm13, 64*13(%rsp)\n"
"movdqu %xmm14, 64*14(%rsp)\n"
"movdqu %xmm15, 64*15(%rsp)\n"
"call zx_thread_exit@PLT\n"
"ud2\n"
".popsection\n");
#elif defined(__aarch64__)
__asm__(".pushsection .text, \"ax\", %progbits\n"
".global save_vector_regs_and_exit_thread\n"
"save_vector_regs_and_exit_thread:\n"
// The address of the output struct is in sp.
"mov x0, sp\n"
// FPCR and FPSR.
"mrs x1, fpcr\n"
"mrs x2, fpsr\n"
"stp w1, w2, [x0]\n"
// Skip to the vector registers.
"add x0, x0, 8\n"
// Each register is 128 bits = 16 bytes, so each pair is 32 bytes.
"stp q0, q1, [x0, #(0 * 32)]\n"
"stp q2, q3, [x0, #(1 * 32)]\n"
"stp q4, q5, [x0, #(2 * 32)]\n"
"stp q6, q7, [x0, #(3 * 32)]\n"
"stp q8, q9, [x0, #(4 * 32)]\n"
"stp q10, q11, [x0, #(5 * 32)]\n"
"stp q12, q13, [x0, #(6 * 32)]\n"
"stp q14, q15, [x0, #(7 * 32)]\n"
"stp q16, q17, [x0, #(8 * 32)]\n"
"stp q18, q19, [x0, #(9 * 32)]\n"
"stp q20, q21, [x0, #(10 * 32)]\n"
"stp q22, q23, [x0, #(11 * 32)]\n"
"stp q24, q25, [x0, #(12 * 32)]\n"
"stp q26, q27, [x0, #(13 * 32)]\n"
"stp q28, q29, [x0, #(14* 32)]\n"
"stp q30, q31, [x0, #(15 * 32)]\n"
"bl zx_thread_exit\n"
"brk 0\n"
".popsection\n");
#else
#error Unsupported architecture
#endif