// Copyright 2021 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include "phys/trampoline-boot.h"

#include <lib/arch/zbi-boot.h>
#include <lib/memalloc/pool.h>
#include <lib/zbitl/items/mem-config.h>
#include <zircon/assert.h>

#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include <fbl/algorithm.h>
#include <ktl/byte.h>
#include <phys/address-space.h>
#include <phys/main.h>
#include <phys/stdio.h>

#include <ktl/enforce.h>

namespace {

#if defined(__x86_64__) || defined(__i386__)
// In the legacy fixed-address format, the entry address is always above 1M.
// In the new format, it's an offset and in practice it's never > 1M. So this
// is a safe-enough heuristic to distinguish the new format from the old one.
bool IsLegacyEntryAddress(uint64_t address) {
  return address > TrampolineBoot::kLegacyLoadAddress;
}

// Relocated blob size must be aligned to |kRelocateAlign|.
constexpr size_t kRelocateAlign = 1;

// When a RelocateTarget is copied forward, source and destination offsets
// must be adjusted by this.
constexpr int64_t kForwardBias = 0;

// When a RelocateTarget is copied backwards, source and destination offsets
// must be adjusted by this.
constexpr int64_t kBackwardBias = -1;

#elif defined(__aarch64__)

// ARM does not use the legacy fixed-address format.
bool IsLegacyEntryAddress(uint64_t address) { return false; }

// Relocated blob size must be aligned to |kRelocateAlign|.
constexpr size_t kRelocateAlign = 32;

// When a RelocateTarget is copied forward, source and destination offsets
// must be adjusted by this.
constexpr int64_t kForwardBias = -16;

// When a RelocateTarget is copied backwards, source and destination offsets
// must be adjusted by this.
constexpr int64_t kBackwardBias = 0;

#elif defined(__riscv)

// RISC-V does not use the legacy fixed-address format.
bool IsLegacyEntryAddress(uint64_t address) { return false; }

// Relocated blob size must be aligned to |kRelocateAlign|.
constexpr size_t kRelocateAlign = 8;

// When a RelocateTarget is copied forward, source and destination offsets
// must be adjusted by this.
constexpr int64_t kForwardBias = 0;

// When a RelocateTarget is copied backwards, source and destination offsets
// must be adjusted by this.
constexpr int64_t kBackwardBias = 0;

#else

#error "What architecture?"

#endif

struct RelocateTarget {
  RelocateTarget() = default;

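  // Describes copying |blob| to |destination|. If the destination range
  // overlaps the source from above, the copy must run backwards so that bytes
  // are not clobbered before they are read; in that case both pointers are
  // pre-adjusted toward the end of the range (see kBackwardBias).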
  RelocateTarget(uintptr_t destination, ktl::span<const ktl::byte> blob)
      : src(reinterpret_cast<uintptr_t>(blob.data())),
        dst(destination),
        count(fbl::round_up(blob.size(), kRelocateAlign)),
        backwards(dst > src && dst - src < count) {
    if (backwards) {
      dst += count + kBackwardBias;
      src += count + kBackwardBias;
    } else {
      dst += kForwardBias;
      src += kForwardBias;
    }
  }

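  // Recovers the original target address by undoing the bias applied in the
  // constructor; the trampoline uses this for the final data ZBI pointer
  // handed to the kernel.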
  constexpr uint64_t destination() const {
    return backwards ? dst - count - kBackwardBias : dst - kForwardBias;
  }

  uint64_t src = 0;
  uint64_t dst = 0;
  uint64_t count = 0;

  // When the addresses overlap, the copying can be done backwards and so the
  // direction flag is set for REP MOVSB and the starting pointers are at the
  // last byte rather than the first. While this is a boolean flag, making it
  // a full 64 bits wide lets the inline assembly use fewer instructions.
  uint64_t backwards = 0;
};

#if __aarch64__

static_assert(offsetof(RelocateTarget, src) == offsetof(RelocateTarget, dst) - sizeof(uint64_t),
              "Must be contiguous for arm64 ldp instruction.");
static_assert(offsetof(RelocateTarget, count) ==
                  offsetof(RelocateTarget, backwards) - sizeof(uint64_t),
              "Must be contiguous for arm64 ldp instruction.");
#endif

}  // namespace

// This describes the "trampoline" area that is set up in some memory that's
// safely out of the way: not part of this shim's own image, which might be
// overwritten; not part of the fixed-position kernel load image or its
// reserved memory; not part of the kernel image being relocated; and not
// part of the data ZBI image. Trampoline::size() bytes must be allocated in
// the safe place and then the object must be constructed there with
// `new (ptr) Trampoline(space)` before Boot() is finally called.
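//
// A minimal sketch of that protocol, mirroring what Load() and Boot() below
// actually do (the byte span and argument names here are illustrative):
//
//   ktl::span<ktl::byte> spot = /* bytes safely out of the way */;
//   ZX_ASSERT(spot.size() >= Trampoline::size());
//   Trampoline* trampoline = new (spot.data()) Trampoline(spot.size());
//   ...
//   trampoline->Boot(kernel_target, zbi_target, entry);  // Never returns.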
class TrampolineBoot::Trampoline {
 public:
  explicit Trampoline(size_t space) {
    ZX_ASSERT(space >= size());
    const zbitl::ByteView code = TrampolineCode();
    memcpy(code_, code.data(), code.size());
  }

  static size_t size() { return offsetof(Trampoline, code_) + TrampolineCode().size(); }

  [[noreturn]] void Boot(RelocateTarget kernel, RelocateTarget zbi, uint64_t entry_address) {
    args_ = {
        .kernel = kernel,
        .zbi = zbi,
        .data_zbi = zbi.destination(),
        .entry = entry_address,
    };
    ZX_ASSERT(args_.entry == entry_address);
    ZbiBootRaw(reinterpret_cast<uintptr_t>(code_), &args_);
  }

 private:
  // This packs up the arguments for the trampoline code, which are pretty much
  // the operands for REP MOVSB plus the entry point and data ZBI addresses.
  struct TrampolineArgs {
    RelocateTarget kernel;
    RelocateTarget zbi;
    uint64_t data_zbi;
    uint64_t entry;
  };

  // We must require the compiler not to inline |TrampolineCode| to prevent
  // more than one instance of |TrampolineCode| from existing. The real issue
  // is that inlining may introduce alignment or jump-relaxation differences
  // between instances, causing the size of the assembled code to differ.
  [[gnu::const, gnu::noinline]] static zbitl::ByteView TrampolineCode() {
    // This tiny bit of code will be copied someplace out of the way. Then it
    // will be entered with %rsi pointing at TrampolineArgs, which can be on
    // the stack since it's read immediately. Since this code is safely out of
    // the way, it can perform a copy that might clobber this boot shim's own
    // code, data, bss, and stack. After the copy, it jumps directly to the
    // fixed-address ZBI kernel's entry point and %rsi points to the data ZBI.
    //
    // First the code saves the TrampolineArgs pointer in %rbx. For the data
    // ZBI and then the kernel image, it loads the REP MOVSB operands (%rcx,
    // %rdi, %rsi) and the backwards flag (%al) from the corresponding
    // RelocateTarget, skips a copy with nothing to do, and tests the %al flag
    // to set the Direction flag (STD) for backwards mode before REP MOVSB
    // does the copy. After the copies, the SP and FP are cleared, the D flag
    // is cleared again and interrupts disabled for good measure, before
    // finally moving the data ZBI pointer into place (%rsi) and jumping to
    // the entry point (via %rbx).
    const ktl::byte* code;
    size_t size;
#if defined(__x86_64__) || defined(__i386__)
    __asm__(
        R"""(
        .code64
        .pushsection .rodata.trampoline, "a?", %%progbits
        0:
        # Save |rsi| in |rbx|, where |rbx| will always point to '&args'.
        mov %%rsi, %%rbx
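        # Copy the data ZBI into place first; skip it if the count is zero or
        # it is already in place.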
        mov %c[zbi_count](%%rbx), %%rcx
        test %%rcx, %%rcx
        jz 2f
        mov %c[zbi_dst](%%rbx), %%rdi
        mov %c[zbi_src](%%rbx), %%rsi
        cmp %%rdi, %%rsi
        je 2f
        mov %c[zbi_backwards](%%rbx), %%al
        testb %%al, %%al
        jz 1f
        std
        1:
        rep movsb
        cld
        2:
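        # Now copy the kernel image to its fixed load address; this may
        # clobber this shim's own image.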
        mov %c[kernel_count](%%rbx), %%rcx
        mov %c[kernel_dst](%%rbx), %%rdi
        mov %c[kernel_src](%%rbx), %%rsi
        cmp %%rdi, %%rsi
        je 4f
        mov %c[kernel_backwards](%%rbx), %%al
        testb %%al, %%al
        jz 3f
        std
        3:
        rep movsb
        4:
        # Clean stack pointers before jumping into the kernel.
        xor %%esp, %%esp
        xor %%ebp, %%ebp
        cld
        cli
        # The data ZBI must be in rsi before jumping into the kernel entry address.
        mov %c[data_zbi](%%rbx), %%rsi
        mov %c[entry](%%rbx), %%rbx
        jmp *%%rbx
        5:
        .popsection
        )"""

#ifdef __i386__
        R"""(
        .code32
        mov $0b, %[code]
        mov $(5b - 0b), %[size]
        )"""
#else
        R"""(
        lea 0b(%%rip), %[code]
        mov $(5b - 0b), %[size]
        )"""
#endif

        : [code] "=r"(code), [size] "=r"(size)
        : [kernel_backwards] "i"(offsetof(TrampolineArgs, kernel.backwards)),  //
          [kernel_dst] "i"(offsetof(TrampolineArgs, kernel.dst)),              //
          [kernel_src] "i"(offsetof(TrampolineArgs, kernel.src)),              //
          [kernel_count] "i"(offsetof(TrampolineArgs, kernel.count)),          //
          [zbi_dst] "i"(offsetof(TrampolineArgs, zbi.dst)),                    //
          [zbi_src] "i"(offsetof(TrampolineArgs, zbi.src)),                    //
          [zbi_count] "i"(offsetof(TrampolineArgs, zbi.count)),                //
          [zbi_backwards] "i"(offsetof(TrampolineArgs, zbi.backwards)),        //
          [data_zbi] "i"(offsetof(TrampolineArgs, data_zbi)),                  //
          [entry] "i"(offsetof(TrampolineArgs, entry)));
#elif defined(__aarch64__)
    __asm__(
        R""(
        .pushsection .rodata.trampoline, "a?", %%progbits
        // x0 contains |&args|.
        .Ltrampoline_start.%=:
        mov x10, x0
        ldp x0, x1, [x10, %[zbi_dst_offset]]
        .Ltrampoline_zbi.%=:
        add x9, x10, %[data_offset]
        bl .Lcopy_start.%=
        .Ltrampoline_kernel.%=:
        add x9, x10, %[kernel_offset]
        bl .Lcopy_start.%=
        .Ltrampoline_exit.%=:
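        // Clear the frame pointer, link register, and stack pointer, then
        // jump to the kernel entry point with x0 = data ZBI address.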
        mov x29, xzr
        mov x30, xzr
        mov sp, x29
        br x1

        // Expectation:
        //   x9: RelocateTarget*
        // x2-x8 are used during this procedure.
        .Lcopy_start.%=:
        // x2 -> src address
        // x3 -> dst address
        // x4 -> count (in bytes)
        // x5 -> backwards (direction)
        ldp x2, x3, [x9]
        ldp x4, x5, [x9, %[count_offset]]
        cbz x4, .Lcopy_ret.%=
        cmp x2, x3
        beq .Lcopy_ret.%=
        // Test the direction flag.
        cbnz x5, .Lcopy_backwards.%=

        // In forwards mode, x2 and x3 hold the source and destination
        // addresses biased by kForwardBias (-16), so the pre-indexed pairs
        // below cover the whole range; x4 holds the number of bytes, which is
        // a multiple of 32.
        .Lcopy_forward.%=:
        ldp x5, x6, [x2, #16]
        ldp x7, x8, [x2, #32]!
        stp x5, x6, [x3, #16]
        stp x7, x8, [x3, #32]!
        sub x4, x4, #32
        cbnz x4, .Lcopy_forward.%=
        ret

        // In backwards mode, the src and dst registers point just past the
        // last byte (exclusive), and the count is guaranteed to be a multiple
        // of 32 bytes, hence we can just loop.
        .Lcopy_backwards.%=:
        ldp x5, x6, [x2, #-16]
        ldp x7, x8, [x2, #-32]!
        stp x5, x6, [x3, #-16]
        stp x7, x8, [x3, #-32]!
        sub x4, x4, #32
        cbnz x4, .Lcopy_backwards.%=
        .Lcopy_ret.%=:
        ret

        // Used to calculate code size.
        .Ltrampoline_end.%=:
        .popsection

        adrp %[code], .Ltrampoline_start.%=
        add %[code], %[code], #:lo12:.Ltrampoline_start.%=
        mov %[size], (.Ltrampoline_end.%= - .Ltrampoline_start.%=)
        )""
        : [code] "=r"(code), [size] "=r"(size)
        : [kernel_offset] "i"(offsetof(TrampolineArgs, kernel)),
          [data_offset] "i"(offsetof(TrampolineArgs, zbi)),
          [src_offset] "i"(offsetof(RelocateTarget, src)),
          [count_offset] "i"(offsetof(RelocateTarget, count)),
          [zbi_dst_offset] "i"(offsetof(TrampolineArgs, data_zbi)),
          [entry] "i"(offsetof(TrampolineArgs, entry)));
#elif defined(__riscv)
    const ktl::byte* code_end;
    // This starts with the hart ID in a0 and the "data pointer" in a1.
    // a0 is left alone throughout to pass it along to the real kernel.
    // a1 points to the TrampolineArgs and is replaced with args.data_zbi.
    //
    // TODO(mcgrathr): maybe unroll the copying loops some
    __asm__(
        R"""(
        .pushsection .rodata.trampoline, "a?", %%progbits
        .Ltrampoline_start.%=:

        add t0, a1, %[data_offset]
        jal .Lcopy_start.%=
        add t0, a1, %[kernel_offset]
        jal .Lcopy_start.%=

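        # Clear the linkage and pointer registers so the kernel starts with a
        # clean slate, then jump to the entry point with a1 = data ZBI address.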
        mv s0, zero
        mv ra, zero
        mv sp, zero
        mv gp, zero
        mv tp, zero
        ld t0, %[entry](a1)
        ld a1, %[zbi_dst_offset](a1)
        jr t0

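        # Copy the range described by the RelocateTarget at t0, forwards or
        # backwards according to its |backwards| flag; the count is a
        # multiple of 8 bytes.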
        .Lcopy_start.%=:
        ld t1, %[src_offset](t0)
        ld t2, %[dst_offset](t0)
        ld t3, %[count_offset](t0)
        ld t4, %[backwards_offset](t0)
        bnez t4, .Lcopy_backwards.%=

        .Lcopy_forward.%=:
        ld t4, (t1)
        sd t4, (t2)
        add t3, t3, -8
        add t1, t1, 8
        add t2, t2, 8
        bnez t3, .Lcopy_forward.%=
        ret

        .Lcopy_backwards.%=:
        ld t4, -8(t1)
        sd t4, -8(t2)
        add t3, t3, -8
        add t1, t1, -8
        add t2, t2, -8
        bnez t3, .Lcopy_backwards.%=
        ret

        .Ltrampoline_end.%=:
        .popsection

        lla %[start], .Ltrampoline_start.%=
        lla %[end], .Ltrampoline_end.%=
        )"""
        : [start] "=r"(code), [end] "=r"(code_end)
        : [kernel_offset] "i"(offsetof(TrampolineArgs, kernel)),
          [data_offset] "i"(offsetof(TrampolineArgs, zbi)),
          [src_offset] "i"(offsetof(RelocateTarget, src)),
          [dst_offset] "i"(offsetof(RelocateTarget, dst)),
          [count_offset] "i"(offsetof(RelocateTarget, count)),
          [backwards_offset] "i"(offsetof(RelocateTarget, backwards)),
          [zbi_dst_offset] "i"(offsetof(TrampolineArgs, data_zbi)),
          [entry] "i"(offsetof(TrampolineArgs, entry)));
    size = code_end - code;
#else
#error "what architecture?"
#endif
    return {code, size};
  }

  TrampolineArgs args_;
  ktl::byte code_[];
};

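// Detect which boot protocol the kernel item uses: the legacy fixed-address
// protocol stores an absolute entry address, while the new position-
// independent protocol stores a small offset (see IsLegacyEntryAddress).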
void TrampolineBoot::SetKernelAddresses() {
  kernel_entry_address_ = BootZbi::KernelEntryAddress();
  if (IsLegacyEntryAddress(KernelHeader()->entry)) {
    set_kernel_load_address(kLegacyLoadAddress);
    kernel_entry_address_ = KernelHeader()->entry;
  }
}

fit::result<BootZbi::Error> TrampolineBoot::Load(uint32_t extra_data_capacity,
                                                 ktl::optional<uint64_t> kernel_load_address,
                                                 ktl::optional<uint64_t> data_load_address) {
  if (kernel_load_address) {
    set_kernel_load_address(*kernel_load_address);
  }

  if (data_load_address) {
    data_load_address_ = data_load_address;
  }

  if (!kernel_load_address_) {
    // New-style position-independent kernel.
    return BootZbi::Load(extra_data_capacity);
  }

  // Now we know how much space the kernel image needs.
  // Reserve it at the fixed load address.
  auto& pool = Allocation::GetPool();
  if (auto result = pool.UpdateFreeRamSubranges(memalloc::Type::kFixedAddressKernel,
                                                *kernel_load_address_, KernelMemorySize());
      result.is_error()) {
    return fit::error{BootZbi::Error{.zbi_error = "unable to reserve kernel's load image"sv}};
  }

  if (data_load_address_) {
    if (auto result = pool.UpdateFreeRamSubranges(memalloc::Type::kDataZbi, *data_load_address_,
                                                  DataLoadSize() + extra_data_capacity);
        result.is_error()) {
      return fit::error{BootZbi::Error{.zbi_error = "unable to reserve data ZBI's load image"sv}};
    }
  }

  // The trampoline needs someplace safely neither in the kernel image, nor in
  // the data ZBI image, nor in this shim's own image since that might overlap
  // the fixed-address target region. It's tiny, so just extend the extra data
  // capacity to cover it and use the few bytes just after the data ZBI. The
  // space is safely allocated in our present reckoning so it's disjoint from
  // the data and kernel image memory and from this shim's own image, but as
  // soon as we boot into the new kernel it will be reclaimable memory.
  if (auto result = BootZbi::Load(extra_data_capacity + static_cast<uint32_t>(Trampoline::size()),
                                  kernel_load_address_);
      result.is_error()) {
    return result.take_error();
  }

  auto extra_space = DataZbi().storage().subspan(DataZbi().size_bytes());
  auto trampoline = extra_space.subspan(extra_data_capacity);
  trampoline_ = new (trampoline.data()) Trampoline(trampoline.size());

  // In the x86-64 case, we set up page-tables out of the .bss, which must
  // persist after booting the next kernel payload; however, this part of the
  // .bss might be clobbered by that self-same fixed load image. To avoid that
  // issue, now that physical memory management has been bootstrapped, we
  // re-set up the address space out of the allocator, which will avoid
  // allocating from out of the load image's range that we just reserved.
  //
  // In the x86-32 case, page tables came from the normal Allocation pool
  // originally, but the pages chosen when ArchSetUpAddressSpace ran might
  // overlap with areas that are now reserved. But as on x86-64, after other
  // reservations it's now safe to choose page table pages with the allocator.
  //
  // On other machines, this is a no-op. The page tables now in use may indeed
  // be clobbered by the trampoline copying, but by then the MMU will be off.
  ArchPrepareAddressSpaceForTrampoline();

  return fit::ok();
}

[[noreturn]] void TrampolineBoot::Boot(ktl::optional<void*> argument) {
  ZX_ASSERT(!MustRelocateDataZbi());

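  // These casts and assertions check that the 64-bit addresses all fit in
  // this shim's native pointer size; on a 32-bit shim a value that doesn't
  // fit here would mean something went wrong upstream.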
  uintptr_t entry = static_cast<uintptr_t>(KernelEntryAddress());
  ZX_ASSERT(entry == KernelEntryAddress());

  uintptr_t zbi = static_cast<uintptr_t>(DataLoadAddress());
  ZX_ASSERT(zbi == DataLoadAddress());

  uintptr_t kernel_first = static_cast<uintptr_t>(KernelLoadAddress());
  uintptr_t kernel_last = static_cast<uintptr_t>(KernelLoadAddress() + KernelLoadSize() - 1);
  ZX_ASSERT(kernel_first == KernelLoadAddress());
  ZX_ASSERT(kernel_last == KernelLoadAddress() + KernelLoadSize() - 1);

  uintptr_t kernel_size = static_cast<uintptr_t>(KernelLoadSize());
  ZX_ASSERT(kernel_size == KernelLoadSize());

  if (kernel_load_address_) {
    uintptr_t fixed_first = static_cast<uintptr_t>(kernel_load_address_.value());
    uintptr_t fixed_last = static_cast<uintptr_t>(*kernel_load_address_ + KernelLoadSize() - 1);
    ZX_ASSERT_MSG(fixed_first == *kernel_load_address_, "0x%016" PRIx64 " != 0x%016" PRIx64 " ",
                  static_cast<uint64_t>(fixed_first), *kernel_load_address_);
    ZX_ASSERT(fixed_last == *kernel_load_address_ + KernelLoadSize() - 1);
  }

  if (!trampoline_) {
    // This is a new-style position-independent kernel. Boot it where it is.
    BootZbi::Boot(argument);
  }

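  // Otherwise hand everything off to the trampoline, which performs the final
  // relocation copies and jumps to the fixed entry address.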
  uintptr_t zbi_location =
      reinterpret_cast<uintptr_t>(argument.value_or(DataZbi().storage().data()));
  auto kernel_blob = ktl::span<const ktl::byte>(reinterpret_cast<const ktl::byte*>(KernelImage()),
                                                KernelLoadSize());
  auto zbi_blob = ktl::span<const ktl::byte>(reinterpret_cast<const ktl::byte*>(zbi_location),
                                             DataZbi().size_bytes());
  trampoline_->Boot(
      RelocateTarget(static_cast<uintptr_t>(*kernel_load_address_), kernel_blob),
      RelocateTarget(static_cast<uintptr_t>(data_load_address_.value_or(zbi_location)), zbi_blob),
      KernelEntryAddress());
}

fit::result<TrampolineBoot::Error> TrampolineBoot::Init(InputZbi zbi) {
  auto res = BootZbi::Init(zbi);
  SetKernelAddresses();
  return res;
}

fit::result<TrampolineBoot::Error> TrampolineBoot::Init(InputZbi zbi,
                                                        InputZbi::iterator kernel_item) {
  auto res = BootZbi::Init(zbi, kernel_item);
  SetKernelAddresses();
  return res;
}

void TrampolineBoot::Log() {
  LogAddresses();
  if (trampoline_) {
    LogFixedAddresses();
  }
  LogBoot(KernelEntryAddress());
}

// This output lines up with what BootZbi::LogAddresses() prints.
void TrampolineBoot::LogFixedAddresses() const {
#define ADDR "0x%016" PRIx64
  const uint64_t kernel = kernel_load_address_.value();
  const uint64_t bss = kernel + KernelLoadSize();
  const uint64_t end = kernel + KernelMemorySize();
  debugf("%s: Relocated\n", ProgramName());
  debugf("%s: Kernel @ [" ADDR ", " ADDR ")\n", ProgramName(), kernel, bss);
  debugf("%s: BSS @ [" ADDR ", " ADDR ")\n", ProgramName(), bss, end);
  if (data_load_address_) {
    debugf("%s: ZBI @ [" ADDR ", " ADDR ")\n", ProgramName(), *data_load_address_,
           *data_load_address_ + DataLoadSize());
  }
}