[fzl] Implement memory-probe isolated thread function in assembly

A special-case entry point that doesn't have all the normal ABI setup
is very hard to get reliably right in C or C++.  It's straightforward
in assembly, and the memory-probe entry point in particular is trivial.

Bug: 3370
Change-Id: I927a20391ba63960e5f1f2c3438c5793c8b3420b
diff --git a/zircon/system/ulib/fzl/memory-probe.cc b/zircon/system/ulib/fzl/memory-probe.cc
index bc594a7..1c7c4e0 100644
--- a/zircon/system/ulib/fzl/memory-probe.cc
+++ b/zircon/system/ulib/fzl/memory-probe.cc
@@ -3,34 +3,64 @@
 // found in the LICENSE file.
 
 #include <lib/fzl/memory-probe.h>
-
-#include <limits.h>
-#include <stdio.h>
-#include <zircon/assert.h>
-#include <zircon/syscalls/exception.h>
 #include <lib/zx/channel.h>
 #include <lib/zx/exception.h>
 #include <lib/zx/process.h>
 #include <lib/zx/thread.h>
+#include <limits.h>
+#include <stdio.h>
+#include <zircon/assert.h>
+#include <zircon/syscalls/exception.h>
 
 namespace {
 
-enum class ProbeOperation { kRead, kWrite };
+// These are not really functions, but entry points for a thread that has a
+// tiny stack and no other setup.  They're not really entered with the C
+// ABI as such.  Rather, they're entered with the first argument register
+// set to the address to probe (the second argument is unused) and with the
+// SP at the very top of the allocated stack.  They're defined in pure
+// assembly so that there are no issues with compiler-generated code's
+// assumptions about the proper ABI setup, instrumentation, etc.
+//
+// Since this calls into the vDSO, it must adhere to the vDSO's ABI, which is
+// the "vanilla" C calling convention (no safe-stack or shadow-call-stack).
+// As well as the register usage conventions, this mandates a stack of some
+// reasonable minimum size, even on AArch64 where the calling convention
+// doesn't per se involve the stack (but it is specified that the SP must be
+// "valid" on function entry).  Today's vDSO implementation might not actually
+// make use of the stack in the zx_thread_exit call, but it always could.  The
+// x86 C calling convention mandates that the stack pointer have exactly the
+// alignment it gets from the call instruction on an aligned stack (that is,
+// SP % 16 == 8).
+extern "C" void read_thread_func(uintptr_t address, uintptr_t);
+extern "C" void write_thread_func(uintptr_t address, uintptr_t);
 
-#if __has_feature(address_sanitizer)
-[[clang::no_sanitize("address")]]
+#define PROBE_FUNC(name, insn)                              \
+  __asm__(".pushsection .text." #name ",\"ax\",%progbits\n" \
+          ".balign 4\n"                                     \
+          ".type " #name ",%function\n"                     \
+          ".cfi_startproc\n"                                \
+          #name ":\n" insn "\n" CALL_INSN " zx_thread_exit\n"\
+          ".cfi_endproc\n"                                  \
+          ".size " #name ", . - " #name "\n"                \
+          ".popsection");
+
+#ifdef __aarch64__  // Probe address arrives in x0; w1 is scratch.
+#define CALL_INSN "bl"
+#define READ_PROBE_INSN "ldrb w1, [x0]"
+#define WRITE_PROBE_INSN "ldrb w1, [x0]; strb w1, [x0]"  // Write back what was read.
+#elif defined(__x86_64__)  // Probe address arrives in %rdi; %al is scratch.
+#define CALL_INSN "call"
+#define READ_PROBE_INSN "movb (%rdi), %al"
+#define WRITE_PROBE_INSN "movb (%rdi), %al; movb %al, (%rdi)"  // Write back what was read.
+#else
+#error "what machine?"
 #endif
-void except_thread_func(uintptr_t op, uintptr_t address) {
-  volatile char* ch_address = reinterpret_cast<char*>(address);
 
-  char ch = *ch_address;
-  if (static_cast<ProbeOperation>(op) == ProbeOperation::kWrite)
-    *ch_address = ch;
+PROBE_FUNC(read_thread_func, READ_PROBE_INSN)
+PROBE_FUNC(write_thread_func, WRITE_PROBE_INSN)
 
-  zx_thread_exit();
-}
-
-bool do_probe(ProbeOperation op, const void* addr) {
+bool do_probe(void (*op)(uintptr_t address, uintptr_t), uintptr_t addr) {
   // This function starts a new thread to perform the read/write test, and catches any exceptions
   // in this thread to see if it failed or not.
   zx::thread thread;
@@ -46,8 +76,7 @@
   if (status != ZX_OK)
     return false;
 
-  thread.start(&except_thread_func, stack, static_cast<uintptr_t>(op),
-               reinterpret_cast<uintptr_t>(addr));
+  thread.start(op, stack, addr, 0);
 
   // Wait for crash or thread completion.
   zx_signals_t signals = 0;
@@ -69,6 +98,10 @@
 
 }  // namespace
 
-bool probe_for_read(const void* addr) { return do_probe(ProbeOperation::kRead, addr); }
+bool probe_for_read(const void* addr) {
+  return do_probe(read_thread_func, reinterpret_cast<uintptr_t>(addr));
+}
 
-bool probe_for_write(void* addr) { return do_probe(ProbeOperation::kWrite, addr); }
+bool probe_for_write(void* addr) {
+  return do_probe(write_thread_func, reinterpret_cast<uintptr_t>(addr));
+}