[arm64] Replace some inline asm with arm_acle standard extensions.

Replaced uses of inline assembly with language extensions where available.
The extensions come from <arm_acle.h>, which ships with both GCC and
Clang, though the GCC version is woefully incomplete. A local arm_acle.h
implements the missing functionality.
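
The local header layers on top of the toolchain copy; the structure of the
new kernel/include/arm_acle.h is roughly:

    #pragma once
    #include_next <arm_acle.h>   // pull in the toolchain header first
    #ifndef __clang__
    // GCC-only fallbacks: __yield, __dmb/__dsb/__isb, __arm_rsr*/__arm_wsr*
    #endif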

__arm_rsr/__arm_wsr (and their 64-bit variants) for reading and writing
system registers.
__dmb/__dsb/__isb for barriers.
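
For example, the vector base setup in kernel/arch/arm64/arch.cpp changes from

    ARM64_WRITE_SYSREG(VBAR_EL1, (uint64_t)&arm64_el1_exception_base);

to an intrinsic write followed by an explicit barrier:

    __arm_wsr64("vbar_el1", (uint64_t)&arm64_el1_exception_base);
    __isb(ARM_MB_SY);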

Some minor reduction in ISB usage: an ISB was previously emitted
unconditionally by ARM64_WRITE_SYSREG(), so a few were conservatively
removed in cases where adjacent barriers appeared safe to combine.
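
For instance, back-to-back writes can in principle share one
context-synchronizing barrier (illustrative only; the hunks below keep a
barrier after each write where it was not clearly safe to drop):

    __arm_wsr64("pmcr_el0", PMCR_EL0_ENABLE_BIT | PMCR_EL0_LONG_COUNTER_BIT);
    __arm_wsr64("pmcntenset_el0", PMCNTENSET_EL0_ENABLE);
    __isb(ARM_MB_SY);   // one barrier after both writes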

The builtins give the compiler more information about the operation being
performed, enabling better optimization and instruction fencing, as well
as improved safety (avoiding common inline-assembly pitfalls such as
incomplete side-effect/clobber annotations).
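
As an illustration (not a hunk in this change), a hand-written

    __asm__ volatile("msr tcr_el1, %0" :: "r"(tcr));   // side effects left to the author

depends on the author annotating every side effect correctly, whereas

    __arm_wsr64("tcr_el1", tcr);

moves that bookkeeping into the intrinsic.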

Change-Id: I8b776d11095b333f6614d6e9a6c485efd32346e4
Test: Builds with Clang and GCC, manual verification of code gen, qemu boot.
diff --git a/kernel/arch/arm64/arch.cpp b/kernel/arch/arm64/arch.cpp
index af30c95..99574c4 100644
--- a/kernel/arch/arm64/arch.cpp
+++ b/kernel/arch/arm64/arch.cpp
@@ -151,32 +151,40 @@
     arm64_init_percpu_early();
 
     // Set the vector base.
-    ARM64_WRITE_SYSREG(VBAR_EL1, (uint64_t)&arm64_el1_exception_base);
+    __arm_wsr64("vbar_el1", (uint64_t)&arm64_el1_exception_base);
+    __isb(ARM_MB_SY);
 
     // Set some control bits in sctlr.
-    uint64_t sctlr = ARM64_READ_SYSREG(sctlr_el1);
+    uint64_t sctlr = __arm_rsr64("sctlr_el1");
     sctlr |= SCTLR_EL1_UCI | SCTLR_EL1_UCT | SCTLR_EL1_DZE | SCTLR_EL1_SA0 | SCTLR_EL1_SA;
     sctlr &= ~SCTLR_EL1_AC;  // Disable alignment checking for EL1, EL0.
-    ARM64_WRITE_SYSREG(sctlr_el1, sctlr);
+    __arm_wsr64("sctlr_el1", sctlr);
+    __isb(ARM_MB_SY);
 
     // Save all of the features of the cpu.
     arm64_feature_init();
 
     // Enable cycle counter.
-    ARM64_WRITE_SYSREG(pmcr_el0, PMCR_EL0_ENABLE_BIT | PMCR_EL0_LONG_COUNTER_BIT);
-    ARM64_WRITE_SYSREG(pmcntenset_el0, PMCNTENSET_EL0_ENABLE);
+    __arm_wsr64("pmcr_el0", PMCR_EL0_ENABLE_BIT | PMCR_EL0_LONG_COUNTER_BIT);
+    __isb(ARM_MB_SY);
+    __arm_wsr64("pmcntenset_el0", PMCNTENSET_EL0_ENABLE);
+    __isb(ARM_MB_SY);
 
     // Enable user space access to cycle counter.
-    ARM64_WRITE_SYSREG(pmuserenr_el0, PMUSERENR_EL0_ENABLE);
+    __arm_wsr64("pmuserenr_el0", PMUSERENR_EL0_ENABLE);
+    __isb(ARM_MB_SY);
 
     // Enable Debug Exceptions by Disabling the OS Lock. The OSLAR_EL1 is a WO
     // register with only the low bit defined as OSLK. Write 0 to disable.
-    ARM64_WRITE_SYSREG(oslar_el1, 0x0);
+    __arm_wsr64("oslar_el1", 0x0);
+    __isb(ARM_MB_SY);
 
     // Enable user space access to virtual counter (CNTVCT_EL0).
-    ARM64_WRITE_SYSREG(cntkctl_el1, CNTKCTL_EL1_ENABLE_VIRTUAL_COUNTER);
+    __arm_wsr64("cntkctl_el1", CNTKCTL_EL1_ENABLE_VIRTUAL_COUNTER);
+    __isb(ARM_MB_SY);
 
-    ARM64_WRITE_SYSREG(mdscr_el1, MSDCR_EL1_INITIAL_VALUE);
+    __arm_wsr64("mdscr_el1", MSDCR_EL1_INITIAL_VALUE);
+    __isb(ARM_MB_SY);
 
     arch_enable_fiqs();
 }
diff --git a/kernel/arch/arm64/debugger.cpp b/kernel/arch/arm64/debugger.cpp
index cc27d31..0fc8e5e 100644
--- a/kernel/arch/arm64/debugger.cpp
+++ b/kernel/arch/arm64/debugger.cpp
@@ -158,8 +158,8 @@
     // debug registers.
     // TODO(ZX-3038): This should be exposed through a standard interface.
     //                Either the sysinfo fidl, the vDSO info mapping or some other mechanism.
-    out->hw_bps[AARCH64_MAX_HW_BREAKPOINTS - 1].dbgbvr = ARM64_READ_SYSREG(id_aa64dfr0_el1);
-    out->hw_bps[AARCH64_MAX_HW_BREAKPOINTS - 2].dbgbvr = ARM64_READ_SYSREG(mdscr_el1);
+    out->hw_bps[AARCH64_MAX_HW_BREAKPOINTS - 1].dbgbvr = __arm_rsr64("id_aa64dfr0_el1");
+    out->hw_bps[AARCH64_MAX_HW_BREAKPOINTS - 2].dbgbvr = __arm_rsr64("mdscr_el1");
 
     return ZX_OK;
 }
diff --git a/kernel/arch/arm64/exceptions_c.cpp b/kernel/arch/arm64/exceptions_c.cpp
index 8ee0d0a..9f459c7 100644
--- a/kernel/arch/arm64/exceptions_c.cpp
+++ b/kernel/arch/arm64/exceptions_c.cpp
@@ -158,7 +158,7 @@
 static void arm64_instruction_abort_handler(struct arm64_iframe_long* iframe, uint exception_flags,
                                             uint32_t esr) {
     /* read the FAR register */
-    uint64_t far = ARM64_READ_SYSREG(far_el1);
+    uint64_t far = __arm_rsr64("far_el1");
     uint32_t ec = BITS_SHIFT(esr, 31, 26);
     uint32_t iss = BITS(esr, 24, 0);
     bool is_user = !BIT(ec, 0);
@@ -198,7 +198,7 @@
 static void arm64_data_abort_handler(struct arm64_iframe_long* iframe, uint exception_flags,
                                      uint32_t esr) {
     /* read the FAR register */
-    uint64_t far = ARM64_READ_SYSREG(far_el1);
+    uint64_t far = __arm_rsr64("far_el1");
     uint32_t ec = BITS_SHIFT(esr, 31, 26);
     uint32_t iss = BITS(esr, 24, 0);
     bool is_user = !BIT(ec, 0);
diff --git a/kernel/arch/arm64/feature.cpp b/kernel/arch/arm64/feature.cpp
index 20f73ec..54065b6 100644
--- a/kernel/arch/arm64/feature.cpp
+++ b/kernel/arch/arm64/feature.cpp
@@ -34,7 +34,7 @@
 void arm64_get_cache_info(arm64_cache_info_t* info) {
     uint64_t temp = 0;
 
-    uint64_t sysreg = ARM64_READ_SYSREG(clidr_el1);
+    uint64_t sysreg = __arm_rsr64("clidr_el1");
     info->inner_boundary = (uint8_t)BITS_SHIFT(sysreg, 32, 30);
     info->lou_u = (uint8_t)BITS_SHIFT(sysreg, 29, 27);
     info->loc = (uint8_t)BITS_SHIFT(sysreg, 26, 24);
@@ -45,20 +45,23 @@
             info->level_data_type[i].ctype = 0;
             info->level_inst_type[i].ctype = 0;
         } else if (ctype == 4) {                               // Unified
-            ARM64_WRITE_SYSREG(CSSELR_EL1, (int64_t)(i << 1)); // Select cache level
-            temp = ARM64_READ_SYSREG(ccsidr_el1);
+            __arm_wsr64("csselr_el1", (int64_t)(i << 1)); // Select cache level
+            __isb(ARM_MB_SY);
+            temp = __arm_rsr64("ccsidr_el1");
             info->level_data_type[i].ctype = 4;
             parse_ccsid(&(info->level_data_type[i]), temp);
         } else {
             if (ctype & 0x02) {
-                ARM64_WRITE_SYSREG(CSSELR_EL1, (int64_t)(i << 1));
-                temp = ARM64_READ_SYSREG(ccsidr_el1);
+                __arm_wsr64("csselr_el1", (int64_t)(i << 1));
+                __isb(ARM_MB_SY);
+                temp = __arm_rsr64("ccsidr_el1");
                 info->level_data_type[i].ctype = 2;
                 parse_ccsid(&(info->level_data_type[i]), temp);
             }
             if (ctype & 0x01) {
-                ARM64_WRITE_SYSREG(CSSELR_EL1, (int64_t)(i << 1) | 0x01);
-                temp = ARM64_READ_SYSREG(ccsidr_el1);
+                __arm_wsr64("csselr_el1", (int64_t)(i << 1) | 0x01);
+                __isb(ARM_MB_SY);
+                temp = __arm_rsr64("ccsidr_el1");
                 info->level_inst_type[i].ctype = 1;
                 parse_ccsid(&(info->level_inst_type[i]), temp);
             }
@@ -163,11 +166,11 @@
 }
 
 static void print_cpu_info() {
-    uint32_t midr = (uint32_t)ARM64_READ_SYSREG(midr_el1);
+    uint32_t midr = (uint32_t)__arm_rsr64("midr_el1");
     char cpu_name[128];
     midr_to_core(midr, cpu_name, sizeof(cpu_name));
 
-    uint64_t mpidr = ARM64_READ_SYSREG(mpidr_el1);
+    uint64_t mpidr = __arm_rsr64("mpidr_el1");
 
     dprintf(INFO, "ARM cpu %u: midr %#x '%s' mpidr %#" PRIx64 " aff %u:%u:%u:%u\n",
             arch_curr_cpu_num(), midr, cpu_name, mpidr,
@@ -183,16 +186,16 @@
     cpu_num_t cpu = arch_curr_cpu_num();
     if (cpu == 0) {
         // read the block size of DC ZVA
-        uint64_t dczid = ARM64_READ_SYSREG(dczid_el0);
+        uint64_t dczid = __arm_rsr64("dczid_el0");
         uint32_t arm64_zva_shift = 0;
         if (BIT(dczid, 4) == 0) {
-            arm64_zva_shift = (uint32_t)(ARM64_READ_SYSREG(dczid_el0) & 0xf) + 2;
+            arm64_zva_shift = (uint32_t)(__arm_rsr64("dczid_el0") & 0xf) + 2;
         }
         ASSERT(arm64_zva_shift != 0); // for now, fail if DC ZVA is unavailable
         arm64_zva_size = (1u << arm64_zva_shift);
 
         // read the dcache and icache line size
-        uint64_t ctr = ARM64_READ_SYSREG(ctr_el0);
+        uint64_t ctr = __arm_rsr64("ctr_el0");
         uint32_t arm64_dcache_shift = (uint32_t)BITS_SHIFT(ctr, 19, 16) + 2;
         arm64_dcache_size = (1u << arm64_dcache_shift);
         uint32_t arm64_icache_shift = (uint32_t)BITS(ctr, 3, 0) + 2;
@@ -200,7 +203,7 @@
 
         // parse the ISA feature bits
         arm64_features |= ZX_HAS_CPU_FEATURES;
-        uint64_t isar0 = ARM64_READ_SYSREG(id_aa64isar0_el1);
+        uint64_t isar0 = __arm_rsr64("id_aa64isar0_el1");
         if (BITS_SHIFT(isar0, 7, 4) >= 1) {
             arm64_features |= ZX_ARM64_FEATURE_ISA_AES;
         }
@@ -235,12 +238,12 @@
             arm64_features |= ZX_ARM64_FEATURE_ISA_DP;
         }
 
-        uint64_t isar1 = ARM64_READ_SYSREG(id_aa64isar1_el1);
+        uint64_t isar1 = __arm_rsr64("id_aa64isar1_el1");
         if (BITS_SHIFT(isar1, 3, 0) >= 1) {
             arm64_features |= ZX_ARM64_FEATURE_ISA_DPB;
         }
 
-        uint64_t pfr0 = ARM64_READ_SYSREG(id_aa64pfr0_el1);
+        uint64_t pfr0 = __arm_rsr64("id_aa64pfr0_el1");
         if (BITS_SHIFT(pfr0, 19, 16) < 0b1111) {
             arm64_features |= ZX_ARM64_FEATURE_ISA_FP;
         }
@@ -253,7 +256,7 @@
     arm64_get_cache_info(&(cache_info[cpu]));
 
     // check to make sure implementation supports 16 bit asids
-    uint64_t mmfr0 = ARM64_READ_SYSREG(ID_AA64MMFR0_EL1);
+    uint64_t mmfr0 = __arm_rsr64("id_aa64mmfr0_el1");
     ASSERT((mmfr0 & ARM64_MMFR0_ASIDBITS_MASK) == ARM64_MMFR0_ASIDBITS_16);
 }
 
diff --git a/kernel/arch/arm64/fpu.cpp b/kernel/arch/arm64/fpu.cpp
index 5b79efa..a338622 100644
--- a/kernel/arch/arm64/fpu.cpp
+++ b/kernel/arch/arm64/fpu.cpp
@@ -90,7 +90,7 @@
 /* save fpu state if the thread had dirtied it and disable the fpu */
 __NO_SAFESTACK void arm64_fpu_context_switch(struct thread* oldthread,
                                              struct thread* newthread) {
-    uint64_t cpacr = ARM64_READ_SYSREG(cpacr_el1);
+    uint64_t cpacr = __arm_rsr64("cpacr_el1");
     if (is_fpu_enabled((uint32_t)cpacr)) {
         LTRACEF("saving state on thread %s\n", oldthread->name);
 
@@ -98,7 +98,8 @@
         arm64_fpu_save_state(oldthread);
 
         /* disable the fpu again */
-        ARM64_WRITE_SYSREG(cpacr_el1, cpacr & ~FPU_ENABLE_MASK);
+        __arm_wsr64("cpacr_el1", cpacr & ~FPU_ENABLE_MASK);
+        __isb(ARM_MB_SY);
     }
 }
 
@@ -109,12 +110,13 @@
     /* only valid to be called if exception came from lower level */
     DEBUG_ASSERT(exception_flags & ARM64_EXCEPTION_FLAG_LOWER_EL);
 
-    uint64_t cpacr = ARM64_READ_SYSREG(cpacr_el1);
+    uint64_t cpacr = __arm_rsr64("cpacr_el1");
     DEBUG_ASSERT(!is_fpu_enabled((uint32_t)cpacr));
 
     /* enable the fpu */
     cpacr |= FPU_ENABLE_MASK;
-    ARM64_WRITE_SYSREG(cpacr_el1, cpacr);
+    __arm_wsr64("cpacr_el1", cpacr);
+    __isb(ARM_MB_SY);
 
     /* load the state from the current cpu */
     thread_t* t = get_current_thread();
diff --git a/kernel/arch/arm64/hypervisor/el2_cpu_state.cpp b/kernel/arch/arm64/hypervisor/el2_cpu_state.cpp
index 6fb5dcd..23abada 100644
--- a/kernel/arch/arm64/hypervisor/el2_cpu_state.cpp
+++ b/kernel/arch/arm64/hypervisor/el2_cpu_state.cpp
@@ -43,7 +43,7 @@
                     MMU_PTE_L012_DESCRIPTOR_BLOCK;
     }
 
-    DMB;
+    __dmb(ARM_MB_SY);
     return ZX_OK;
 }
 
diff --git a/kernel/arch/arm64/hypervisor/vcpu.cpp b/kernel/arch/arm64/hypervisor/vcpu.cpp
index 855f61b..97ece59 100644
--- a/kernel/arch/arm64/hypervisor/vcpu.cpp
+++ b/kernel/arch/arm64/hypervisor/vcpu.cpp
@@ -178,7 +178,7 @@
     vcpu->gich_state_.apr = 0;
     vcpu->el2_state_->guest_state.system_state.elr_el2 = entry;
     vcpu->el2_state_->guest_state.system_state.spsr_el2 = kSpsrDaif | kSpsrEl1h;
-    uint64_t mpidr = ARM64_READ_SYSREG(mpidr_el1);
+    uint64_t mpidr = __arm_rsr64("mpidr_el1");
     vcpu->el2_state_->guest_state.system_state.vmpidr_el2 = vmpidr_of(vpid, mpidr);
     vcpu->el2_state_->host_state.system_state.vmpidr_el2 = mpidr;
     vcpu->hcr_ = HCR_EL2_VM | HCR_EL2_PTW | HCR_EL2_FMO | HCR_EL2_IMO | HCR_EL2_DC | HCR_EL2_TWI |
diff --git a/kernel/arch/arm64/include/arch/arch_ops.h b/kernel/arch/arm64/include/arch/arch_ops.h
index 23edbb0..2805b30 100644
--- a/kernel/arch/arm64/include/arch/arch_ops.h
+++ b/kernel/arch/arm64/include/arch/arch_ops.h
@@ -22,25 +22,17 @@
 #define ENABLE_CYCLE_COUNTER 1
 
 static inline void arch_spinloop_pause(void) {
-    __asm__ volatile("yield" ::
-                         : "memory");
+    __yield();
 }
 
-#define mb() __asm__ volatile("dsb sy" \
-                              :        \
-                              :        \
-                              : "memory")
-#define smp_mb() __asm__ volatile("dmb sy" \
-                                  :         \
-                                  :         \
-                                  : "memory")
+#define mb() __dsb(ARM_MB_SY)
+#define smp_mb() __dmb(ARM_MB_SY)
 
 static inline uint64_t arch_cycle_count(void) {
-    return ARM64_READ_SYSREG(pmccntr_el0);
+    return __arm_rsr64("pmccntr_el0");
 }
 
-static inline uint32_t arch_cpu_features(void)
-{
+static inline uint32_t arch_cpu_features(void) {
     return arm64_features;
 }
 
diff --git a/kernel/arch/arm64/include/arch/arm64.h b/kernel/arch/arm64/include/arch/arm64.h
index 5bd1703..d44679f 100644
--- a/kernel/arch/arm64/include/arch/arm64.h
+++ b/kernel/arch/arm64/include/arch/arm64.h
@@ -9,6 +9,7 @@
 
 #ifndef __ASSEMBLER__
 
+#include <arm_acle.h>
 #include <assert.h>
 #include <stdbool.h>
 #include <sys/types.h>
@@ -17,39 +18,27 @@
 #include <zircon/compiler.h>
 #include <zircon/types.h>
 
+// Constants from ACLE section 8.3, used as the argument for __dmb(), __dsb(), and __isb()
+// in arm_acle.h. Values are the architecturally defined immediate values encoded in barrier
+// instructions DMB, DSB, and ISB.
+#define ARM_MB_OSHLD    0x1
+#define ARM_MB_OSHST    0x2
+#define ARM_MB_OSH      0x3
+
+#define ARM_MB_NSHLD    0x5
+#define ARM_MB_NSHST    0x6
+#define ARM_MB_NSH      0x7
+
+#define ARM_MB_ISHLD    0x9
+#define ARM_MB_ISHST    0xa
+#define ARM_MB_ISH      0xb
+
+#define ARM_MB_LD       0xd
+#define ARM_MB_ST       0xe
+#define ARM_MB_SY       0xf
+
 __BEGIN_CDECLS
 
-#define DSB __asm__ volatile("dsb sy" :: \
-                                 : "memory")
-#define DSB_ISHST __asm__ volatile("dsb ishst" :: \
-                                       : "memory")
-#define DMB __asm__ volatile("dmb sy" :: \
-                                 : "memory")
-#define DMB_ISHST __asm__ volatile("dmb ishst" :: \
-                                       : "memory")
-#define ISB __asm__ volatile("isb" :: \
-                                 : "memory")
-
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-#define ARM64_READ_SYSREG(reg)                   \
-    ({                                           \
-        uint64_t _val;                           \
-        __asm__ volatile("mrs %0," TOSTRING(reg) \
-                         : "=r"(_val));          \
-        _val;                                    \
-    })
-
-#define ARM64_READ_SYSREG_32(reg) ((uint32_t)ARM64_READ_SYSREG(reg))
-
-#define ARM64_WRITE_SYSREG(reg, val)                               \
-    ({                                                             \
-        uint64_t _val = (val);                                     \
-        __asm__ volatile("msr " TOSTRING(reg) ", %0" ::"r"(_val)); \
-        ISB;                                                       \
-    })
-
 void arm64_context_switch(vaddr_t* old_sp, vaddr_t new_sp);
 void arm64_uspace_entry(uintptr_t arg1, uintptr_t arg2,
                         uintptr_t pc, uintptr_t sp,
diff --git a/kernel/arch/arm64/include/arch/arm64/mmu.h b/kernel/arch/arm64/include/arch/arm64/mmu.h
index 0ddc125..bd25803 100644
--- a/kernel/arch/arm64/include/arch/arm64/mmu.h
+++ b/kernel/arch/arm64/include/arch/arm64/mmu.h
@@ -386,13 +386,13 @@
 #define ARM64_TLBI_NOADDR(op)            \
     ({                                   \
         __asm__ volatile("tlbi " #op::); \
-        ISB;                             \
+        __isb(ARM_MB_SY);                \
     })
 
 #define ARM64_TLBI(op, val)                                          \
     ({                                                               \
         __asm__ volatile("tlbi " #op ", %0" ::"r"((uint64_t)(val))); \
-        ISB;                                                         \
+        __isb(ARM_MB_SY);                                            \
     })
 
 const size_t MMU_ARM64_ASID_BITS = 16;
diff --git a/kernel/arch/arm64/include/arch/current_thread.h b/kernel/arch/arm64/include/arch/current_thread.h
index a0117f9..c85d6a5 100644
--- a/kernel/arch/arm64/include/arch/current_thread.h
+++ b/kernel/arch/arm64/include/arch/current_thread.h
@@ -18,14 +18,15 @@
     // which conceivably could let it optimize better.
     char* tp = (char*)__builtin_thread_pointer();
 #else
-    char* tp = (char*)ARM64_READ_SYSREG(tpidr_el1);
+    char* tp = (char*)__arm_rsr64("tpidr_el1");
 #endif
     tp -= offsetof(struct thread, arch.thread_pointer_location);
     return (struct thread*)tp;
 }
 
 static inline void set_current_thread(struct thread* t) {
-    ARM64_WRITE_SYSREG(tpidr_el1, (uint64_t)&t->arch.thread_pointer_location);
+    __arm_wsr64("tpidr_el1", (uint64_t)&t->arch.thread_pointer_location);
+    __isb(ARM_MB_SY);
 }
 
 __END_CDECLS
diff --git a/kernel/arch/arm64/mmu.cpp b/kernel/arch/arm64/mmu.cpp
index d7bb9fc..4c460a6 100644
--- a/kernel/arch/arm64/mmu.cpp
+++ b/kernel/arch/arm64/mmu.cpp
@@ -448,7 +448,7 @@
         memset(vaddr, MMU_PTE_DESCRIPTOR_INVALID, 1U << page_size_shift);
 
         // ensure that the zeroing is observable from hardware page table walkers
-        DMB_ISHST;
+        __dmb(ARM_MB_ISHST);
 
         pte = paddr | MMU_PTE_L012_DESCRIPTOR_TABLE;
         page_table[index] = pte;
@@ -552,7 +552,7 @@
                 page_table[index] = MMU_PTE_DESCRIPTOR_INVALID;
 
                 // ensure that the update is observable from hardware page table walkers
-                DMB_ISHST;
+                __dmb(ARM_MB_ISHST);
 
                 // flush the non terminal TLB entry
                 FlushTLBEntry(vaddr, false);
@@ -565,7 +565,7 @@
             page_table[index] = MMU_PTE_DESCRIPTOR_INVALID;
 
             // ensure that the update is observable from hardware page table walkers
-            DMB_ISHST;
+            __dmb(ARM_MB_ISHST);
 
             // flush the terminal TLB entry
             FlushTLBEntry(vaddr, true);
@@ -720,7 +720,7 @@
             page_table[index] = pte;
 
             // ensure that the update is observable from hardware page table walkers
-            DMB_ISHST;
+            __dmb(ARM_MB_ISHST);
 
             // flush the terminal TLB entry
             FlushTLBEntry(vaddr, true);
@@ -763,7 +763,7 @@
     LOCAL_KTRACE64("mmu map", (vaddr & ~PAGE_MASK) | ((size >> PAGE_SIZE_SHIFT) & PAGE_MASK));
     ssize_t ret = MapPageTable(vaddr, vaddr_rel, paddr, size, attrs,
                                top_index_shift, page_size_shift, tt_virt_);
-    DSB;
+    __dsb(ARM_MB_SY);
     return ret;
 }
 
@@ -787,7 +787,7 @@
 
     ssize_t ret = UnmapPageTable(vaddr, vaddr_rel, size, top_index_shift,
                                  page_size_shift, tt_virt_);
-    DSB;
+    __dsb(ARM_MB_SY);
     return ret;
 }
 
@@ -812,7 +812,7 @@
     zx_status_t ret = ProtectPageTable(vaddr, vaddr_rel, size, attrs,
                                        top_index_shift, page_size_shift,
                                        tt_virt_);
-    DSB;
+    __dsb(ARM_MB_SY);
     return ret;
 }
 
@@ -1129,7 +1129,8 @@
 
         tcr = MMU_TCR_FLAGS_USER;
         ttbr = ((uint64_t)aspace->asid_ << 48) | aspace->tt_phys_;
-        ARM64_WRITE_SYSREG(ttbr0_el1, ttbr);
+        __arm_wsr64("ttbr0_el1", ttbr);
+        __isb(ARM_MB_SY);
 
         if (TRACE_CONTEXT_SWITCH)
             TRACEF("ttbr %#" PRIx64 ", tcr %#" PRIx64 "\n", ttbr, tcr);
@@ -1141,7 +1142,8 @@
             TRACEF("tcr %#" PRIx64 "\n", tcr);
     }
 
-    ARM64_WRITE_SYSREG(tcr_el1, tcr);
+    __arm_wsr64("tcr_el1", tcr);
+    __isb(ARM_MB_SY);
 }
 
 void arch_zero_page(void* _ptr) {
@@ -1178,8 +1180,7 @@
         }
     }
 
-    uint64_t par;
-    par = ARM64_READ_SYSREG(par_el1);
+    uint64_t par = __arm_rsr64("par_el1");
 
     arch_interrupt_restore(state, ARCH_DEFAULT_SPIN_LOCK_FLAG_INTERRUPTS);
 
diff --git a/kernel/arch/arm64/mp.cpp b/kernel/arch/arm64/mp.cpp
index 744b74e..d8707f9 100644
--- a/kernel/arch/arm64/mp.cpp
+++ b/kernel/arch/arm64/mp.cpp
@@ -60,7 +60,7 @@
 
 // do the 'slow' lookup by mpidr to cpu number
 static uint arch_curr_cpu_num_slow() {
-    uint64_t mpidr = ARM64_READ_SYSREG(mpidr_el1);
+    uint64_t mpidr = __arm_rsr64("mpidr_el1");
     uint cluster = (mpidr & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT;
     uint cpu = (mpidr & MPIDR_AFF0_MASK) >> MPIDR_AFF0_SHIFT;
 
diff --git a/kernel/arch/arm64/registers.cpp b/kernel/arch/arm64/registers.cpp
index c3c972b..5ca9f07 100644
--- a/kernel/arch/arm64/registers.cpp
+++ b/kernel/arch/arm64/registers.cpp
@@ -11,15 +11,17 @@
 void arm64_disable_debug_state() {
     // The KDE bit enables and disables debug exceptions for the current execution.
     // Instruction Breakpoint Exceptions (software breakpoints) cannot be deactivated.
-    uint32_t mdscr_val = ARM64_READ_SYSREG_32(mdscr_el1) & ~ARM64_MDSCR_EL1_KDE;
-    ARM64_WRITE_SYSREG(mdscr_el1, mdscr_val);
+    uint32_t mdscr_val = __arm_rsr("mdscr_el1") & ~ARM64_MDSCR_EL1_KDE;
+    __arm_wsr("mdscr_el1", mdscr_val);
+    __isb(ARM_MB_SY);
 }
 
 void arm64_enable_debug_state() {
     // The KDE bit enables and disables debug exceptions for the current execution.
     // Instruction Breakpoint Exceptions (software breakpoints) cannot be deactivated.
-    uint32_t mdscr_val = ARM64_READ_SYSREG_32(mdscr_el1) | ARM64_MDSCR_EL1_KDE;
-    ARM64_WRITE_SYSREG(mdscr_el1, mdscr_val);
+    uint32_t mdscr_val = __arm_rsr("mdscr_el1") | ARM64_MDSCR_EL1_KDE;
+    __arm_wsr("mdscr_el1", mdscr_val);
+    __isb(ARM_MB_SY);
 }
 
 bool arm64_validate_debug_state(arm64_debug_state_t* state) {
@@ -41,7 +43,7 @@
 
 uint8_t arm64_hw_breakpoint_count() {
     // TODO(donoso): Eventually this should be cached as a boot time constant.
-    uint64_t dfr0 = ARM64_READ_SYSREG(id_aa64dfr0_el1);
+    uint64_t dfr0 = __arm_rsr64("id_aa64dfr0_el1");
     uint8_t count = (uint8_t)(((dfr0 & ARM64_ID_AADFR0_EL1_BRPS) >>
                                ARM64_ID_AADFR0_EL1_BRPS_SHIFT) +
                               1lu);
@@ -58,68 +60,68 @@
 
     switch (index) {
     case 0:
-        debug_state->hw_bps[0].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr0_el1);
-        debug_state->hw_bps[0].dbgbvr = ARM64_READ_SYSREG(dbgbvr0_el1);
+        debug_state->hw_bps[0].dbgbcr = __arm_rsr("dbgbcr0_el1");
+        debug_state->hw_bps[0].dbgbvr = __arm_rsr64("dbgbvr0_el1");
         break;
     case 1:
-        debug_state->hw_bps[1].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr1_el1);
-        debug_state->hw_bps[1].dbgbvr = ARM64_READ_SYSREG(dbgbvr1_el1);
+        debug_state->hw_bps[1].dbgbcr = __arm_rsr("dbgbcr1_el1");
+        debug_state->hw_bps[1].dbgbvr = __arm_rsr64("dbgbvr1_el1");
         break;
     case 2:
-        debug_state->hw_bps[2].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr2_el1);
-        debug_state->hw_bps[2].dbgbvr = ARM64_READ_SYSREG(dbgbvr2_el1);
+        debug_state->hw_bps[2].dbgbcr = __arm_rsr("dbgbcr2_el1");
+        debug_state->hw_bps[2].dbgbvr = __arm_rsr64("dbgbvr2_el1");
         break;
     case 3:
-        debug_state->hw_bps[3].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr3_el1);
-        debug_state->hw_bps[3].dbgbvr = ARM64_READ_SYSREG(dbgbvr3_el1);
+        debug_state->hw_bps[3].dbgbcr = __arm_rsr("dbgbcr3_el1");
+        debug_state->hw_bps[3].dbgbvr = __arm_rsr64("dbgbvr3_el1");
         break;
     case 4:
-        debug_state->hw_bps[4].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr4_el1);
-        debug_state->hw_bps[4].dbgbvr = ARM64_READ_SYSREG(dbgbvr4_el1);
+        debug_state->hw_bps[4].dbgbcr = __arm_rsr("dbgbcr4_el1");
+        debug_state->hw_bps[4].dbgbvr = __arm_rsr64("dbgbvr4_el1");
         break;
     case 5:
-        debug_state->hw_bps[5].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr5_el1);
-        debug_state->hw_bps[5].dbgbvr = ARM64_READ_SYSREG(dbgbvr5_el1);
+        debug_state->hw_bps[5].dbgbcr = __arm_rsr("dbgbcr5_el1");
+        debug_state->hw_bps[5].dbgbvr = __arm_rsr64("dbgbvr5_el1");
         break;
     case 6:
-        debug_state->hw_bps[6].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr6_el1);
-        debug_state->hw_bps[6].dbgbvr = ARM64_READ_SYSREG(dbgbvr6_el1);
+        debug_state->hw_bps[6].dbgbcr = __arm_rsr("dbgbcr6_el1");
+        debug_state->hw_bps[6].dbgbvr = __arm_rsr64("dbgbvr6_el1");
         break;
     case 7:
-        debug_state->hw_bps[7].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr7_el1);
-        debug_state->hw_bps[7].dbgbvr = ARM64_READ_SYSREG(dbgbvr7_el1);
+        debug_state->hw_bps[7].dbgbcr = __arm_rsr("dbgbcr7_el1");
+        debug_state->hw_bps[7].dbgbvr = __arm_rsr64("dbgbvr7_el1");
         break;
     case 8:
-        debug_state->hw_bps[8].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr8_el1);
-        debug_state->hw_bps[8].dbgbvr = ARM64_READ_SYSREG(dbgbvr8_el1);
+        debug_state->hw_bps[8].dbgbcr = __arm_rsr("dbgbcr8_el1");
+        debug_state->hw_bps[8].dbgbvr = __arm_rsr64("dbgbvr8_el1");
         break;
     case 9:
-        debug_state->hw_bps[9].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr9_el1);
-        debug_state->hw_bps[9].dbgbvr = ARM64_READ_SYSREG(dbgbvr9_el1);
+        debug_state->hw_bps[9].dbgbcr = __arm_rsr("dbgbcr9_el1");
+        debug_state->hw_bps[9].dbgbvr = __arm_rsr64("dbgbvr9_el1");
         break;
     case 10:
-        debug_state->hw_bps[10].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr10_el1);
-        debug_state->hw_bps[10].dbgbvr = ARM64_READ_SYSREG(dbgbvr10_el1);
+        debug_state->hw_bps[10].dbgbcr = __arm_rsr("dbgbcr10_el1");
+        debug_state->hw_bps[10].dbgbvr = __arm_rsr64("dbgbvr10_el1");
         break;
     case 11:
-        debug_state->hw_bps[11].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr11_el1);
-        debug_state->hw_bps[11].dbgbvr = ARM64_READ_SYSREG(dbgbvr11_el1);
+        debug_state->hw_bps[11].dbgbcr = __arm_rsr("dbgbcr11_el1");
+        debug_state->hw_bps[11].dbgbvr = __arm_rsr64("dbgbvr11_el1");
         break;
     case 12:
-        debug_state->hw_bps[12].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr12_el1);
-        debug_state->hw_bps[12].dbgbvr = ARM64_READ_SYSREG(dbgbvr12_el1);
+        debug_state->hw_bps[12].dbgbcr = __arm_rsr("dbgbcr12_el1");
+        debug_state->hw_bps[12].dbgbvr = __arm_rsr64("dbgbvr12_el1");
         break;
     case 13:
-        debug_state->hw_bps[13].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr13_el1);
-        debug_state->hw_bps[13].dbgbvr = ARM64_READ_SYSREG(dbgbvr13_el1);
+        debug_state->hw_bps[13].dbgbcr = __arm_rsr("dbgbcr13_el1");
+        debug_state->hw_bps[13].dbgbvr = __arm_rsr64("dbgbvr13_el1");
         break;
     case 14:
-        debug_state->hw_bps[14].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr14_el1);
-        debug_state->hw_bps[14].dbgbvr = ARM64_READ_SYSREG(dbgbvr14_el1);
+        debug_state->hw_bps[14].dbgbcr = __arm_rsr("dbgbcr14_el1");
+        debug_state->hw_bps[14].dbgbvr = __arm_rsr64("dbgbvr14_el1");
         break;
     case 15:
-        debug_state->hw_bps[15].dbgbcr = ARM64_READ_SYSREG_32(dbgbcr15_el1);
-        debug_state->hw_bps[15].dbgbvr = ARM64_READ_SYSREG(dbgbvr15_el1);
+        debug_state->hw_bps[15].dbgbcr = __arm_rsr("dbgbcr15_el1");
+        debug_state->hw_bps[15].dbgbvr = __arm_rsr64("dbgbvr15_el1");
         break;
     default:
         DEBUG_ASSERT(false);
@@ -141,68 +143,100 @@
 
     switch (index) {
     case 0:
-        ARM64_WRITE_SYSREG(dbgbcr0_el1, debug_state->hw_bps[0].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr0_el1, debug_state->hw_bps[0].dbgbvr);
+        __arm_wsr("dbgbcr0_el1", debug_state->hw_bps[0].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr0_el1", debug_state->hw_bps[0].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 1:
-        ARM64_WRITE_SYSREG(dbgbcr1_el1, debug_state->hw_bps[1].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr1_el1, debug_state->hw_bps[1].dbgbvr);
+        __arm_wsr("dbgbcr1_el1", debug_state->hw_bps[1].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr1_el1", debug_state->hw_bps[1].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 2:
-        ARM64_WRITE_SYSREG(dbgbcr2_el1, debug_state->hw_bps[2].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr2_el1, debug_state->hw_bps[2].dbgbvr);
+        __arm_wsr("dbgbcr2_el1", debug_state->hw_bps[2].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr2_el1", debug_state->hw_bps[2].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 3:
-        ARM64_WRITE_SYSREG(dbgbcr3_el1, debug_state->hw_bps[3].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr3_el1, debug_state->hw_bps[3].dbgbvr);
+        __arm_wsr("dbgbcr3_el1", debug_state->hw_bps[3].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr3_el1", debug_state->hw_bps[3].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 4:
-        ARM64_WRITE_SYSREG(dbgbcr4_el1, debug_state->hw_bps[4].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr4_el1, debug_state->hw_bps[4].dbgbvr);
+        __arm_wsr("dbgbcr4_el1", debug_state->hw_bps[4].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr4_el1", debug_state->hw_bps[4].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 5:
-        ARM64_WRITE_SYSREG(dbgbcr5_el1, debug_state->hw_bps[5].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr5_el1, debug_state->hw_bps[5].dbgbvr);
+        __arm_wsr("dbgbcr5_el1", debug_state->hw_bps[5].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr5_el1", debug_state->hw_bps[5].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 6:
-        ARM64_WRITE_SYSREG(dbgbcr6_el1, debug_state->hw_bps[6].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr6_el1, debug_state->hw_bps[6].dbgbvr);
+        __arm_wsr("dbgbcr6_el1", debug_state->hw_bps[6].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr6_el1", debug_state->hw_bps[6].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 7:
-        ARM64_WRITE_SYSREG(dbgbcr7_el1, debug_state->hw_bps[7].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr7_el1, debug_state->hw_bps[7].dbgbvr);
+        __arm_wsr("dbgbcr7_el1", debug_state->hw_bps[7].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr7_el1", debug_state->hw_bps[7].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 8:
-        ARM64_WRITE_SYSREG(dbgbcr8_el1, debug_state->hw_bps[8].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr8_el1, debug_state->hw_bps[8].dbgbvr);
+        __arm_wsr("dbgbcr8_el1", debug_state->hw_bps[8].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr8_el1", debug_state->hw_bps[8].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 9:
-        ARM64_WRITE_SYSREG(dbgbcr9_el1, debug_state->hw_bps[9].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr9_el1, debug_state->hw_bps[9].dbgbvr);
+        __arm_wsr("dbgbcr9_el1", debug_state->hw_bps[9].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr9_el1", debug_state->hw_bps[9].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 10:
-        ARM64_WRITE_SYSREG(dbgbcr10_el1, debug_state->hw_bps[10].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr10_el1, debug_state->hw_bps[10].dbgbvr);
+        __arm_wsr("dbgbcr10_el1", debug_state->hw_bps[10].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr10_el1", debug_state->hw_bps[10].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 11:
-        ARM64_WRITE_SYSREG(dbgbcr11_el1, debug_state->hw_bps[11].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr11_el1, debug_state->hw_bps[11].dbgbvr);
+        __arm_wsr("dbgbcr11_el1", debug_state->hw_bps[11].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr11_el1", debug_state->hw_bps[11].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 12:
-        ARM64_WRITE_SYSREG(dbgbcr12_el1, debug_state->hw_bps[12].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr12_el1, debug_state->hw_bps[12].dbgbvr);
+        __arm_wsr("dbgbcr12_el1", debug_state->hw_bps[12].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr12_el1", debug_state->hw_bps[12].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 13:
-        ARM64_WRITE_SYSREG(dbgbcr13_el1, debug_state->hw_bps[13].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr13_el1, debug_state->hw_bps[13].dbgbvr);
+        __arm_wsr("dbgbcr13_el1", debug_state->hw_bps[13].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr13_el1", debug_state->hw_bps[13].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 14:
-        ARM64_WRITE_SYSREG(dbgbcr14_el1, debug_state->hw_bps[14].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr14_el1, debug_state->hw_bps[14].dbgbvr);
+        __arm_wsr("dbgbcr14_el1", debug_state->hw_bps[14].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr14_el1", debug_state->hw_bps[14].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     case 15:
-        ARM64_WRITE_SYSREG(dbgbcr15_el1, debug_state->hw_bps[15].dbgbcr);
-        ARM64_WRITE_SYSREG(dbgbvr15_el1, debug_state->hw_bps[15].dbgbvr);
+        __arm_wsr("dbgbcr15_el1", debug_state->hw_bps[15].dbgbcr);
+        __isb(ARM_MB_SY);
+        __arm_wsr64("dbgbvr15_el1", debug_state->hw_bps[15].dbgbvr);
+        __isb(ARM_MB_SY);
         break;
     default:
         DEBUG_ASSERT(false);
diff --git a/kernel/arch/arm64/sysreg.cpp b/kernel/arch/arm64/sysreg.cpp
index 1b8126b..73b7fc4 100644
--- a/kernel/arch/arm64/sysreg.cpp
+++ b/kernel/arch/arm64/sysreg.cpp
@@ -14,42 +14,42 @@
 #if ARCH_ARM64
 #include <lib/console.h>
 
-#define SYSREG_READ_COMMAND(sysreg_string)                                      \
-    if (!strncasecmp(regname, #sysreg_string, sizeof(#sysreg_string))) {        \
-        printf(#sysreg_string " = %016lx\n", ARM64_READ_SYSREG(sysreg_string)); \
-        return 0;                                                               \
+#define SYSREG_READ_COMMAND(sysreg_string)                               \
+    if (!strncasecmp(regname, sysreg_string, sizeof(sysreg_string))) {   \
+        printf(sysreg_string " = %016lx\n", __arm_rsr64(sysreg_string)); \
+        return 0;                                                        \
     } else
 
 static uint64_t read_sysregs(const char* regname) {
-    SYSREG_READ_COMMAND(ACTLR_EL1)
-    SYSREG_READ_COMMAND(CCSIDR_EL1)
-    SYSREG_READ_COMMAND(CLIDR_EL1)
-    SYSREG_READ_COMMAND(CSSELR_EL1)
-    SYSREG_READ_COMMAND(MIDR_EL1)
-    SYSREG_READ_COMMAND(MPIDR_EL1)
-    SYSREG_READ_COMMAND(SCTLR_EL1)
-    SYSREG_READ_COMMAND(SPSR_EL1)
-    SYSREG_READ_COMMAND(TCR_EL1)
-    SYSREG_READ_COMMAND(TPIDRRO_EL0)
-    SYSREG_READ_COMMAND(TPIDR_EL1)
-    SYSREG_READ_COMMAND(TTBR0_EL1)
-    SYSREG_READ_COMMAND(TTBR1_EL1)
-    SYSREG_READ_COMMAND(VBAR_EL1)
+    SYSREG_READ_COMMAND("actlr_el1")
+    SYSREG_READ_COMMAND("ccsidr_el1")
+    SYSREG_READ_COMMAND("clidr_el1")
+    SYSREG_READ_COMMAND("csselr_el1")
+    SYSREG_READ_COMMAND("midr_el1")
+    SYSREG_READ_COMMAND("mpidr_el1")
+    SYSREG_READ_COMMAND("sctlr_el1")
+    SYSREG_READ_COMMAND("spsr_el1")
+    SYSREG_READ_COMMAND("tcr_el1")
+    SYSREG_READ_COMMAND("tpidrro_el0")
+    SYSREG_READ_COMMAND("tpidr_el1")
+    SYSREG_READ_COMMAND("ttbr0_el1")
+    SYSREG_READ_COMMAND("ttbr1_el1")
+    SYSREG_READ_COMMAND("vbar_el1")
 
     //Generic Timer regs
-    SYSREG_READ_COMMAND(CNTFRQ_EL0)
-    SYSREG_READ_COMMAND(CNTKCTL_EL1)
-    SYSREG_READ_COMMAND(CNTPCT_EL0)
-    SYSREG_READ_COMMAND(CNTPS_CTL_EL1)
-    SYSREG_READ_COMMAND(CNTPS_CVAL_EL1)
-    SYSREG_READ_COMMAND(CNTPS_TVAL_EL1)
-    SYSREG_READ_COMMAND(CNTP_CTL_EL0)
-    SYSREG_READ_COMMAND(CNTP_CVAL_EL0)
-    SYSREG_READ_COMMAND(CNTP_TVAL_EL0)
-    SYSREG_READ_COMMAND(CNTVCT_EL0)
-    SYSREG_READ_COMMAND(CNTV_CTL_EL0)
-    SYSREG_READ_COMMAND(CNTV_CVAL_EL0)
-    SYSREG_READ_COMMAND(CNTV_TVAL_EL0) {
+    SYSREG_READ_COMMAND("cntfrq_el0")
+    SYSREG_READ_COMMAND("cntkctl_el1")
+    SYSREG_READ_COMMAND("cntpct_el0")
+    SYSREG_READ_COMMAND("cntps_ctl_el1")
+    SYSREG_READ_COMMAND("cntps_cval_el1")
+    SYSREG_READ_COMMAND("cntps_tval_el1")
+    SYSREG_READ_COMMAND("cntp_ctl_el0")
+    SYSREG_READ_COMMAND("cntp_cval_el0")
+    SYSREG_READ_COMMAND("cntp_tval_el0")
+    SYSREG_READ_COMMAND("cntvct_el0")
+    SYSREG_READ_COMMAND("cntv_ctl_el0")
+    SYSREG_READ_COMMAND("cntv_cval_el0")
+    SYSREG_READ_COMMAND("cntv_tval_el0") {
         printf("Could not find register %s in list (you may need to add it to kernel/kernel/sysreg.c)\n", regname);
     }
     return 0;
diff --git a/kernel/arch/arm64/thread.cpp b/kernel/arch/arm64/thread.cpp
index 6e3b5d3..5ee9bfd 100644
--- a/kernel/arch/arm64/thread.cpp
+++ b/kernel/arch/arm64/thread.cpp
@@ -100,7 +100,7 @@
 __NO_SAFESTACK void arch_context_switch(thread_t* oldthread,
                                         thread_t* newthread) {
     LTRACEF("old %p (%s), new %p (%s)\n", oldthread, oldthread->name, newthread, newthread->name);
-    DSB; /* broadcast tlb operations in case the thread moves to another cpu */
+    __dsb(ARM_MB_SY); /* broadcast tlb operations in case the thread moves to another cpu */
 
     /* set the current cpu pointer in the new thread's structure so it can be
      * restored on exception entry.
diff --git a/kernel/dev/interrupt/arm_gic/v3/arm_gicv3.cpp b/kernel/dev/interrupt/arm_gic/v3/arm_gicv3.cpp
index 6b9a517..0e1b94f 100644
--- a/kernel/dev/interrupt/arm_gic/v3/arm_gicv3.cpp
+++ b/kernel/dev/interrupt/arm_gic/v3/arm_gicv3.cpp
@@ -159,7 +159,7 @@
     // disable the distributor
     GICREG(0, GICD_CTLR) = 0;
     gic_wait_for_rwp(GICD_CTLR);
-    ISB;
+    __isb(ARM_MB_SY);
 
     // distributor config: mask and clear all spis, set group 1.
     uint i;
@@ -193,7 +193,7 @@
     gic_init_percpu_early();
 
     mb();
-    ISB;
+    __isb(ARM_MB_SY);
 
     return ZX_OK;
 }
diff --git a/kernel/dev/interrupt/arm_gic/v3/include/dev/interrupt/arm_gicv3_regs.h b/kernel/dev/interrupt/arm_gic/v3/include/dev/interrupt/arm_gicv3_regs.h
index 7bd7ec6..65c256f 100644
--- a/kernel/dev/interrupt/arm_gic/v3/include/dev/interrupt/arm_gicv3_regs.h
+++ b/kernel/dev/interrupt/arm_gic/v3/include/dev/interrupt/arm_gicv3_regs.h
@@ -106,18 +106,18 @@
 
 static inline void gic_write_ctlr(uint32_t val) {
     __asm__ volatile("msr " ICC_CTLR_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
+    __isb(ARM_MB_SY);
 }
 
 static inline void gic_write_pmr(uint32_t val) {
     __asm__ volatile("msr " ICC_PMR_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
-    DSB;
+    __isb(ARM_MB_SY);
+    __dsb(ARM_MB_SY);
 }
 
 static inline void gic_write_igrpen(uint32_t val) {
     __asm__ volatile("msr " ICC_IGRPEN1_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
+    __isb(ARM_MB_SY);
 }
 
 static inline uint32_t gic_read_sre(void) {
@@ -128,28 +128,28 @@
 
 static inline void gic_write_sre(uint32_t val) {
     __asm__ volatile("msr " ICC_SRE_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
+    __isb(ARM_MB_SY);
 }
 
 static inline void gic_write_eoir(uint32_t val) {
     __asm__ volatile("msr " ICC_EOIR1_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
+    __isb(ARM_MB_SY);
 }
 
 static inline void gic_write_dir(uint32_t val) {
     __asm__ volatile("msr " ICC_DIR_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
+    __isb(ARM_MB_SY);
 }
 
 static inline uint32_t gic_read_iar() {
     uint64_t temp;
     __asm__ volatile("mrs %0, " ICC_IAR1_EL1 : "=r"(temp));
-    DSB;
+    __dsb(ARM_MB_SY);
     return (uint32_t)temp;
 }
 
 static inline void gic_write_sgi1r(uint64_t val) {
     __asm__ volatile("msr " ICC_SGI1R_EL1 ", %0" :: "r"((uint64_t)val));
-    ISB;
-    DSB;
+    __isb(ARM_MB_SY);
+    __dsb(ARM_MB_SY);
 }
diff --git a/kernel/dev/timer/arm_generic/arm_generic_timer.cpp b/kernel/dev/timer/arm_generic/arm_generic_timer.cpp
index 8b4e1c1..d43c0f6 100644
--- a/kernel/dev/timer/arm_generic/arm_generic_timer.cpp
+++ b/kernel/dev/timer/arm_generic/arm_generic_timer.cpp
@@ -23,24 +23,24 @@
 #define LOCAL_TRACE 0
 
 /* CNTFRQ AArch64 register */
-#define TIMER_REG_CNTFRQ cntfrq_el0
+#define TIMER_REG_CNTFRQ "cntfrq_el0"
 
 /* CNTP AArch64 registers */
-#define TIMER_REG_CNTP_CTL cntp_ctl_el0
-#define TIMER_REG_CNTP_CVAL cntp_cval_el0
-#define TIMER_REG_CNTP_TVAL cntp_tval_el0
-#define TIMER_REG_CNTPCT cntpct_el0
+#define TIMER_REG_CNTP_CTL "cntp_ctl_el0"
+#define TIMER_REG_CNTP_CVAL "cntp_cval_el0"
+#define TIMER_REG_CNTP_TVAL "cntp_tval_el0"
+#define TIMER_REG_CNTPCT "cntpct_el0"
 
-/* CNTPS AArch64 registers */
-#define TIMER_REG_CNTPS_CTL cntps_ctl_el1
-#define TIMER_REG_CNTPS_CVAL cntps_cval_el1
-#define TIMER_REG_CNTPS_TVAL cntps_tval_el1
+/* CNTPS AArch64 registers */
+#define TIMER_REG_CNTPS_CTL "cntps_ctl_el1"
+#define TIMER_REG_CNTPS_CVAL "cntps_cval_el1"
+#define TIMER_REG_CNTPS_TVAL "cntps_tval_el1"
 
-/* CNTV AArch64 registers */
-#define TIMER_REG_CNTV_CTL cntv_ctl_el0
-#define TIMER_REG_CNTV_CVAL cntv_cval_el0
-#define TIMER_REG_CNTV_TVAL cntv_tval_el0
-#define TIMER_REG_CNTVCT cntvct_el0
+/* CNTV AArch64 registers */
+#define TIMER_REG_CNTV_CTL "cntv_ctl_el0"
+#define TIMER_REG_CNTV_CVAL "cntv_cval_el0"
+#define TIMER_REG_CNTV_TVAL "cntv_tval_el0"
+#define TIMER_REG_CNTVCT "cntvct_el0"
 
 static int timer_irq;
 
@@ -59,77 +59,86 @@
 static uint32_t read_cntfrq(void) {
     uint32_t cntfrq;
 
-    cntfrq = ARM64_READ_SYSREG_32(TIMER_REG_CNTFRQ);
+    cntfrq = __arm_rsr(TIMER_REG_CNTFRQ);
     LTRACEF("cntfrq: 0x%08x, %u\n", cntfrq, cntfrq);
     return cntfrq;
 }
 
 static uint32_t read_cntp_ctl(void) {
-    return ARM64_READ_SYSREG_32(TIMER_REG_CNTP_CTL);
+    return __arm_rsr(TIMER_REG_CNTP_CTL);
 }
 
 static uint32_t read_cntv_ctl(void) {
-    return ARM64_READ_SYSREG_32(TIMER_REG_CNTV_CTL);
+    return __arm_rsr(TIMER_REG_CNTV_CTL);
 }
 
 static uint32_t read_cntps_ctl(void) {
-    return ARM64_READ_SYSREG_32(TIMER_REG_CNTPS_CTL);
+    return __arm_rsr(TIMER_REG_CNTPS_CTL);
 }
 
 static void write_cntp_ctl(uint32_t val) {
     LTRACEF_LEVEL(3, "cntp_ctl: 0x%x %x\n", val, read_cntp_ctl());
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTP_CTL, val);
+    __arm_wsr(TIMER_REG_CNTP_CTL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntv_ctl(uint32_t val) {
     LTRACEF_LEVEL(3, "cntv_ctl: 0x%x %x\n", val, read_cntv_ctl());
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTV_CTL, val);
+    __arm_wsr(TIMER_REG_CNTV_CTL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntps_ctl(uint32_t val) {
     LTRACEF_LEVEL(3, "cntps_ctl: 0x%x %x\n", val, read_cntps_ctl());
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTPS_CTL, val);
+    __arm_wsr(TIMER_REG_CNTPS_CTL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntp_cval(uint64_t val) {
     LTRACEF_LEVEL(3, "cntp_cval: 0x%016" PRIx64 ", %" PRIu64 "\n",
                   val, val);
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTP_CVAL, val);
+    __arm_wsr64(TIMER_REG_CNTP_CVAL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntv_cval(uint64_t val) {
     LTRACEF_LEVEL(3, "cntv_cval: 0x%016" PRIx64 ", %" PRIu64 "\n",
                   val, val);
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTV_CVAL, val);
+    __arm_wsr64(TIMER_REG_CNTV_CVAL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntps_cval(uint64_t val) {
     LTRACEF_LEVEL(3, "cntps_cval: 0x%016" PRIx64 ", %" PRIu64 "\n",
                   val, val);
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTPS_CVAL, val);
+    __arm_wsr64(TIMER_REG_CNTPS_CVAL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntp_tval(int32_t val) {
     LTRACEF_LEVEL(3, "cntp_tval: %d\n", val);
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTP_TVAL, val);
+    __arm_wsr(TIMER_REG_CNTP_TVAL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntv_tval(int32_t val) {
     LTRACEF_LEVEL(3, "cntv_tval: %d\n", val);
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTV_TVAL, val);
+    __arm_wsr(TIMER_REG_CNTV_TVAL, val);
+    __isb(ARM_MB_SY);
 }
 
 static void write_cntps_tval(int32_t val) {
     LTRACEF_LEVEL(3, "cntps_tval: %d\n", val);
-    ARM64_WRITE_SYSREG(TIMER_REG_CNTPS_TVAL, val);
+    __arm_wsr(TIMER_REG_CNTPS_TVAL, val);
+    __isb(ARM_MB_SY);
 }
 
 static uint64_t read_cntpct(void) {
-    return ARM64_READ_SYSREG(TIMER_REG_CNTPCT);
+    return __arm_rsr64(TIMER_REG_CNTPCT);
 }
 
 static uint64_t read_cntvct(void) {
-    return ARM64_READ_SYSREG(TIMER_REG_CNTVCT);
+    return __arm_rsr64(TIMER_REG_CNTVCT);
 }
 
 struct timer_reg_procs {
diff --git a/kernel/include/arm_acle.h b/kernel/include/arm_acle.h
new file mode 100644
index 0000000..03e5145
--- /dev/null
+++ b/kernel/include/arm_acle.h
@@ -0,0 +1,50 @@
+// Copyright 2018 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+
+#pragma once
+
+// Include arm_acle.h from the toolchain headers.
+#include_next <arm_acle.h>
+
+#include <stdint.h>
+
+#ifndef __clang__
+
+// GCC's arm_acle.h is missing implementations of the following ARM-standard APIs.
+// Thus they are provided here.
+
+#define __yield() __asm__ volatile("yield" ::: "memory")
+#define __dsb(mb) __asm__ volatile("dsb %0" :: "i"(mb) : "memory")
+#define __dmb(mb) __asm__ volatile("dmb %0" :: "i"(mb) : "memory")
+#define __isb(mb) __asm__ volatile("isb %0" :: "i"(mb) : "memory")
+
+#define __arm_rsr64(reg) \
+    ({                                                \
+        uint64_t _val;                                \
+        __asm__ volatile("mrs %0," reg : "=r"(_val)); \
+        _val;                                         \
+    })
+
+#define __arm_rsr(reg) \
+    ({                                                \
+        uint64_t _val;                                \
+        __asm__ volatile("mrs %0," reg : "=r"(_val)); \
+        (uint32_t)_val;                               \
+    })
+
+#define __arm_wsr64(reg, val) \
+    ({                                                    \
+        uint64_t _val = (val);                            \
+        __asm__ volatile("msr " reg ", %0" :: "r"(_val)); \
+    })
+
+#define __arm_wsr(reg, val) \
+    ({                                                    \
+        uint64_t _val = (val);                            \
+        __asm__ volatile("msr " reg ", %0" :: "r"(_val)); \
+    })
+
+#endif // !__clang__