blob: 83306e250e1ec7906e350dd59601f307213081f6 [file] [log] [blame]
// Copyright 2016 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
#include <lib/affine/ratio.h>
#include <lib/boot-options/boot-options.h>
#include <lib/userabi/vdso-constants.h>
#include <lib/userabi/vdso.h>
#include <lib/version.h>
#include <platform.h>
#include <zircon/types.h>
#include <arch/quirks.h>
#include <fbl/alloc_checker.h>
#include <kernel/mp.h>
#include <ktl/array.h>
#include <object/handle.h>
#include <vm/pmm.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
#include <vm/vm_object.h>
#include "sysret-offsets.h"
#include "vdso-code.h"
#include <ktl/enforce.h>
// This is defined in assembly via RODSO_IMAGE (see rodso-asm.h);
// vdso-code.h gives details about the image's size and layout.
extern "C" const char vdso_image[];
namespace {
class VDsoMutator {
explicit VDsoMutator(const fbl::RefPtr<VmObject>& vmo) : vmo_(vmo) {}
void RedirectSymbol(size_t idx1, size_t idx2, uintptr_t value) {
auto [sym1, sym2] = ReadSymbol(idx1, idx2);
// Just change the st_value of the symbol.
sym1.value = sym2.value = value;
WriteSymbol(idx1, sym1);
WriteSymbol(idx2, sym2);
void BlockSymbol(size_t idx1, size_t idx2) {
auto [sym1, sym2] = ReadSymbol(idx1, idx2);
// First change the symbol to have local binding so it can't be resolved.
// The high nybble is the STB_* bits; STB_LOCAL is 0. &= 0xf; &= 0xf;
WriteSymbol(idx1, sym1);
WriteSymbol(idx2, sym2);
// Now fill the code region (a whole function) with safely invalid code.
// This code should never be run, and any attempt to use it should crash.
ASSERT(sym1.value >= VDSO_CODE_START);
ASSERT(sym1.value + sym1.size < VDSO_CODE_END);
zx_status_t status = vmo_->Write(GetTrapFill(sym1.size), sym1.value, sym1.size);
ASSERT_MSG(status == ZX_OK, "vDSO VMO Write failed: %d", status);
void PatchVmcall(uint64_t offset) {
// Not yet implemented for ARM64.
#if ARCH_X86
uint8_t bytes[sizeof(kSyscallInsn)] = {};
const uint64_t syscall_offset = offset - /*`syscall` instruction size*/ 2;
// Validate the syscall and nop instructions.
zx_status_t status = vmo_->Read(bytes, syscall_offset, sizeof(bytes));
ASSERT_MSG(status == ZX_OK, "vDSO VMO Read failed: %d", status);
ASSERT_MSG(memcmp(kSyscallInsn, bytes, sizeof(bytes)) == 0,
"vDSO VMO %#lx does not contain syscall", offset);
// Then write the vmcall instruction.
status = vmo_->Write(kVmcallInsn, syscall_offset, sizeof(kVmcallInsn));
ASSERT_MSG(status == ZX_OK, "vDSO VMO Write failed: %d", status);
struct ElfSym {
uintptr_t info, value, size;
#if ARCH_X86
// Fill with the single-byte HLT instruction, so any place
// user-mode jumps into this code, it gets a trap.
using Insn = uint8_t;
static constexpr Insn kTrapFill = 0xf4; // hlt
static constexpr Insn kSyscallInsn[] = {/*syscall*/ 0x0f, 0x05, /*nop*/ 0x90};
static constexpr Insn kVmcallInsn[] = {0x0f, 0x01, 0xc1};
#elif ARCH_ARM64
// Fixed-size instructions. Use 'brk #1' (what __builtin_trap() emits).
using Insn = uint32_t;
static constexpr Insn kTrapFill = 0xd4200020;
#error what architecture?
void* GetTrapFill(size_t fill_size) {
ASSERT(fill_size % sizeof(Insn) == 0);
fill_size /= sizeof(Insn);
if (fill_size > trap_fill_size_) {
fbl::AllocChecker ac;
trap_fill_.reset(new (&ac) Insn[fill_size]);
trap_fill_size_ = fill_size;
for (size_t i = 0; i < fill_size; ++i) {
trap_fill_[i] = kTrapFill;
return trap_fill_.get();
uintptr_t SymtabAddress(size_t idx) {
return VDSO_DATA_START_dynsym + (idx * sizeof(ElfSym));
ElfSym ReadSymbol(size_t idx) {
ElfSym sym;
zx_status_t status = vmo_->Read(&sym, SymtabAddress(idx), sizeof(sym));
ASSERT_MSG(status == ZX_OK, "vDSO VMO Read failed: %d", status);
return sym;
ktl::array<ElfSym, 2> ReadSymbol(size_t idx1, size_t idx2) {
ElfSym sym1 = ReadSymbol(idx1);
ElfSym sym2 = ReadSymbol(idx2);
ASSERT_MSG(sym1.value == sym2.value, "dynsym %zu vs %zu value %#lx vs %#lx", idx2, idx2,
sym1.value, sym2.value);
ASSERT_MSG(sym1.size == sym2.size, "dynsym %zu vs %zu size %#lx vs %#lx", idx2, idx2, sym1.size,
return {sym1, sym2};
void WriteSymbol(size_t idx, const ElfSym& sym) {
zx_status_t status = vmo_->Write(&sym, SymtabAddress(idx), sizeof(sym));
ASSERT_MSG(status == ZX_OK, "vDSO VMO Write failed: %d", status);
const fbl::RefPtr<VmObject>& vmo_;
ktl::unique_ptr<Insn[]> trap_fill_;
size_t trap_fill_size_ = 0;
#define PASTE(a, b, c) PASTE_1(a, b, c)
#define PASTE_1(a, b, c) a##b##c
#define REDIRECT_SYSCALL(mutator, symbol, target) \
mutator.RedirectSymbol(PASTE(VDSO_DYNSYM_, symbol, ), PASTE(VDSO_DYNSYM__, symbol, ), \
PASTE(VDSO_CODE_, target, ))
// Block the named zx_* function. The symbol table entry will
// become invisible to runtime symbol resolution, and the code of
// the function will be clobbered with trapping instructions.
#define BLOCK_SYSCALL(mutator, symbol) \
mutator.BlockSymbol(PASTE(VDSO_DYNSYM_, symbol, ), PASTE(VDSO_DYNSYM__, symbol, ))
// Random attributes in kazoo fidl files become "categories" of syscalls.
// For each category, define a function block_<category> to block all the
// syscalls in that category. These functions can be used in
// VDso::CreateVariant (below) to block a category of syscalls for a particular
// variant vDSO.
#define SYSCALL_CATEGORY_BEGIN(category) \
[[maybe_unused]] void block_##category##_syscalls(VDsoMutator& mutator) {
#define SYSCALL_IN_CATEGORY(syscall) BLOCK_SYSCALL(mutator, zx_##syscall);
#define SYSCALL_CATEGORY_END(category) }
#include <lib/syscalls/>
void PatchVmcall(VDsoMutator& mutator) {
for (auto offset : kSysretOffsets) {
// This is extracted from the vDSO image at build time.
using VdsoBuildIdNote = ktl::array<uint8_t, VDSO_BUILD_ID_NOTE_SIZE>;
constexpr VdsoBuildIdNote kVdsoBuildIdNote = VDSO_BUILD_ID_NOTE_BYTES;
// That should exactly match the note read from the vDSO image at runtime.
void CheckBuildId(const fbl::RefPtr<VmObject>& vmo) {
VdsoBuildIdNote note;
zx_status_t status = vmo->Read(&note, VDSO_BUILD_ID_NOTE_ADDRESS, sizeof(note));
ASSERT_MSG(status == ZX_OK, "vDSO VMO Read failed: %d", status);
ASSERT(note == kVdsoBuildIdNote);
// Fill out the contents of the vdso_constants struct.
void SetConstants(const fbl::RefPtr<VmObject>& vmo) {
zx_ticks_t per_second = ticks_per_second();
// Grab a copy of the ticks to mono ratio; we need this to initialize the
// constants window.
affine::Ratio ticks_to_mono_ratio = platform_get_ticks_to_time_ratio();
// At this point in time, we absolutely must know the rate that our tick
// counter is ticking at. If we don't, then something has gone horribly
// wrong.
ASSERT(per_second != 0);
ASSERT(ticks_to_mono_ratio.numerator() != 0);
ASSERT(ticks_to_mono_ratio.denominator() != 0);
// Initialize the constants that should be visible to the vDSO.
// Rather than assigning each member individually, do this with
// struct assignment and a compound literal so that the compiler
// can warn if the initializer list omits any member.
vdso_constants constants = {
0, // Padding.
ASSERT(constants.version_string_len < sizeof(constants.version_string));
memcpy(constants.version_string, version_string(), constants.version_string_len);
static_assert(sizeof(constants) == VDSO_DATA_CONSTANTS_SIZE, " is suspect");
zx_status_t status = vmo->Write(&constants, VDSO_DATA_CONSTANTS, sizeof(constants));
ASSERT_MSG(status == ZX_OK, "vDSO VMO Write failed: %d", status);
// Conditionally patch some of the entry points related to time based on
// platform details which get determined at runtime.
void PatchTimeSyscalls(VDsoMutator mutator) {
// If user mode cannot access the tick counter registers, or kernel command
// line arguments demand that we access the tick counter via a syscall
// instead of direct observation, then we need to make sure to redirect
// symbol in the vDSO such that we always syscall in order to query ticks.
// Since this can effect how clock monotonic is calculated as well, we may
// need to redirect zx_clock_get_monotonic as well.
const bool need_syscall_for_ticks =
!platform_usermode_can_access_tick_registers() || gBootOptions->vdso_ticks_get_force_syscall;
if (need_syscall_for_ticks) {
REDIRECT_SYSCALL(mutator, zx_ticks_get, SYSCALL_zx_ticks_get_via_kernel);
#if ARCH_ARM64
else {
// Wait for a _really_ long time for all of the CPUs to have started up, so
// we know whether or not to deploy the ARM A73 timer read mitigation or
// not. If we timeout from this, then something is extremely wrong. In
// that situation, go ahead and install the mitigation anyway. It is slower,
// but at least it will read correctly on all cores.
// see arch/quirks.h for details about the quirk itself.
zx_status_t status = mp_wait_for_all_cpus_started(Deadline::after(ZX_SEC(30)));
if ((status != ZX_OK) || arch_quirks_needs_arm_erratum_858921_mitigation()) {
if (status != ZX_OK) {
"WARNING: Timed out waiting for all CPUs to start. Installing A73 quirks for "
"zx_ticks_get in VDSO as a defensive measure.\n");
} else {
dprintf(INFO, "Installing A73 quirks for zx_ticks_get in VDSO\n");
REDIRECT_SYSCALL(mutator, zx_ticks_get, ticks_get_arm_a73);
if (gBootOptions->vdso_clock_get_monotonic_force_syscall) {
// Force a syscall for zx_clock_get_monotonic if instructed to do so by the
// kernel command line arguments. Make sure to swap out the implementation
// of zx_deadline_after as well.
REDIRECT_SYSCALL(mutator, zx_clock_get_monotonic, SYSCALL_zx_clock_get_monotonic_via_kernel);
REDIRECT_SYSCALL(mutator, zx_deadline_after, deadline_after_via_kernel_mono);
} else if (need_syscall_for_ticks) {
// If ticks must be accessed via syscall, then choose the alternate form
// for clock_get_monotonic which performs the scaling in user mode, but
// thunks into the kernel to read the ticks register.
REDIRECT_SYSCALL(mutator, zx_clock_get_monotonic, clock_get_monotonic_via_kernel_ticks);
REDIRECT_SYSCALL(mutator, zx_deadline_after, deadline_after_via_kernel_ticks);
} // anonymous namespace
const VDso* VDso::instance_ = NULL;
// Private constructor, can only be called by Create (below).
VDso::VDso(KernelHandle<VmObjectDispatcher>* vmo_kernel_handle)
: RoDso("vdso/next", vdso_image, VDSO_CODE_END, VDSO_CODE_START, vmo_kernel_handle) {}
// This is called exactly once, at boot time.
const VDso* VDso::Create(KernelHandle<VmObjectDispatcher>* vmo_kernel_handles) {
fbl::AllocChecker ac;
VDso* vdso = new (&ac) VDso(&vmo_kernel_handles[variant_index(Variant::NEXT)]);
// Sanity-check that it's the exact vDSO image the kernel was compiled for.
// Fill out the contents of the vdso_constants struct.
DEBUG_ASSERT(!(vdso->vmo_rights() & ZX_RIGHT_WRITE));
for (size_t v = static_cast<size_t>(Variant::STABLE); v < static_cast<size_t>(Variant::COUNT);
vdso->CreateVariant(static_cast<Variant>(v), &vmo_kernel_handles[v]);
instance_ = vdso;
return instance_;
uintptr_t VDso::base_address(const fbl::RefPtr<VmMapping>& code_mapping) {
return code_mapping ? code_mapping->base() - VDSO_CODE_START : 0;
// Each vDSO variant VMO is made via a COW clone of the next vDSO
// VMO. A variant can block some system calls, by syscall category.
// This works by modifying the symbol table entries to make the symbols
// invisible to dynamic linking (STB_LOCAL) and then clobbering the code
// with trapping instructions. In this way, all the code locations are the
// same across variants and the syscall entry enforcement doesn't have to
// care which variant is in use. The places where the blocked
// syscalls' syscall entry instructions would be no longer have the syscall
// instructions, so a process using the variant can never get into syscall
// entry with that PC value and hence can never pass the vDSO enforcement
// test.
void VDso::CreateVariant(Variant variant, KernelHandle<VmObjectDispatcher>* vmo_kernel_handle) {
DEBUG_ASSERT(variant >= Variant::STABLE);
DEBUG_ASSERT(variant < Variant::COUNT);
if (variant == Variant::NEXT) {
// The next variant already has a VMO.
variant_vmo_[variant_index(variant)] = vmo_kernel_handle->dispatcher();
fbl::RefPtr<VmObject> new_vmo;
zx_status_t status = vmo()->CreateChild(ZX_VMO_CHILD_SNAPSHOT, 0, size(), false, &new_vmo);
ASSERT(status == ZX_OK);
VDsoMutator mutator{new_vmo};
const char* name = nullptr;
switch (variant) {
case Variant::STABLE:
name = "vdso/stable";
case Variant::DIRECT:
name = "vdso/direct";
case Variant::TEST1:
name = "vdso/test1";
case Variant::TEST2:
name = "vdso/test2";
// No default case so the compiler will warn about new enum entries.
case Variant::NEXT:
case Variant::COUNT:
PANIC("VDso::CreateVariant called with bad variant");
zx_rights_t rights;
status = VmObjectDispatcher::Create(ktl::move(new_vmo), size(),
vmo_kernel_handle, &rights);
ASSERT(status == ZX_OK);
status = vmo_kernel_handle->dispatcher()->set_name(name, strlen(name));
ASSERT(status == ZX_OK);
variant_vmo_[variant_index(variant)] = vmo_kernel_handle->dispatcher();