// Copyright 2023 The Fuchsia Authors
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include <lib/boot-options/boot-options.h>
#include <lib/fit/defer.h>
#include <lib/fxt/serializer.h>
#include <lib/thread_sampler/per_cpu_state.h>
#include <lib/thread_sampler/thread_sampler.h>
#include <lib/zx/time.h>
#include <fbl/array.h>
#include <kernel/dpc.h>
#include <kernel/event.h>
#include <kernel/mp.h>
#include <kernel/spinlock.h>
#include <lk/init.h>
#include <object/io_buffer_dispatcher.h>
#include <object/process_dispatcher.h>
// Storage for the static gThreadSampler_ member: the kernel-held handle to the current sampler.
// Create() installs the write-side endpoint here; Start(), Stop(), AddThread() and SampleThread()
// look the sampler up through it.
KernelHandle<sampler::ThreadSamplerDispatcher> sampler::ThreadSamplerDispatcher::gThreadSampler_;
// Builds the two ThreadSamplerDispatcher endpoints that make up a sampling session.
//
// One private IOB region of `config.buffer_size` bytes is configured per cpu (as counted by
// percpu::processor_count()). Each region's VMO is then pinned and handed to the matching
// PerCpuState via SetUp().
//
// `write_handle` receives the Ep0 endpoint and `read_handle` the Ep1 endpoint. Both are assigned
// only at the very end, after every fallible step has succeeded.
//
// Returns ZX_ERR_NO_MEMORY if an allocation fails. Errors from CreateRegions(),
// PinnedVmObject::Create() and PerCpuState::SetUp() are returned to the caller.
zx::result<> sampler::ThreadSamplerDispatcher::CreateImpl(
const zx_sampler_config_t& config, KernelHandle<ThreadSamplerDispatcher>& read_handle,
KernelHandle<ThreadSamplerDispatcher>& write_handle) {
const size_t num_cpus = percpu::processor_count();
// A single AllocChecker is reused for every checked allocation below; each allocation is
// followed immediately by its own ac.check().
fbl::AllocChecker ac;
// Start by creating the buffer, a la IoBufferDispatcher::Create
auto holder0 = fbl::MakeRefCountedChecked<PeerHolder<IoBufferDispatcher>>(&ac);
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
// Second reference to the same PeerHolder, handed to the other endpoint below.
auto holder1 = holder0;
fbl::RefPtr<SharedIobState> shared_regions = fbl::AdoptRef(new (&ac) SharedIobState{
.regions = nullptr,
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
// Ep0 is the write side of the pair.
KernelHandle write_dispatcher{fbl::AdoptRef(
new (&ac) ThreadSamplerDispatcher(ktl::move(holder0), IobEndpointId::Ep0, shared_regions))};
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
// Ep1 is the read side of the pair.
KernelHandle read_dispatcher{fbl::AdoptRef(
new (&ac) ThreadSamplerDispatcher(ktl::move(holder1), IobEndpointId::Ep1, shared_regions))};
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
// One region description per cpu, all with the same size and no discipline.
IoBufferDispatcher::RegionArray configs{&ac, num_cpus};
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
for (size_t i = 0; i < configs.size(); i++) {
configs[i] =
zx_iob_region_t{.type = ZX_IOB_REGION_TYPE_PRIVATE,
.size = config.buffer_size,
.discipline = zx_iob_discipline_t{.type = ZX_IOB_DISCIPLINE_TYPE_NONE},
.private_region = zx_iob_region_private_t{
.options = 0,
// Take the shared state lock before creating the regions and publishing them in
// shared_regions.
Guard<CriticalMutex> guard{&shared_regions->state_lock};
zx::result<fbl::Array<IobRegionVariant>> regions = CreateRegions(
configs, write_dispatcher.dispatcher().get(), read_dispatcher.dispatcher().get());
if (regions.is_error()) {
return regions.take_error();
shared_regions->regions = ktl::move(*regions);
// In addition to the work done a la IoBufferDispatcher::Create, we also need to map and pin
// each of the buffers so that the kernel can safely write to them.
fbl::Array<PinnedVmObject> pinned_buffers = fbl::MakeArray<PinnedVmObject>(&ac, num_cpus);
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
// Allocate and pin each buffer so the kernel can write to it
for (unsigned i = 0; i < num_cpus; i++) {
fbl::RefPtr vmo = write_dispatcher.dispatcher()->GetVmo(i);
// The whole buffer is pinned: offset 0, config.buffer_size bytes.
const uint64_t vmo_offset = 0;
const size_t size = config.buffer_size;
// NOTE(review): the bare `true` is presumably the "write" flag of PinnedVmObject::Create --
// confirm against its declaration.
if (zx_status_t status =
PinnedVmObject::Create(vmo, vmo_offset, size, true, &pinned_buffers[i]);
status != ZX_OK) {
dprintf(INFO, "Failed to make pin: %d\n", status);
return zx::error(status);
write_dispatcher.dispatcher()->per_cpu_state_ =
fbl::MakeArray<internal::PerCpuState>(&ac, num_cpus);
if (!ac.check()) {
return zx::error(ZX_ERR_NO_MEMORY);
// Even though the buffer is per_cpu, we are fine to set up each cpu state here on a single cpu.
// When we start sampling, we call mp_sync_exec which will synchronize the written
// per_cpu_states.
for (unsigned i = 0; i < num_cpus; i++) {
// Ownership of each pinned buffer moves into its PerCpuState.
if (zx::result<> setup_result = write_dispatcher.dispatcher()->per_cpu_state_[i].SetUp(
config, ktl::move(pinned_buffers[i]));
setup_result.is_error()) {
return setup_result.take_error();
// Everything succeeded: hand both endpoints back to the caller.
read_handle = ktl::move(read_dispatcher);
write_handle = ktl::move(write_dispatcher);
return zx::ok();
// Moves the sampler from the Configured state to the Running state.
//
// Acquires the dispatcher lock itself (TA_EXCL), so callers must not already hold it.
// Returns ZX_ERR_BAD_STATE if the sampler is not currently Configured.
zx::result<> sampler::ThreadSamplerDispatcher::StartImpl() TA_EXCL(get_lock()) {
Guard<CriticalMutex> guard(get_lock());
if (state_ != SamplingState::Configured) {
return zx::error(ZX_ERR_BAD_STATE);
// Only the write-side endpoint (Ep0) is expected to be started.
DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
for (internal::PerCpuState& state : per_cpu_state_) {
// Capture-less trampoline: the dispatcher arrives as an opaque void* and is cast back so that
// SetCurrCpuTimer() can be invoked on it.
[](void* s) { reinterpret_cast<sampler::ThreadSamplerDispatcher*>(s)->SetCurrCpuTimer(); },
state_ = SamplingState::Running;
return zx::ok();
// Handles a stop request on the write-side endpoint.
//
// Acquires the dispatcher lock itself (TA_EXCL), so callers must not already hold it.
// Returns ZX_ERR_BAD_STATE if the sampler is not currently Running.
zx::result<> sampler::ThreadSamplerDispatcher::StopImpl() TA_EXCL(get_lock()) {
Guard<CriticalMutex> guard(get_lock());
// Only the write-side endpoint (Ep0) is expected to be stopped.
DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
if (state_ != SamplingState::Running) {
return zx::error(ZX_ERR_BAD_STATE);
return zx::ok();
// Stops a sampling session while the dispatcher lock is already held (TA_REQ).
//
// After walking the per-cpu states, waits until no PerCpuState reports a pending timer or a
// pending write. The wait is bounded by one 30 second deadline that is computed once and shared
// by every cpu. Once the wait is over the sampler is put back into the Configured state.
void sampler::ThreadSamplerDispatcher::StopLocked() TA_REQ(get_lock()) {
DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
for (internal::PerCpuState& state : per_cpu_state_) {
// Some timers may not have been able to be canceled, so we need to wait for any samples that
// have already started to finish.
zx_time_t deadline = zx_time_add_duration(current_time(), ZX_SEC(30));
for (const internal::PerCpuState& i : per_cpu_state_) {
bool pending_timers;
bool pending_writes;
do {
pending_timers = i.PendingTimer();
pending_writes = i.PendingWrites();
if (pending_timers || pending_writes) {
} while ((pending_writes || pending_timers) && (current_time() < deadline));
// We'll wait an unreasonable amount of time for the timers and writes to finish. If they
// really haven't finished by this point, something has gone terribly wrong.
//
// Both flags must be clear: the code below relies on there being no pending writes, so a
// single outstanding timer or write is already a failure. (Asserting with `||` would only
// trip when both were still pending.)
ZX_ASSERT(!pending_writes && !pending_timers);
// At this point, there are no longer pending writes. There may still be threads:
// 1) signaled to be sampled but haven't reached ProcessPendingSignals yet, or
// 2) are mid taking a sample but haven't yet reserved a PendingWrite
// For 1): Such threads will block on getting the dispatcher lock which we currently hold to
// read the state. When they acquire it, they will see that the session is no longer running and
// skip taking a sample.
// For 2): Threads will check the PerCpuState and see that writes are disabled and will skip
// writing the sample. While taking a sample, threads have taken an fbl::RefPtr to the sampling
// state so that the PerCpuStates are not at risk of being destroyed.
state_ = SamplingState::Configured;
// Takes one stack sample of the user thread identified by (`pid`, `tid`).
//
// The initial frame pointer and pc are read from `gregs`, interpreted according to `source`. The
// user frame-pointer chain is then followed with copy_from_user, collecting at most
// kMaxUserBacktraceSize addresses. The backtrace is written as an FXT large blob record into the
// PerCpuState of the cpu this runs on.
//
// Returns ZX_ERR_NOT_SUPPORTED if the sampler is not Running, if `sampler_koid` is not this
// dispatcher's koid, or if reading the user stack fails. ZX_ERR_BAD_STATE is returned when no
// usable pc was obtained from the registers.
zx::result<> sampler::ThreadSamplerDispatcher::SampleThreadImpl(zx_koid_t pid, zx_koid_t tid,
GeneralRegsSource source,
void* gregs,
uint64_t sampler_koid) {
DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
// We are going to attempt a usercopy below which might fault, so interrupts cannot be disabled.
// We need to be a little bit careful here because we could be racing with a Stop operation. The
// Stop operation:
// 1) Disables Writes
// 2) Cancels each Timer
// 3) Waits for all PendingWrites to finish
// It does this while holding the ThreadSamplerDispatcher lock. This means if we SetPendingWrite
// and then attempt to obtain the ThreadSamplerDispatcher lock, we could deadlock.
// Instead, we'll do a single enabled check here before attempting to read the stack, which will
// take some time. Once we've collected our data and are ready to write out, we'll
// SetPendingWrite to hold onto the buffers for the duration of the write.
// If we find that writes are enabled, we are safe to write to the buffers as
// Stop will not destroy them until we lower the PendingWrite bit.
// If we find that writes are disabled, we throw away our sample as it's no longer safe to write
// to the buffers.
if (State() != SamplingState::Running || sampler_koid != get_koid()) {
return zx::error(ZX_ERR_NOT_SUPPORTED);
// Number of valid entries in bt.
size_t frame_num = 0;
constexpr size_t kMaxUserBacktraceSize = 64;
// We're dropping 512 bytes on the kernel stack here and we need to be careful not to overflow it.
// This amount of bytes _should_ be safe because SampleThread is only called during
// Thread::Current::ProcessPendingSignals which occurs directly before returning to usermode. At
// this point, the stack will be shallow.
vaddr_t bt[kMaxUserBacktraceSize]{};
vaddr_t fp = 0;
vaddr_t pc = 0;
// Seed fp/pc from the saved registers; the register layout depends on both the register source
// and the target architecture.
switch (source) {
case GeneralRegsSource::None:
case GeneralRegsSource::Iframe:
#ifdef __x86_64__
fp = reinterpret_cast<iframe_t*>(gregs)->rbp;
pc = reinterpret_cast<iframe_t*>(gregs)->ip;
#ifdef __aarch64__
// On arm64 an entry derived from elr is recorded first, and the walk then continues from the
// link register.
bt[frame_num++] = (reinterpret_cast<iframe_t*>(gregs)->elr) - 4;
fp = reinterpret_cast<iframe_t*>(gregs)->r[29];
pc = (reinterpret_cast<iframe_t*>(gregs)->lr) - 4;
#ifdef __riscv
fp = reinterpret_cast<iframe_t*>(gregs)->regs.s0;
pc = reinterpret_cast<iframe_t*>(gregs)->regs.pc;
#ifdef __x86_64__
case GeneralRegsSource::Syscall:
fp = reinterpret_cast<syscall_regs_t*>(gregs)->rbp;
pc = reinterpret_cast<syscall_regs_t*>(gregs)->rip;
if (pc == 0) {
return zx::error(ZX_ERR_BAD_STATE);
bt[frame_num++] = pc;
// Follow the frame-pointer chain until the backtrace array is full.
while (frame_num < kMaxUserBacktraceSize) {
vaddr_t actual_fp = fp;
if (fp == 0) {
// We've reached the top of the frame pointer chain.
// RISC-V has a nonstandard frame pointer which points to the CFA instead of
// the previous frame pointer. Since the frame pointer and return address are
// always just below the CFA, subtract 16 bytes to get to the actual frame pointer.
#if __riscv
actual_fp -= 16;
// The saved frame pointer is read from actual_fp and the return address from actual_fp + 8.
user_in_ptr<const vaddr_t> user_next_fp{reinterpret_cast<vaddr_t*>(actual_fp)};
user_in_ptr<const vaddr_t> user_pc{reinterpret_cast<vaddr_t*>(actual_fp + 8)};
// A well formed frame pointer chain ends in 0 and should never fail to copy. If a thread's
// stack is not readable or well formatted, we return an error to indicate that sampling should
// be disabled for the offending thread.
zx_status_t copy_res = user_pc.copy_from_user(&pc);
if (copy_res != ZX_OK) {
// We eat the copy_res and return ZX_ERR_NOT_SUPPORTED here and below to indicate that we
// failed to take a sample, but we might still succeed in the future. A thread may not
// necessarily have valid frame pointers at all points in execution, so don't give up on this
// thread just yet.
return zx::error(ZX_ERR_NOT_SUPPORTED);
if (pc == 0) {
bt[frame_num++] = pc;
copy_res = user_next_fp.copy_from_user(&fp);
if (copy_res != ZX_OK) {
return zx::error(ZX_ERR_NOT_SUPPORTED);
// The sample is ready; reserve the current cpu's buffer for the duration of the write.
internal::PerCpuState& cpu_state = GetPerCpuState(arch_curr_cpu_num());
bool enabled = cpu_state.SetPendingWrite();
if (!enabled) {
// Even though we didn't successfully write a sample, we return a success result -- we should
// still try to sample the thread as it may later be scheduled on a different cpu.
return zx::ok();
// Clears the pending-write reservation on every exit path from here on.
auto d = fit::defer([&cpu_state]() { cpu_state.ResetPendingWrite(); });
constexpr fxt::StringRef<fxt::RefType::kId> empty_string{0};
const fxt::ThreadRef current_thread{pid, tid};
// NOTE(review): the byte count uses sizeof(uint64_t) while bt holds vaddr_t; the two agree only
// where vaddr_t is 64 bits wide -- confirm for every supported target.
zx_status_t write_result =
fxt::WriteLargeBlobRecordWithMetadata(&cpu_state, current_ticks(), empty_string, empty_string,
current_thread, bt, sizeof(uint64_t) * frame_num);
if (write_result != ZX_OK) {
dprintf(INFO, "Buffer full, disabling writes on cpu: %u\n", arch_curr_cpu_num());
return zx::ok();
// Runs on the kernel-held write endpoint (Ep0) when the peer endpoint has no handles left.
// Leaves the sampler in the Destroyed state, which is the state Create() checks for before
// allowing a new sampler to replace this one.
void sampler::ThreadSamplerDispatcher::OnPeerZeroHandlesLocked() {
DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
// We purposely don't emit a call to IoBufferDispatcher::OnPeerZeroHandlesLocked() here. It's used
// to coordinate and delay ZX_IOB_PEER_CLOSED until any mapped regions have been unmapped. We
// don't need the logic here. Userspace will never see a ZX_IOB_PEER_CLOSED as we will not close
// the endpoint the kernel holds until after userspace closes the last handle to their endpoint.
// When that happens, we end up here and are going to destroy our state anyways.
// The userspace end of the iobuffer has closed. Time to clean up our state
if (state_ == SamplingState::Running) {
// After StopLocked, we have prevented further threads from accessing the per_cpu_states, and then
// waited for any threads that were accessing the states to finish.
// It's now safe to destroy our cpu states. This will destroy the mappings and pinnings that the
// kernel keeps to write to, but if userspace has their own mappings, those will continue to
// remain valid.
state_ = SamplingState::Destroyed;
// Invoked on the dispatcher through the void* trampoline lambda in StartImpl(). The name suggests
// it arms the sampling timer for the cpu it is called on.
// NOTE(review): the body is not visible in this view -- confirm before relying on this summary.
void sampler::ThreadSamplerDispatcher::SetCurrCpuTimer() {
// Creates a new thread sampler and installs it as the global sampler.
//
// The write endpoint is stored in gThreadSampler_; the read endpoint is returned to the caller.
//
// Returns ZX_ERR_ALREADY_EXISTS if a global sampler exists and is not in the Destroyed state. This
// is checked both before the endpoints are built and again just before installing them. Errors
// from CreateImpl() are returned unchanged.
zx::result<KernelHandle<sampler::ThreadSamplerDispatcher>> sampler::ThreadSamplerDispatcher::Create(
const zx_sampler_config_t& config) {
// Cheap early rejection before doing any allocation work.
Guard<Mutex> guard(ThreadSamplerLock::Get());
if (gThreadSampler_.dispatcher() != nullptr &&
gThreadSampler_.dispatcher()->State() !=
sampler::ThreadSamplerDispatcher::SamplingState::Destroyed) {
return zx::error(ZX_ERR_ALREADY_EXISTS);
KernelHandle<sampler::ThreadSamplerDispatcher> write_handle;
KernelHandle<sampler::ThreadSamplerDispatcher> read_handle;
zx::result res = sampler::ThreadSamplerDispatcher::CreateImpl(config, read_handle, write_handle);
if (res.is_error()) {
return res.take_error();
Guard<Mutex> guard(ThreadSamplerLock::Get());
// Ensure that someone hasn't created a new sampler since we created ours
if ((gThreadSampler_.dispatcher() != nullptr &&
gThreadSampler_.dispatcher()->State() !=
sampler::ThreadSamplerDispatcher::SamplingState::Destroyed)) {
return zx::error(ZX_ERR_ALREADY_EXISTS);
// The kernel keeps the write side; the caller gets the read side.
gThreadSampler_ = ktl::move(write_handle);
return zx::ok(ktl::move(read_handle));
// Stops the global sampler on behalf of the holder of `disp`.
//
// Returns ZX_ERR_BAD_STATE if there is no global sampler, and ZX_ERR_BAD_HANDLE if `disp`'s koid
// is not the related koid of the global sampler. Otherwise returns the result of StopImpl().
// The global ThreadSamplerLock is held for the duration of the call.
zx::result<> sampler::ThreadSamplerDispatcher::Stop(const fbl::RefPtr<IoBufferDispatcher>& disp) {
Guard<Mutex> guard(ThreadSamplerLock::Get());
if (gThreadSampler_.dispatcher() == nullptr) {
return zx::error(ZX_ERR_BAD_STATE);
// Only the endpoint paired with the kernel-held sampler may control it.
if (disp->get_koid() != gThreadSampler_.dispatcher()->get_related_koid()) {
return zx::error(ZX_ERR_BAD_HANDLE);
return gThreadSampler_.dispatcher()->StopImpl();
// Starts the global sampler on behalf of the holder of `disp`.
//
// Returns ZX_ERR_BAD_STATE if there is no global sampler, and ZX_ERR_BAD_HANDLE if `disp`'s koid
// is not the related koid of the global sampler. Otherwise returns the result of StartImpl().
// The global ThreadSamplerLock is held for the duration of the call.
zx::result<> sampler::ThreadSamplerDispatcher::Start(const fbl::RefPtr<IoBufferDispatcher>& disp) {
Guard<Mutex> guard(ThreadSamplerLock::Get());
if (gThreadSampler_.dispatcher() == nullptr) {
return zx::error(ZX_ERR_BAD_STATE);
// Only the endpoint paired with the kernel-held sampler may control it.
if (disp->get_koid() != gThreadSampler_.dispatcher()->get_related_koid()) {
return zx::error(ZX_ERR_BAD_HANDLE);
return gThreadSampler_.dispatcher()->StartImpl();
// Enables stack sampling on `thread`, tagging it with the koid of the global sampler.
//
// Returns ZX_ERR_BAD_STATE if there is no global sampler, and ZX_ERR_BAD_HANDLE if `disp`'s koid
// is not the related koid of the global sampler. Otherwise the status returned by
// ThreadDispatcher::EnableStackSampling() is converted into the result.
zx::result<> sampler::ThreadSamplerDispatcher::AddThread(
const fbl::RefPtr<IoBufferDispatcher>& disp, const fbl::RefPtr<ThreadDispatcher>& thread) {
Guard<Mutex> guard(ThreadSamplerLock::Get());
if (gThreadSampler_.dispatcher() == nullptr) {
return zx::error(ZX_ERR_BAD_STATE);
// Only the endpoint paired with the kernel-held sampler may control it.
if (disp->get_koid() != gThreadSampler_.dispatcher()->get_related_koid()) {
return zx::error(ZX_ERR_BAD_HANDLE);
return zx::make_result(thread->EnableStackSampling(gThreadSampler_.dispatcher()->get_koid()));
// Static entry point for sampling a thread: looks up the global sampler and forwards the request
// to its SampleThreadImpl().
//
// Returns ZX_ERR_NOT_SUPPORTED if `sampler_koid` is ZX_KOID_INVALID, and ZX_ERR_BAD_STATE if there
// is no global sampler or it is not Running. Otherwise returns whatever SampleThreadImpl() returns.
zx::result<> sampler::ThreadSamplerDispatcher::SampleThread(zx_koid_t pid, zx_koid_t tid,
GeneralRegsSource source, void* gregs,
uint64_t sampler_koid) {
if (sampler_koid == ZX_KOID_INVALID) {
// Whatever thread this sampler_koid came from doesn't have sampling enabled.
return zx::error(ZX_ERR_NOT_SUPPORTED);
// Local reference used to call into the sampler below.
fbl::RefPtr<sampler::ThreadSamplerDispatcher> sampler_ref;
// We hold the global ThreadSamplerLock only long enough to increase the reference count on
// the current sampler if it exists.
// The sampling is relatively slow and we don't want to prevent other threads on a different
// core from making progress.
// Once we release the ThreadSamplerLock, it may be that the global sampler is replaced with a
// new one, however the ThreadSamplerDispatcher maintains enough state that the SampleThread
// and SetCurrCpuTimers will simply short circuit and return early if that is the case.
Guard<Mutex> guard(ThreadSamplerLock::Get());
if (gThreadSampler_.dispatcher() == nullptr ||
gThreadSampler_.dispatcher()->State() !=
sampler::ThreadSamplerDispatcher::SamplingState::Running) {
return zx::error(ZX_ERR_BAD_STATE);
sampler_ref = gThreadSampler_.dispatcher();
return sampler_ref->SampleThreadImpl(pid, tid, source, gregs, sampler_koid);