// Copyright 2023 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <lib/boot-options/boot-options.h>
#include <lib/fit/defer.h>
#include <lib/fxt/serializer.h>
#include <lib/thread_sampler/per_cpu_state.h>
#include <lib/thread_sampler/thread_sampler.h>
#include <lib/zx/time.h>

#include <fbl/array.h>
#include <kernel/dpc.h>
#include <kernel/event.h>
#include <kernel/mp.h>
#include <kernel/spinlock.h>
#include <lk/init.h>
#include <object/io_buffer_dispatcher.h>
#include <object/process_dispatcher.h>

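// The singleton sampler state. The kernel retains the write endpoint (Ep0) of the IOBuffer pair;
// userspace receives the read endpoint (Ep1) from Create().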
KernelHandle<sampler::ThreadSamplerDispatcher> sampler::ThreadSamplerDispatcher::gThreadSampler_;

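// Creates the paired read/write IOBuffer endpoints backed by one private region per CPU, pins
// each region so the kernel can write samples into it, and initializes the per-CPU sampling
// state.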
zx::result<> sampler::ThreadSamplerDispatcher::CreateImpl(
    const zx_sampler_config_t& config, KernelHandle<ThreadSamplerDispatcher>& read_handle,
    KernelHandle<ThreadSamplerDispatcher>& write_handle) {
  const size_t num_cpus = percpu::processor_count();

  fbl::AllocChecker ac;
  // Start by creating the buffer, a la IoBufferDispatcher::Create
  auto holder0 = fbl::MakeRefCountedChecked<PeerHolder<IoBufferDispatcher>>(&ac);
  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }
  auto holder1 = holder0;

  fbl::RefPtr<SharedIobState> shared_regions = fbl::AdoptRef(new (&ac) SharedIobState{
      .regions = nullptr,
  });

  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }

  KernelHandle write_dispatcher{fbl::AdoptRef(
      new (&ac) ThreadSamplerDispatcher(ktl::move(holder0), IobEndpointId::Ep0, shared_regions))};
  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }

  KernelHandle read_dispatcher{fbl::AdoptRef(
      new (&ac) ThreadSamplerDispatcher(ktl::move(holder1), IobEndpointId::Ep1, shared_regions))};
  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }

  IoBufferDispatcher::RegionArray configs{&ac, num_cpus};
  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }
  for (size_t i = 0; i < configs.size(); i++) {
    configs[i] =
        zx_iob_region_t{.type = ZX_IOB_REGION_TYPE_PRIVATE,
                        .access = ZX_IOB_ACCESS_EP0_CAN_MAP_READ | ZX_IOB_ACCESS_EP0_CAN_MAP_WRITE |
                                  ZX_IOB_ACCESS_EP1_CAN_MAP_READ,
                        .size = config.buffer_size,
                        .discipline = zx_iob_discipline_t{.type = ZX_IOB_DISCIPLINE_TYPE_NONE},
                        .private_region = zx_iob_region_private_t{
                            .options = 0,
                        }};
  }

  {
    Guard<CriticalMutex> guard{&shared_regions->state_lock};

    zx::result<fbl::Array<IobRegionVariant>> regions = CreateRegions(
        configs, write_dispatcher.dispatcher().get(), read_dispatcher.dispatcher().get());
    if (regions.is_error()) {
      return regions.take_error();
    }

    shared_regions->regions = ktl::move(*regions);
  }

  read_dispatcher.dispatcher()->InitPeer(write_dispatcher.dispatcher());
  write_dispatcher.dispatcher()->InitPeer(read_dispatcher.dispatcher());

  // In addition to the work done a la IoBufferDispatcher::Create, we also need to map and pin
  // each of the buffers so that the kernel can safely write to them.
  fbl::Array<PinnedVmObject> pinned_buffers = fbl::MakeArray<PinnedVmObject>(&ac, num_cpus);
  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }

  // Allocate and pin each buffer so the kernel can write to it
  for (unsigned i = 0; i < num_cpus; i++) {
    fbl::RefPtr vmo = write_dispatcher.dispatcher()->GetVmo(i);
    const uint64_t vmo_offset = 0;
    const size_t size = config.buffer_size;
    if (zx_status_t status =
            PinnedVmObject::Create(vmo, vmo_offset, size, true, &pinned_buffers[i]);
        status != ZX_OK) {
      dprintf(INFO, "Failed to make pin: %d\n", status);
      return zx::error(status);
    }
  }

  write_dispatcher.dispatcher()->per_cpu_state_ =
      fbl::MakeArray<internal::PerCpuState>(&ac, num_cpus);
  if (!ac.check()) {
    return zx::error(ZX_ERR_NO_MEMORY);
  }

  // Even though the buffers are per-cpu, it's fine to set up every cpu's state here on a single
  // cpu. When we start sampling, we call mp_sync_exec, which synchronizes the written
  // per_cpu_states.
  for (unsigned i = 0; i < num_cpus; i++) {
    if (zx::result<> setup_result = write_dispatcher.dispatcher()->per_cpu_state_[i].SetUp(
            config, ktl::move(pinned_buffers[i]));
        setup_result.is_error()) {
      return setup_result.take_error();
    }
  }

  read_handle = ktl::move(read_dispatcher);
  write_handle = ktl::move(write_dispatcher);
  return zx::ok();
}

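// Enables writes on each per-CPU state and then arms every CPU's sampling timer via mp_sync_exec.
// Fails if the sampler is not in the Configured state.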
zx::result<> sampler::ThreadSamplerDispatcher::StartImpl() TA_EXCL(get_lock()) {
  Guard<CriticalMutex> guard(get_lock());
  if (state_ != SamplingState::Configured) {
    return zx::error(ZX_ERR_BAD_STATE);
  }

  DEBUG_ASSERT(!per_cpu_state_.empty());
  DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
  for (internal::PerCpuState& state : per_cpu_state_) {
    state.EnableWrites();
  }

  mp_sync_exec(
      MP_IPI_TARGET_ALL, 0,
      [](void* s) { reinterpret_cast<sampler::ThreadSamplerDispatcher*>(s)->SetCurrCpuTimer(); },
      this);
  state_ = SamplingState::Running;
  return zx::ok();
}

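// Stops a running sampling session. Fails with ZX_ERR_BAD_STATE if sampling is not running.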
zx::result<> sampler::ThreadSamplerDispatcher::StopImpl() TA_EXCL(get_lock()) {
  Guard<CriticalMutex> guard(get_lock());
  DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
  if (state_ != SamplingState::Running) {
    return zx::error(ZX_ERR_BAD_STATE);
  }
  StopLocked();
  return zx::ok();
}

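// Disables writes, cancels the per-CPU timers, and waits for any in-flight samples to drain
// before returning the sampler to the Configured state.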
void sampler::ThreadSamplerDispatcher::StopLocked() TA_REQ(get_lock()) {
  DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);

  for (internal::PerCpuState& state : per_cpu_state_) {
    state.DisableWrites();
    state.CancelTimer();
  }

  // Some timers may not have been able to be canceled, so we need to wait for any samples that
  // have already started to finish.
  zx_time_t deadline = zx_time_add_duration(current_time(), ZX_SEC(30));
  for (const internal::PerCpuState& i : per_cpu_state_) {
    bool pending_timers;
    bool pending_writes;
    do {
      pending_timers = i.PendingTimer();
      pending_writes = i.PendingWrites();
      if (pending_timers || pending_writes) {
        Thread::Current::SleepRelative(ZX_MSEC(1));
      }
    } while ((pending_writes || pending_timers) && (current_time() < deadline));
    // We'll wait an unreasonably long time for the timers to finish. If they really haven't
    // finished by this point, something has gone terribly wrong.
    ZX_ASSERT(!pending_writes && !pending_timers);
  }

  // At this point, there are no longer pending writes. There may still be threads that:
  //
  // 1) have been signaled to be sampled but haven't reached ProcessPendingSignals yet, or
  // 2) are mid-sample but haven't yet reserved a PendingWrite.
  //
  // For 1): Such threads will block on getting the dispatcher lock, which we currently hold, to
  // read the state. When they acquire it, they will see that the session is no longer running and
  // skip taking a sample.
  //
  // For 2): Threads will check the PerCpuState, see that writes are disabled, and skip writing
  // the sample. While taking a sample, threads hold an fbl::RefPtr to the sampling state so that
  // the PerCpuStates are not at risk of being destroyed.
  state_ = SamplingState::Configured;
}

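// Collects a user-mode backtrace for the given thread by walking its frame-pointer chain and
// writes it into the current CPU's buffer as an FXT large-blob record.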
zx::result<> sampler::ThreadSamplerDispatcher::SampleThreadImpl(zx_koid_t pid, zx_koid_t tid,
                                                                GeneralRegsSource source,
                                                                void* gregs,
                                                                uint64_t sampler_koid) {
  DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);
  // We are going to attempt a usercopy below which might fault, so interrupts cannot be disabled.
  DEBUG_ASSERT(!arch_ints_disabled());
  // We need to be a little bit careful here because we could be racing with a Stop operation. The
  // Stop operation:
  //
  // 1) Disables Writes
  // 2) Cancels each Timer
  // 3) Waits for all PendingWrites to finish
  //
  // It does this while holding the ThreadSamplerDispatcher lock. This means that if we
  // SetPendingWrite and then attempt to obtain the ThreadSamplerDispatcher lock, we could
  // deadlock.
  //
  // Instead, we'll do a single enabled check here before attempting to read the stack, which will
  // take some time. Once we've collected our data and are ready to write out, we'll
  // SetPendingWrite to hold onto the buffers for the duration of the write.
  //
  // If we find that writes are enabled, we are safe to write to the buffers as
  // Stop will not destroy them until we lower the PendingWrite bit.
  //
  // If we find that writes are disabled, we throw away our sample as it's no longer safe to write
  // to the buffers.
  if (State() != SamplingState::Running || sampler_koid != get_koid()) {
    return zx::error(ZX_ERR_NOT_SUPPORTED);
  }

  size_t frame_num = 0;
  constexpr size_t kMaxUserBacktraceSize = 64;
  // We're dropping 512 bytes on the kernel stack here, so we need to be careful not to overflow
  // it.
  //
  // This many bytes _should_ be safe because SampleThread is only called during
  // Thread::Current::ProcessPendingSignals, which occurs directly before returning to usermode.
  // At this point, the stack will be shallow.
  vaddr_t bt[kMaxUserBacktraceSize]{};

  vaddr_t fp = 0;
  vaddr_t pc = 0;
  switch (source) {
    case GeneralRegsSource::None:
      break;
    case GeneralRegsSource::Iframe:
#ifdef __x86_64__
      fp = reinterpret_cast<iframe_t*>(gregs)->rbp;
      pc = reinterpret_cast<iframe_t*>(gregs)->ip;
#endif
#ifdef __aarch64__
      bt[frame_num++] = (reinterpret_cast<iframe_t*>(gregs)->elr) - 4;
      fp = reinterpret_cast<iframe_t*>(gregs)->r[29];
      pc = (reinterpret_cast<iframe_t*>(gregs)->lr) - 4;
#endif
#ifdef __riscv
      fp = reinterpret_cast<iframe_t*>(gregs)->regs.s0;
      pc = reinterpret_cast<iframe_t*>(gregs)->regs.pc;
#endif
      break;
#ifdef __x86_64__
    case GeneralRegsSource::Syscall:
      fp = reinterpret_cast<syscall_regs_t*>(gregs)->rbp;
      pc = reinterpret_cast<syscall_regs_t*>(gregs)->rip;
      break;
#endif
  }

  if (pc == 0) {
    return zx::error(ZX_ERR_BAD_STATE);
  }

  bt[frame_num++] = pc;

  while (frame_num < kMaxUserBacktraceSize) {
    vaddr_t actual_fp = fp;
    if (fp == 0) {
      // We've reached the top of the frame pointer chain.
      break;
    }

    // RISC-V has a nonstandard frame pointer which points to the CFA instead of
    // the previous frame pointer. Since the frame pointer and return address are
    // always just below the CFA, subtract 16 bytes to get to the actual frame pointer.
#if __riscv
    actual_fp -= 16;
#endif

    user_in_ptr<const vaddr_t> user_next_fp{reinterpret_cast<vaddr_t*>(actual_fp)};
    user_in_ptr<const vaddr_t> user_pc{reinterpret_cast<vaddr_t*>(actual_fp + 8)};

    // A well-formed frame pointer chain ends in 0 and should never fail to copy. If a thread's
    // stack is not readable or well formatted, we return an error to indicate that sampling
    // should be disabled for the offending thread.
    zx_status_t copy_res = user_pc.copy_from_user(&pc);
    if (copy_res != ZX_OK) {
      // We eat the copy_res and return ZX_ERR_NOT_SUPPORTED here and below to indicate that we
      // failed to take a sample, but we might still succeed in the future. A thread may not
      // necessarily have valid frame pointers at all points in execution, so don't give up on
      // this thread just yet.
      return zx::error(ZX_ERR_NOT_SUPPORTED);
    }
    if (pc == 0) {
      break;
    }
    bt[frame_num++] = pc;
    copy_res = user_next_fp.copy_from_user(&fp);
    if (copy_res != ZX_OK) {
      return zx::error(ZX_ERR_NOT_SUPPORTED);
    }
  }

  internal::PerCpuState& cpu_state = GetPerCpuState(arch_curr_cpu_num());
  bool enabled = cpu_state.SetPendingWrite();
  if (!enabled) {
    // Even though we didn't successfully write a sample, we return a success result -- we should
    // still try to sample the thread as it may later be scheduled on a different cpu.
    return zx::ok();
  }
  auto d = fit::defer([&cpu_state]() { cpu_state.ResetPendingWrite(); });

  constexpr fxt::StringRef<fxt::RefType::kId> empty_string{0};
  const fxt::ThreadRef current_thread{pid, tid};
  zx_status_t write_result =
      fxt::WriteLargeBlobRecordWithMetadata(&cpu_state, current_ticks(), empty_string,
                                            empty_string, current_thread, bt,
                                            sizeof(uint64_t) * frame_num);

  if (write_result != ZX_OK) {
    cpu_state.DisableWrites();
    dprintf(INFO, "Buffer full, disabling writes on cpu: %u\n", arch_curr_cpu_num());
  }
  return zx::ok();
}

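// Called when userspace closes its last handle to the read endpoint. Tears down the sampling
// session and releases the per-CPU state.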
void sampler::ThreadSamplerDispatcher::OnPeerZeroHandlesLocked() {
  DEBUG_ASSERT(GetEndpointId() == IobEndpointId::Ep0);

  // We purposely don't call IoBufferDispatcher::OnPeerZeroHandlesLocked() here. It's used to
  // coordinate and delay ZX_IOB_PEER_CLOSED until any mapped regions have been unmapped. We
  // don't need that logic here. Userspace will never see ZX_IOB_PEER_CLOSED because we will not
  // close the endpoint the kernel holds until after userspace closes the last handle to its
  // endpoint. When that happens, we end up here and are going to destroy our state anyway.

  // The userspace end of the iobuffer has closed. Time to clean up our state.
  if (state_ == SamplingState::Running) {
    StopLocked();
  }

  // After StopLocked, we have prevented further threads from accessing the per_cpu_states and
  // have waited for any threads that were accessing the states to finish.
  //
  // It's now safe to destroy our cpu states. This will destroy the mappings and pinnings that the
  // kernel keeps in order to write to the buffers, but if userspace has its own mappings, those
  // will remain valid.
  per_cpu_state_.reset();
  state_ = SamplingState::Destroyed;
}

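// Arms the sampling timer for the CPU this is called on.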
void sampler::ThreadSamplerDispatcher::SetCurrCpuTimer() {
  GetPerCpuState(arch_curr_cpu_num()).SetTimer();
}

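// Creates the global sampler if there isn't already a live one and returns the read endpoint,
// which is handed to userspace. The kernel keeps the write endpoint in gThreadSampler_.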
zx::result<KernelHandle<sampler::ThreadSamplerDispatcher>> sampler::ThreadSamplerDispatcher::Create(
    const zx_sampler_config_t& config) {
  {
    Guard<Mutex> guard(ThreadSamplerLock::Get());
    if (gThreadSampler_.dispatcher() != nullptr &&
        gThreadSampler_.dispatcher()->State() !=
            sampler::ThreadSamplerDispatcher::SamplingState::Destroyed) {
      return zx::error(ZX_ERR_ALREADY_EXISTS);
    }
  }

  KernelHandle<sampler::ThreadSamplerDispatcher> write_handle;
  KernelHandle<sampler::ThreadSamplerDispatcher> read_handle;
  zx::result res = sampler::ThreadSamplerDispatcher::CreateImpl(config, read_handle, write_handle);
  if (res.is_error()) {
    return res.take_error();
  }

  {
    Guard<Mutex> guard(ThreadSamplerLock::Get());
    // Ensure that someone hasn't created a new sampler since we created ours
    if ((gThreadSampler_.dispatcher() != nullptr &&
         gThreadSampler_.dispatcher()->State() !=
             sampler::ThreadSamplerDispatcher::SamplingState::Destroyed)) {
      return zx::error(ZX_ERR_ALREADY_EXISTS);
    }
    gThreadSampler_ = ktl::move(write_handle);
  }

  return zx::ok(ktl::move(read_handle));
}

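// Stops the active sampling session. The caller's IOB handle must correspond to the read endpoint
// of the current global sampler.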
zx::result<> sampler::ThreadSamplerDispatcher::Stop(const fbl::RefPtr<IoBufferDispatcher>& disp) {
  Guard<Mutex> guard(ThreadSamplerLock::Get());
  if (gThreadSampler_.dispatcher() == nullptr) {
    return zx::error(ZX_ERR_BAD_STATE);
  }
  if (disp->get_koid() != gThreadSampler_.dispatcher()->get_related_koid()) {
    return zx::error(ZX_ERR_BAD_HANDLE);
  }
  return gThreadSampler_.dispatcher()->StopImpl();
}

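// Starts sampling. As with Stop, the caller's IOB handle must match the read endpoint of the
// current global sampler.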
zx::result<> sampler::ThreadSamplerDispatcher::Start(const fbl::RefPtr<IoBufferDispatcher>& disp) {
  Guard<Mutex> guard(ThreadSamplerLock::Get());
  if (gThreadSampler_.dispatcher() == nullptr) {
    return zx::error(ZX_ERR_BAD_STATE);
  }
  if (disp->get_koid() != gThreadSampler_.dispatcher()->get_related_koid()) {
    return zx::error(ZX_ERR_BAD_HANDLE);
  }
  return gThreadSampler_.dispatcher()->StartImpl();
}

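// Enables stack sampling for the given thread, tagging it with the koid of the current global
// sampler so that samples are attributed to the right session.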
zx::result<> sampler::ThreadSamplerDispatcher::AddThread(
    const fbl::RefPtr<IoBufferDispatcher>& disp, const fbl::RefPtr<ThreadDispatcher>& thread) {
  Guard<Mutex> guard(ThreadSamplerLock::Get());
  if (gThreadSampler_.dispatcher() == nullptr) {
    return zx::error(ZX_ERR_BAD_STATE);
  }
  if (disp->get_koid() != gThreadSampler_.dispatcher()->get_related_koid()) {
    return zx::error(ZX_ERR_BAD_HANDLE);
  }
  return zx::make_result(thread->EnableStackSampling(gThreadSampler_.dispatcher()->get_koid()));
}

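// Takes a reference to the active global sampler, if any, and forwards the sample request to it.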
zx::result<> sampler::ThreadSamplerDispatcher::SampleThread(zx_koid_t pid, zx_koid_t tid,
                                                            GeneralRegsSource source, void* gregs,
                                                            uint64_t sampler_koid) {
  if (sampler_koid == ZX_KOID_INVALID) {
    // Whatever thread this sampler_koid came from doesn't have sampling enabled.
    return zx::error(ZX_ERR_NOT_SUPPORTED);
  }

  fbl::RefPtr<sampler::ThreadSamplerDispatcher> sampler_ref;
  {
    // We hold the global ThreadSamplerLock only long enough to increase the reference count on
    // the current sampler, if it exists.
    //
    // Sampling is relatively slow and we don't want to prevent other threads on a different
    // core from making progress.
    //
    // Once we release the ThreadSamplerLock, the global sampler may be replaced with a new one;
    // however, the ThreadSamplerDispatcher maintains enough state that SampleThread and
    // SetCurrCpuTimer will simply short-circuit and return early if that is the case.
    Guard<Mutex> guard(ThreadSamplerLock::Get());
    if (gThreadSampler_.dispatcher() == nullptr ||
        gThreadSampler_.dispatcher()->State() !=
            sampler::ThreadSamplerDispatcher::SamplingState::Running) {
      return zx::error(ZX_ERR_BAD_STATE);
    }
    sampler_ref = gThreadSampler_.dispatcher();
  }

  return sampler_ref->SampleThreadImpl(pid, tid, source, gregs, sampler_koid);
}