// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "round_trips.h"
#include <fuchsia/zircon/benchmarks/cpp/fidl.h>
#include <lib/async-loop/cpp/loop.h>
#include <lib/async-loop/default.h>
#include <lib/fdio/directory.h>
#include <lib/fdio/spawn.h>
#include <lib/scheduler/role.h>
#include <lib/syslog/cpp/macros.h>
#include <lib/zx/channel.h>
#include <lib/zx/handle.h>
#include <pthread.h>
#include <zircon/process.h>
#include <zircon/processargs.h>
#include <zircon/syscalls.h>
#include <zircon/syscalls/port.h>
#include <functional>
#include <iterator>
#include <thread>
#include <vector>
#include "assert.h"
#include "lib/fidl/cpp/binding.h"
#include "test_runner.h"
// This file measures two things:
// 1) The round-trip time of various operations, including Zircon kernel IPC
// primitives. This measures the latency of sending a request to another thread
// or process and receiving a reply back. In this case, there's little
// opportunity for concurrency between the two threads.
// 2) The throughput of IPC operations. This is similar to measuring the
// round-trip time, except that instead of sending and receiving one message,
// the main thread sends N messages and then waits for N messages in reply.
// This allows for more concurrency between the two threads. Currently we only
// test this for Zircon channels.
// Note that the first case is a special case of the second case, with N=1.
// These tests generally use the same IPC primitive in both directions
// (i.e. from client to server and from server to client) for sending and
// receiving wakeups. There are a couple of reasons for that:
// * This allows us to estimate the one-way latency of the IPC primitive
// by dividing the round-trip latency by 2.
// * This keeps the number of tests manageable. If we mixed the
// primitives, the number of possible combinations would be O(n^2) in
// the number of primitives. (For example, we could signal using a
// channel in one direction and a futex in the other direction.)
// An exception is zx_channel_call(), which generally can't be used by a
// server process for receiving requests.
// There are two further dimensions of test variants:
// * "SingleProcess" versus "MultiProcess". The single-process case
// involves round trips between two threads in the same process,
// whereas the multi-process case involves round trips between two
// threads in different processes.
// The multi-process case tends to be slower as a result of
// requiring TLB flushes (or similar operations) when switching
// between processes (if the processes are scheduled on the same
// CPU).
// * "SameCpu" versus "DiffCpu". These variants set the CPU
// affinities of the two threads so that the threads are pinned to
// the same CPU or different CPUs.
// The different-CPU case might be faster as a result of the
// increased parallelism, or it might be slower as a result of IPI
// latency and lock contention between the CPUs.
namespace {
// Block and read a message of size |msg->size()| into |msg| from a channel.
// Returns false if the channel's peer was closed.
//
// |msg| must already be sized to the exact length of the expected message:
// the number of bytes read is checked against |msg->size()| below.
bool ChannelRead(const zx::channel& channel, std::vector<uint8_t>* msg) {
zx_signals_t observed;
// Wait with no deadline until the channel is readable or its peer has closed.
ASSERT_OK(channel.wait_one(ZX_CHANNEL_READABLE | ZX_CHANNEL_PEER_CLOSED, zx::time::infinite(),
// Peer closure is checked before reading, so it is reported even when the
// readable signal is also set.
if (observed & ZX_CHANNEL_PEER_CLOSED)
return false;
uint32_t bytes_read;
// NOTE(review): the call expression before the first comma appears to be
// missing on the next line (presumably a read on |channel|) -- confirm.
ASSERT_OK(, msg->data(), nullptr, static_cast<uint32_t>(msg->size()), 0,
&bytes_read, nullptr));
// The message is required to fill the buffer exactly.
FX_CHECK(bytes_read == msg->size());
return true;
// Block and read |count| messages of size |msg->size()| into |msg| from a
// channel. Returns false if the channel's peer was closed.
//
// Every message is read into the same |msg| buffer, so only the contents of
// the last message read remain in it. Reading stops at the first failed read.
bool ChannelReadMultiple(const zx::channel& channel, uint32_t count, std::vector<uint8_t>* msg) {
for (uint32_t i = 0; i < count; ++i) {
if (!ChannelRead(channel, msg))
return false;
return true;
// Serve requests on a channel: read |count| messages of size |size| and write
// |count| replies.
//
// This repeats in an unbounded loop; each reply has the same size as a
// request (|size| bytes).
void ChannelServe(const zx::channel& channel, uint32_t count, uint32_t size) {
// Single buffer reused for every request read.
std::vector<uint8_t> msg(size);
for (;;) {
// NOTE(review): the statement guarded by this condition is not visible here
// (presumably it leaves the loop once the peer has closed) -- confirm.
if (!ChannelReadMultiple(channel, count, &msg))
for (uint32_t i = 0; i < count; ++i) {
// NOTE(review): the buffer argument of this write appears to be missing.
ASSERT_OK(channel.write(0,, static_cast<uint32_t>(msg.size()), nullptr, 0));
// Sets the role using the scheduler library.
//
// |sched_role_name| is the name of the role to apply. An empty name is
// treated as "no role requested" and is handled by the early-out below.
// NOTE(review): the remainder of this function's body is not visible here.
void SetSchedulerRole(const std::string& sched_role_name) {
// If the scheduler role name is empty, short circuit out.
if (sched_role_name.empty()) {
// Signature of a test's peer-side entry point. It receives the handles that
// were passed to ThreadOrProcess::Launch() for the child thread or process.
typedef void (*ThreadFunc)(std::vector<zx::handle>&& handles);
// Looks up a peer-side entry point by its registered name. Declared here so
// it can be used before its definition further down in this file.
ThreadFunc GetThreadFunc(const char* name);
// Selects where the peer of a test runs: as a second thread in this process
// (SingleProcess) or in a separately spawned process (MultiProcess).
enum MultiProc {
SingleProcess = 1,
MultiProcess = 2,
// Parameters for launching a child thread or process.
struct ThreadOrProcessParams {
// Whether the child is a thread or a subprocess; defaults to a thread.
MultiProc multi_proc = SingleProcess;
// Scheduler role name for the child. In the subprocess case it is passed to
// the child on its command line (see ThreadOrProcess::Launch()).
const std::string sched_role_name;
// Helper class for launching a thread or a subprocess.
//
// Holds both a std::thread and a zx::process member; Launch() populates one
// of them depending on the requested mode, and the destructor waits for the
// launched peer.
class ThreadOrProcess {
// Waits for whichever peer was launched. A subprocess must have terminated
// with return code 0, otherwise the check below fails.
// NOTE(review): the statement guarded by the joinable() check is not visible
// here (presumably it joins the thread) -- confirm.
~ThreadOrProcess() {
if (thread_.joinable())
if (subprocess_) {
// Join the process.
ASSERT_OK(subprocess_.wait_one(ZX_PROCESS_TERMINATED, zx::time::infinite(), nullptr));
// Make sure it exited cleanly.
zx_info_process_t process_info;
ASSERT_OK(subprocess_.get_info(ZX_INFO_PROCESS, &process_info, sizeof(process_info), nullptr,
FX_CHECK(process_info.return_code == 0);
// Starts the peer side of a test.
//
// |func_name| is the registered name of the peer's entry point, |handles| are
// the handles handed over to the peer, and |params| selects thread versus
// subprocess and carries the peer's scheduler role name.
void Launch(const char* func_name, std::vector<zx::handle>&& handles,
ThreadOrProcessParams params) {
if (params.multi_proc == MultiProcess) {
// Subprocess case: spawn the helper binary, telling it on the command line
// which entry point to run and which scheduler role name to use.
const char* executable_path = "/pkg/bin/round_trips_helper";
const char* args[] = {executable_path, "--subprocess", func_name,
params.sched_role_name.c_str(), nullptr};
std::vector<fdio_spawn_action_t> actions;
// One spawn action per handle: handle i is released from its wrapper and
// given the startup-handle id PA_HND(PA_USER0, i).
for (uint32_t i = 0; i < handles.size(); ++i) {
fdio_spawn_action_t action{
.h =
.id = PA_HND(PA_USER0, i),
.handle = handles[i].release(),
.name =
.data = "test-process",
// A failed spawn is fatal; the error message buffer is included in the log.
if (fdio_spawn_etc(ZX_HANDLE_INVALID, FDIO_SPAWN_CLONE_ALL, executable_path, args, nullptr,
actions.size(),, subprocess_.reset_and_get_address(),
err_msg) != ZX_OK) {
FX_LOGS(FATAL) << "Subprocess launch failed: " << err_msg;
} else {
// Thread case: run the peer on a std::thread in this process, moving the
// handles into it.
auto thread_func = [=](std::vector<zx::handle>&& handles) {
thread_ = std::thread(thread_func, std::move(handles));
// Set by Launch() in the single-process case.
std::thread thread_;
// Set by Launch() in the multi-process case.
zx::process subprocess_;
// Convenience function for creating a vector of zx::handles.
//
// Returns a one-element vector whose element wraps |handle|. Callers in this
// file pass a handle obtained from release(), so the vector becomes its owner.
std::vector<zx::handle> MakeHandleVector(zx_handle_t handle) {
// Note that "std::vector<zx::handle> v{h}" creates a vector of size h,
// which is not what we want.
std::vector<zx::handle> vec(1);
vec[0] = zx::handle(handle);
return vec;
// Test IPC round trips and/or throughput using Zircon channels where the client
// and server both use zx_object_wait_one() to wait.
//
// Each Run() writes |msg_count| messages of |msg_size| bytes and then reads
// back |msg_count| replies.
class BasicChannelTest {
// Creates the channel pair, launches the server side with the server
// endpoint, and sends the test arguments to it as the first message.
explicit BasicChannelTest(ThreadOrProcessParams params, uint32_t msg_count, uint32_t msg_size)
: args_({msg_count, msg_size}), msg_(args_.msg_size) {
zx::channel server;
ASSERT_OK(zx::channel::create(0, &server, &client_));
thread_or_process_.Launch("BasicChannelTest::ThreadFunc", MakeHandleVector(server.release()),
// Pass the test arguments to the other thread.
ASSERT_OK(client_.write(0, &args_, sizeof(args_), nullptr, 0));
// Server entry point: expects exactly one handle (the server channel
// endpoint), reads the test arguments from it, then serves requests.
static void ThreadFunc(std::vector<zx::handle>&& handles) {
FX_CHECK(handles.size() == 1);
zx::channel channel(std::move(handles[0]));
Args args;
GetArgs(channel, &args);
ChannelServe(channel, args.msg_count, args.msg_size);
// One benchmark iteration: send all requests first, then wait for all the
// replies.
void Run() {
for (unsigned i = 0; i < args_.msg_count; ++i) {
// NOTE(review): the buffer argument of this write appears to be missing.
ASSERT_OK(client_.write(0,, static_cast<uint32_t>(msg_.size()), nullptr, 0));
FX_CHECK(ChannelReadMultiple(client_, args_.msg_count, &msg_));
// Holds the test arguments sent over a channel.
struct Args {
// Number of messages sent (and replies read) per Run().
uint32_t msg_count;
// Size of each message in bytes.
uint32_t msg_size;
// Reads test arguments from |channel| and stores them in |args|.
// The arguments arrive as a single message of sizeof(Args) bytes.
static void GetArgs(const zx::channel& channel, Args* args) {
std::vector<uint8_t> msg(sizeof(*args));
FX_CHECK(ChannelRead(channel, &msg));
// NOTE(review): the operand of this cast appears to be missing.
*args = *reinterpret_cast<Args*>(;
const Args args_;
// Message buffer of |msg_size| bytes, used for both sending and receiving.
std::vector<uint8_t> msg_;
ThreadOrProcess thread_or_process_;
// Client endpoint of the channel; the server endpoint is given to the peer.
zx::channel client_;
// Test IPC round trips using Zircon channels where the client and server
// both use Zircon ports to wait.
//
// Messages are a single uint32_t; the server writes back each value it reads.
class ChannelPortTest {
// Creates the channel pair, launches the server side with the server
// endpoint, and creates the port the client waits on.
explicit ChannelPortTest(ThreadOrProcessParams params) {
zx::channel server;
ASSERT_OK(zx::channel::create(0, &server, &client_));
thread_or_process_.Launch("ChannelPortTest::ThreadFunc", MakeHandleVector(server.release()),
ASSERT_OK(zx::port::create(0, &client_port_));
// Waits on |port| for |channel| to become readable or for its peer to close,
// then reads one uint32_t message into |msg|. Returns false if the peer was
// closed. The async wait is registered again on every call.
static bool ChannelPortRead(const zx::channel& channel, const zx::port& port, uint32_t* msg) {
ASSERT_OK(channel.wait_async(port, 0, ZX_CHANNEL_READABLE | ZX_CHANNEL_PEER_CLOSED, 0));
zx_port_packet_t packet;
ASSERT_OK(port.wait(zx::time::infinite(), &packet));
if (packet.signal.observed & ZX_CHANNEL_PEER_CLOSED)
return false;
uint32_t bytes_read;
// NOTE(review): the call expression before the first comma appears to be
// missing on the next line (presumably a read on |channel|) -- confirm.
ASSERT_OK(, msg, nullptr, sizeof(*msg), 0, &bytes_read, nullptr));
FX_CHECK(bytes_read == sizeof(*msg));
return true;
// Server entry point: expects exactly one handle (the server channel
// endpoint), creates its own port, and echoes each message back.
static void ThreadFunc(std::vector<zx::handle>&& handles) {
FX_CHECK(handles.size() == 1);
zx::channel channel(std::move(handles[0]));
zx::port port;
ASSERT_OK(zx::port::create(0, &port));
for (;;) {
uint32_t msg;
// NOTE(review): the statement guarded by this condition is not visible here
// (presumably it leaves the loop once the peer has closed) -- confirm.
if (!ChannelPortRead(channel, port, &msg))
ASSERT_OK(channel.write(0, &msg, sizeof(msg), nullptr, 0));
// One benchmark iteration: send one message and wait for the reply.
void Run() {
uint32_t msg = 123;
ASSERT_OK(client_.write(0, &msg, sizeof(msg), nullptr, 0));
FX_CHECK(ChannelPortRead(client_, client_port_, &msg));
ThreadOrProcess thread_or_process_;
// Client endpoint of the channel.
zx::channel client_;
// Port used by the client to wait for replies.
zx::port client_port_;
// Test IPC round trips using Zircon channels where the server uses
// zx_object_wait_one() to wait (as with BasicChannelTest) but the client
// uses zx_channel_call() for the send+wait+read.
class ChannelCallTest {
// Creates the channel pair, launches the server side, and fills in the call
// arguments once so that Run() can reuse them on every iteration.
explicit ChannelCallTest(ThreadOrProcessParams params) {
zx::channel server;
ASSERT_OK(zx::channel::create(0, &server, &client_));
thread_or_process_.Launch("ChannelCallTest::ThreadFunc", MakeHandleVector(server.release()),
msg_ = 0;
// The request is sent from |msg_| and the reply is received into |reply_|;
// no handles are transferred in either direction.
args_.wr_bytes = reinterpret_cast<void*>(&msg_);
args_.wr_handles = nullptr;
args_.rd_bytes = reinterpret_cast<void*>(&reply_);
args_.rd_handles = nullptr;
args_.wr_num_bytes = sizeof(msg_);
args_.wr_num_handles = 0;
args_.rd_num_bytes = sizeof(reply_);
args_.rd_num_handles = 0;
// Server entry point: expects exactly one handle (the server channel
// endpoint) and serves one 4-byte message per round trip, matching the
// uint32_t request sent by the client.
static void ThreadFunc(std::vector<zx::handle>&& handles) {
FX_CHECK(handles.size() == 1);
zx::channel channel(std::move(handles[0]));
ChannelServe(channel, /* count= */ 1, /* size= */ 4);
// One benchmark iteration using the prepared |args_|.
void Run() {
uint32_t bytes_read;
uint32_t handles_read;
// NOTE(review): the call expression before the first comma appears to be
// missing on the next line (presumably a call on |client_|) -- confirm.
ASSERT_OK(, zx::time::infinite(), &args_, &bytes_read, &handles_read));
ThreadOrProcess thread_or_process_;
// Client endpoint of the channel.
zx::channel client_;
// Request payload referenced by |args_|.
uint32_t msg_;
// Reply buffer referenced by |args_|.
uint32_t reply_;
// Call arguments set up once in the constructor.
zx_channel_call_args_t args_;
// Test IPC round trips using Zircon ports, where the client and server
// send each other user packets. This is not a normal use case for ports,
// but it is useful for measuring the overhead of ports.
//
// Two ports are used: the server waits on ports[0] and queues its replies on
// ports[1], which is the port the client waits on.
class PortTest {
// Creates both ports and launches the server side with a duplicate of each,
// so that both sides hold handles to the same two ports.
explicit PortTest(ThreadOrProcessParams params) {
ASSERT_OK(zx::port::create(0, &ports_[0]));
ASSERT_OK(zx::port::create(0, &ports_[1]));
std::vector<zx::handle> ports_dup(2);
for (int i = 0; i < 2; ++i) {
zx::port dup;
ASSERT_OK(ports_[i].duplicate(ZX_RIGHT_SAME_RIGHTS, &dup));
ports_dup[i] = std::move(dup);
thread_or_process_.Launch("PortTest::ThreadFunc", std::move(ports_dup), params);
// Builds the shutdown request: a user packet whose first u32 is non-zero.
// NOTE(review): the statement that queues this packet is not visible here.
~PortTest() {
// Tell the server to shut down.
zx_port_packet_t packet = {};
packet.type = ZX_PKT_TYPE_USER;
packet.user.u32[0] = 1;
// Server entry point: expects exactly two port handles. It waits for a
// packet on ports[0] and queues that same packet on ports[1] as the reply.
static void ThreadFunc(std::vector<zx::handle>&& ports) {
FX_CHECK(ports.size() == 2);
for (;;) {
zx_port_packet_t packet;
ASSERT_OK(zx_port_wait(ports[0].get(), ZX_TIME_INFINITE, &packet));
// Check for a request to shut down.
// NOTE(review): the statement guarded by this condition is not visible here.
if (packet.user.u32[0])
ASSERT_OK(zx_port_queue(ports[1].get(), &packet));
// One benchmark iteration: a zero-initialized user packet is the request,
// and the reply is awaited on ports_[1].
// NOTE(review): the statement that queues the request is not visible here.
void Run() {
zx_port_packet_t packet = {};
packet.type = ZX_PKT_TYPE_USER;
ASSERT_OK(ports_[1].wait(zx::time::infinite(), &packet));
// ports_[0] carries requests to the server; ports_[1] carries replies back.
zx::port ports_[2];
ThreadOrProcess thread_or_process_;
// Helper object for signaling and waiting on a Zircon event object. This
// uses a port for waiting on the event object.
//
// Each side of a test owns one of these, holding one end of an eventpair:
// Signal() raises ZX_USER_SIGNAL_0 on the peer's end, and Wait() blocks until
// that signal is raised on this end.
class EventPortSignaler {
// Creates the port used by Wait().
EventPortSignaler() { ASSERT_OK(zx::port::create(0, &port_)); }
// Takes ownership of this side's end of the eventpair.
void set_event(zx::eventpair&& event) { event_ = std::move(event); }
// Waits for the event to be signaled. Returns true if it was signaled
// by Signal() and false if the peer event object was closed.
bool Wait() {
// The async wait is registered again on every call.
ASSERT_OK(event_.wait_async(port_, 0, ZX_USER_SIGNAL_0 | ZX_EVENTPAIR_PEER_CLOSED, 0));
zx_port_packet_t packet;
ASSERT_OK(port_.wait(zx::time::infinite(), &packet));
if (packet.signal.observed & ZX_EVENTPAIR_PEER_CLOSED)
return false;
// Clear the signal bit.
ASSERT_OK(event_.signal(ZX_USER_SIGNAL_0, 0));
return true;
// Wakes the peer by raising ZX_USER_SIGNAL_0 on the peer's end.
void Signal() {
// Set a signal bit.
ASSERT_OK(event_.signal_peer(0, ZX_USER_SIGNAL_0));
// This side's end of the eventpair.
zx::eventpair event_;
// Port that Wait() blocks on.
zx::port port_;
// Test the round trip time for waking up threads by signaling using Zircon
// event objects. This uses ports for waiting on the events (rather than
// zx_object_wait_one()), because ports are the most general way to wait.
class EventPortTest {
// Creates the eventpair and launches the server side with one end of it.
// NOTE(review): the code handing the other end to |signaler_| is not visible
// here.
explicit EventPortTest(ThreadOrProcessParams params) {
zx::eventpair event1;
zx::eventpair event2;
ASSERT_OK(zx::eventpair::create(0, &event1, &event2));
thread_or_process_.Launch("EventPortTest::ThreadFunc", MakeHandleVector(event2.release()),
// Server entry point: expects exactly one handle and loops for as long as
// Wait() reports a signal, i.e. until the peer end is closed.
static void ThreadFunc(std::vector<zx::handle>&& handles) {
FX_CHECK(handles.size() == 1);
EventPortSignaler signaler;
while (signaler.Wait()) {
// One benchmark iteration. NOTE(review): the body is not visible here.
void Run() {
ThreadOrProcess thread_or_process_;
// Client-side signaler.
EventPortSignaler signaler_;
// Helper object for signaling and waiting on a Zircon socket object. This
// uses a port for waiting on the socket object.
//
// A wakeup is represented by a single byte written to the socket.
class SocketPortSignaler {
// Creates the port used by Wait().
SocketPortSignaler() { ASSERT_OK(zx::port::create(0, &port_)); }
// Takes ownership of this side's end of the socket pair.
void set_socket(zx::socket&& socket) { socket_ = std::move(socket); }
// Waits for the socket to be signaled: reads a byte from the socket.
// Returns true if a byte was read and false if the peer end of the socket
// was closed (ZX_SOCKET_PEER_CLOSED observed).
bool Wait() {
// The async wait is registered again on every call.
ASSERT_OK(socket_.wait_async(port_, 0, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0));
zx_port_packet_t packet;
ASSERT_OK(port_.wait(zx::time::infinite(), &packet));
if (packet.signal.observed & ZX_SOCKET_PEER_CLOSED)
return false;
uint8_t message;
size_t bytes_read = 0;
// NOTE(review): the call expression before the first comma appears to be
// missing on the next line (presumably a read on |socket_|) -- confirm.
ASSERT_OK(, &message, 1, &bytes_read));
FX_CHECK(bytes_read == 1);
return true;
// Signal the socket by writing a byte to it.
void Signal() {
uint8_t message = 0;
size_t bytes_written = 0;
ASSERT_OK(socket_.write(0, &message, 1, &bytes_written));
FX_CHECK(bytes_written == 1);
// This side's end of the socket pair.
zx::socket socket_;
// Port that Wait() blocks on.
zx::port port_;
// Test the round trip time for waking up threads by reading and writing
// bytes on Zircon socket objects. This uses ports for waiting on the
// sockets (rather than zx_object_wait_one()), because ports are the most
// general way to wait.
class SocketPortTest {
// Creates the socket pair and launches the server side with one end of it.
// NOTE(review): the code handing the other end to |signaler_| is not visible
// here.
explicit SocketPortTest(ThreadOrProcessParams params) {
zx::socket socket1;
zx::socket socket2;
ASSERT_OK(zx::socket::create(0, &socket1, &socket2));
thread_or_process_.Launch("SocketPortTest::ThreadFunc", MakeHandleVector(socket2.release()),
// Server entry point: expects exactly one handle and loops for as long as
// Wait() reads a byte, i.e. until the peer end is closed.
static void ThreadFunc(std::vector<zx::handle>&& handles) {
FX_CHECK(handles.size() == 1);
SocketPortSignaler signaler;
while (signaler.Wait()) {
// One benchmark iteration. NOTE(review): the body is not visible here.
void Run() {
ThreadOrProcess thread_or_process_;
// Client-side signaler.
SocketPortSignaler signaler_;
// Implementation of FIDL interface for testing round trip IPCs.
class RoundTripperImpl : public fuchsia::zircon::benchmarks::RoundTripper {
// Handles one round-trip request. The argument must be 123, the value sent
// by FidlTest::Run().
// NOTE(review): the code that invokes |callback| is not visible here;
// FidlTest::Run() expects the result 456.
void RoundTripTest(uint32_t arg, RoundTripTestCallback callback) override {
FX_CHECK(arg == 123);
// Test IPC round trips using FIDL IPC. This uses a synchronous IPC on the
// client side.
class FidlTest {
// Creates a new request on the synchronous client pointer and launches the
// server side with the resulting channel.
explicit FidlTest(ThreadOrProcessParams params) {
zx_handle_t server = service_ptr_.NewRequest().TakeChannel().release();
thread_or_process_.Launch("FidlTest::ThreadFunc", MakeHandleVector(server), params);
// Server entry point: expects exactly one handle (the server channel), binds
// a RoundTripperImpl, and uses an async loop attached to the current thread.
// The loop is told to quit when the binding reports an error.
static void ThreadFunc(std::vector<zx::handle>&& handles) {
FX_CHECK(handles.size() == 1);
zx::channel channel(std::move(handles[0]));
async::Loop loop(&kAsyncLoopConfigAttachToCurrentThread);
RoundTripperImpl service_impl;
fidl::Binding<fuchsia::zircon::benchmarks::RoundTripper> binding(&service_impl,
binding.set_error_handler([&loop](zx_status_t status) { loop.Quit(); });
// One benchmark iteration: a synchronous call with argument 123 that must
// return 456.
void Run() {
uint32_t result;
ASSERT_OK(service_ptr_->RoundTripTest(123, &result));
FX_CHECK(result == 456);
ThreadOrProcess thread_or_process_;
// Synchronous client end of the FIDL connection.
fuchsia::zircon::benchmarks::RoundTripperSyncPtr service_ptr_;
// Associates a peer-side entry point with the name used to look it up.
struct ThreadFuncEntry {
// Name as passed to ThreadOrProcess::Launch(), e.g. "FidlTest::ThreadFunc".
const char* name;
// The entry point itself.
ThreadFunc func;
// Table searched by GetThreadFunc(). DEF_FUNC produces one entry whose name
// is the stringified form of the function expression.
// NOTE(review): the table entries themselves are not visible here.
// clang-format off
const ThreadFuncEntry thread_funcs[] = {
#define DEF_FUNC(FUNC) { #FUNC, FUNC },
#undef DEF_FUNC
// clang-format on
// Returns the entry point registered in |thread_funcs| under |name|, using an
// exact string comparison. If no entry matches, a FATAL message is logged;
// the trailing return of nullptr follows that log statement.
ThreadFunc GetThreadFunc(const char* name) {
for (size_t i = 0; i < std::size(thread_funcs); ++i) {
if (!strcmp(name, thread_funcs[i].name))
return thread_funcs[i].func;
FX_LOGS(FATAL) << "Thread function not found: " << name;
return nullptr;
// Register a test that has two variants, single-process and multi-process.
//
// The variants are registered under |base_name| with the suffixes
// "_SingleProcess" and "_MultiProcess" respectively.
// NOTE(review): the remaining arguments of both registration calls are not
// visible here.
template <class TestClass, typename... Args>
void RegisterTestMultiProc(const char* base_name, Args... args) {
fbenchmark::RegisterTest<TestClass>((std::string(base_name) + "_SingleProcess").c_str(),
fbenchmark::RegisterTest<TestClass>((std::string(base_name) + "_MultiProcess").c_str(),
// Call the given function with the given scheduler role.
// Fuchsia does not currently provide a way to restore the zx::profile
// for a thread after setting it, so in order to leave the zx::profile
// of the calling thread unmodified, this creates a new thread for
// running the function.
//
// An empty |sched_role_name| takes the first branch, which does not create
// the extra thread.
// NOTE(review): the bodies of both branches are not visible here.
void CallWithSchedulerRole(const std::string& sched_role_name, std::function<void()> func) {
if (sched_role_name.empty()) {
} else {
std::thread thread([=] {
// Register a test where the Run() method is run on a thread with the
// given scheduler role.
//
// The TestClass instance is constructed from |args| inside the
// CallWithSchedulerRole() callback, and the loop runs for as long as
// state->KeepRunning() returns true. The registered callable returns true.
template <class TestClass, typename... Args>
void RegisterTestWithSchedulerRole(const char* test_name, const std::string& sched_role_name,
Args... args) {
perftest::RegisterTest(test_name, [=](perftest::RepeatState* state) {
CallWithSchedulerRole(sched_role_name, [=] {
TestClass test(args...);
// NOTE(review): the loop body is not visible here.
while (state->KeepRunning()) {
return true;
// Register a test with instantiations covering the same-CPU and
// different-CPU cases as well as the single-process and multi-process
// cases.
//
// One test is registered per combination of the two parameter tables below,
// named |base_name| + process suffix + CPU suffix.
template <class TestClass>
void RegisterTestMultiProcSameDiffCpu(const char* base_name) {
// Thread-versus-subprocess dimension.
struct MultiProcParam {
const char* suffix;
MultiProc value;
const static MultiProcParam multi_proc_params[] = {
{"_SingleProcess", SingleProcess},
{"_MultiProcess", MultiProcess},
// CPU-placement dimension, expressed as scheduler role names for the parent
// (client) thread and the child (server) thread.
struct CpuParam {
const char* suffix;
const std::string parent_thread_role_name;
const std::string child_thread_role_name;
// These parameters pin the threads to CPUs 0 and 1. This is
// reasonable on systems with uniform CPUs, such as NUCs. This
// would need to be revisited for systems with non-uniform CPUs,
// e.g. big.LITTLE systems such as VIM3s. On a single-CPU system,
// the pinning should have no effect.
const static CpuParam cpu_params[] = {
{"_SameCpu", "fuchsia.microbenchmarks.pin_to_cpu_0", "fuchsia.microbenchmarks.pin_to_cpu_0"},
{"_DiffCpu", "fuchsia.microbenchmarks.pin_to_cpu_0", "fuchsia.microbenchmarks.pin_to_cpu_1"},
for (auto multi_proc_param : multi_proc_params) {
for (auto cpu_param : cpu_params) {
// NOTE(review): the head of the registration call whose arguments follow is
// not visible here. The child's role name is carried to the peer through
// ThreadOrProcessParams.
(std::string(base_name) + multi_proc_param.suffix + cpu_param.suffix).c_str(),
ThreadOrProcessParams{multi_proc_param.value, cpu_param.child_thread_role_name});
// Registers the benchmark instantiations defined in this file.
// NOTE(review): only the trailing arguments of the registration calls are
// visible here; the call heads are missing.
void RegisterTests() {
// Single small message (4 bytes).
/* count= */ 1, /* size= */ 4);
// Single large message (64 KiB).
/* msg_count= */ 1, /* msg_size= */ 64 * 1024);
// These next two benchmarks allocate and free a significant amount of
// memory so their performance can be heavily dependent on kernel allocator
// performance.
/* msg_count= */ 1024, /* msg_size= */ 4);
/* msg_count= */ 1024, /* msg_size= */ 64 * 1024);
// To avoid creating too many test instantiations and metrics, we
// only instantiate one of these tests for the same-CPU and
// different-CPU cases.
} // namespace
void RunSubprocess(const char* func_name, const char* sched_role_name) {
auto func = GetThreadFunc(func_name);
// Retrieve the handles.
std::vector<zx::handle> handles;
for (;;) {
uint32_t index = static_cast<uint32_t>(handles.size());
zx::handle handle(zx_take_startup_handle(PA_HND(PA_USER0, index)));
if (!handle)