blob: fa0adf01827be347e28e510914f3474cf1f0a1a3 [file] [log] [blame]
// Use ALWAYS at the tag level. Control is performed manually during command
// line processing.
#define ATRACE_TAG ATRACE_TAG_ALWAYS
#include <utils/Trace.h>

#include <base/files/file_util.h>
#include <base/logging.h>
#include <base/strings/string_split.h>

#include <errno.h>
#include <getopt.h>
#include <string.h>
#include <sys/prctl.h>
#include <time.h>
#include <unistd.h>

#include <pdx/client.h>
#include <pdx/default_transport/client_channel_factory.h>
#include <pdx/default_transport/service_endpoint.h>
#include <pdx/rpc/buffer_wrapper.h>
#include <pdx/rpc/default_initialization_allocator.h>
#include <pdx/rpc/message_buffer.h>
#include <pdx/rpc/remote_method.h>
#include <pdx/rpc/serializable.h>
#include <pdx/service.h>

#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <cstdlib>
#include <functional>
#include <future>
#include <iomanip>
#include <ios>
#include <iostream>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
using android::pdx::Channel;
using android::pdx::ClientBase;
using android::pdx::Endpoint;
using android::pdx::ErrorStatus;
using android::pdx::Message;
using android::pdx::Service;
using android::pdx::ServiceBase;
using android::pdx::default_transport::ClientChannelFactory;
using android::pdx::Status;
using android::pdx::Transaction;
using android::pdx::rpc::BufferWrapper;
using android::pdx::rpc::DefaultInitializationAllocator;
using android::pdx::rpc::MessageBuffer;
using android::pdx::rpc::DispatchRemoteMethod;
using android::pdx::rpc::RemoteMethodReturn;
using android::pdx::rpc::ReplyBuffer;
using android::pdx::rpc::Void;
using android::pdx::rpc::WrapBuffer;
namespace {
// Upper bound (4 MiB) for message payloads; also sizes the service's scratch
// buffers and the client receive buffer.
constexpr size_t kMaxMessageSize = 4096 * 1024;
// Builds the per-instance endpoint path by appending the instance number,
// e.g. ("/svc/bench", 2) -> "/svc/bench2".
std::string GetServicePath(const std::string& path, int instance_id) {
  std::ostringstream stream;
  stream << path << instance_id;
  return stream.str();
}
// Sets the calling thread's name (comm) via prctl(PR_SET_NAME). The kernel
// truncates names longer than its 16-byte comm limit.
void SetThreadName(const std::string& name) {
  const unsigned long name_arg =
      reinterpret_cast<unsigned long>(name.c_str());
  prctl(PR_SET_NAME, name_arg, 0, 0, 0);
}
constexpr uint64_t kNanosPerSecond = 1000000000llu;

// Returns the current CLOCK_MONOTONIC time in nanoseconds.
uint64_t GetClockNs() {
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return static_cast<uint64_t>(ts.tv_sec) * kNanosPerSecond + ts.tv_nsec;
}
// Yields sizeof(T) as a signed value, convenient when comparing against
// ssize_t-style return codes without signed/unsigned warnings.
template <typename T>
ssize_t ssizeof(const T&) {
  return static_cast<ssize_t>(sizeof(T));
}
// Per-task scheduler statistics read from /proc/<tid>/schedstat. Each call
// to Update() snapshots the kernel's cumulative counters and records the
// delta since the previous snapshot.
class SchedStats {
 public:
  SchedStats() : SchedStats(gettid()) {}
  SchedStats(pid_t task_id) : task_id_(task_id) {}
  SchedStats(const SchedStats&) = default;
  SchedStats& operator=(const SchedStats&) = default;

  // Re-reads the schedstat file and refreshes the delta counters.
  void Update() {
    const std::string stats_path =
        "/proc/" + std::to_string(task_id_) + "/schedstat";
    std::string line;
    base::ReadFileToString(base::FilePath{stats_path}, &line);
    std::vector<std::string> fields = base::SplitString(
        line, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
    CHECK_EQ(3u, fields.size());
    // The three fields are cumulative since the task started: time on cpu,
    // time runnable-but-waiting, and number of timeslices.
    const uint64_t cpu_ns = std::stoull(fields[0]);
    const uint64_t waiting_ns = std::stoull(fields[1]);
    const uint64_t slices = std::stoull(fields[2]);
    cpu_time_ns_ = cpu_ns - last_cpu_time_ns_;
    wait_ns_ = waiting_ns - last_wait_ns_;
    timeslices_ = slices - last_timeslices_;
    last_cpu_time_ns_ = cpu_ns;
    last_wait_ns_ = waiting_ns;
    last_timeslices_ = slices;
  }

  pid_t task_id() const { return task_id_; }
  uint64_t cpu_time_ns() const { return cpu_time_ns_; }
  uint64_t wait_ns() const { return wait_ns_; }
  uint64_t timeslices() const { return timeslices_; }
  double cpu_time_s() const {
    return static_cast<double>(cpu_time_ns_) / kNanosPerSecond;
  }
  double wait_s() const {
    return static_cast<double>(wait_ns_) / kNanosPerSecond;
  }

 private:
  int32_t task_id_;
  uint64_t cpu_time_ns_ = 0;
  uint64_t last_cpu_time_ns_ = 0;
  uint64_t wait_ns_ = 0;
  uint64_t last_wait_ns_ = 0;
  uint64_t timeslices_ = 0;
  uint64_t last_timeslices_ = 0;

  PDX_SERIALIZABLE_MEMBERS(SchedStats, task_id_, cpu_time_ns_, wait_ns_,
                           timeslices_);
};
// Opcodes for the client/service protocol. The values travel on the wire,
// so they are spelled out explicitly to make the protocol ordering obvious.
struct BenchmarkOps {
  enum : int {
    Nop = 0,
    Read = 1,
    Write = 2,
    Echo = 3,
    Stats = 4,
    WriteVector = 5,
    EchoVector = 6,
    Quit = 7,
  };
};
// RPC method descriptors shared by client and service. Opcodes reuse the
// BenchmarkOps values, so RPC-based and raw messages share one protocol.
struct BenchmarkRPC {
// Returns the service-side receive timestamp and reply timestamp (both
// CLOCK_MONOTONIC ns) plus the dispatch thread's SchedStats deltas.
PDX_REMOTE_METHOD(Stats, BenchmarkOps::Stats,
std::tuple<uint64_t, uint64_t, SchedStats>(Void));
// Sends a byte buffer; the service replies with the number of bytes it
// received (see BenchmarkService::OnWriteVector).
PDX_REMOTE_METHOD(WriteVector, BenchmarkOps::WriteVector,
int(const BufferWrapper<std::vector<uint8_t>> data));
// Sends a byte buffer and receives the same bytes echoed back.
PDX_REMOTE_METHOD(EchoVector, BenchmarkOps::EchoVector,
BufferWrapper<std::vector<uint8_t>>(
const BufferWrapper<std::vector<uint8_t>> data));
};
// Per-thread results reported back to ClientCommand() through a promise.
struct BenchmarkResult {
// Global index of the client thread that produced this result.
int thread_id = 0;
// Index of the service instance the thread talked to.
int service_id = 0;
// Wall time spent in the benchmark loop, in seconds.
double time_delta_s = 0.0;
// Total payload bytes transferred (counted both ways for echo ops).
uint64_t bytes_sent = 0;
// Scheduler stat deltas covering the benchmark loop.
SchedStats sched_stats = {};
};
// Global command line option values.
struct Options {
// Enables VLOG(1) output (--verbose).
bool verbose = false;
// Worker/dispatch threads per service instance (--threads).
int threads = 1;
// Benchmark operation to run (--op); one of BenchmarkOps.
int opcode = BenchmarkOps::Read;
// Payload size in bytes (--bs).
int blocksize = 1;
// Transactions per client thread (--count).
int count = 1;
// Number of service instances/endpoints (--instances).
int instances = 1;
// Client connect timeout; -1 waits indefinitely (--timeout).
int timeout = 1;
// Busy-loop warmup passes before the timed loop (--warmup).
int warmup = 0;
} ProgramOptions;
// Command line option names.
const char kOptionService[] = "service";
const char kOptionClient[] = "client";
const char kOptionVerbose[] = "verbose";
const char kOptionOpcode[] = "op";
const char kOptionBlocksize[] = "bs";
const char kOptionCount[] = "count";
const char kOptionThreads[] = "threads";
const char kOptionInstances[] = "instances";
const char kOptionTimeout[] = "timeout";
const char kOptionTrace[] = "trace";
const char kOptionWarmup[] = "warmup";
// getopt() long options. All options are long-form only; each entry uses a
// null flag pointer so getopt_long() returns 0 and main() identifies the
// option through option_index.
static option long_options[] = {
{kOptionService, required_argument, 0, 0},
{kOptionClient, required_argument, 0, 0},
{kOptionVerbose, no_argument, 0, 0},
{kOptionOpcode, required_argument, 0, 0},
{kOptionBlocksize, required_argument, 0, 0},
{kOptionCount, required_argument, 0, 0},
{kOptionThreads, required_argument, 0, 0},
{kOptionInstances, required_argument, 0, 0},
{kOptionTimeout, required_argument, 0, 0},
{kOptionTrace, no_argument, 0, 0},
{kOptionWarmup, required_argument, 0, 0},
// Terminator entry required by getopt_long().
{0, 0, 0, 0},
};
// Parses the argument for kOptionOpcode and sets ProgramOptions.opcode.
// Accepts the symbolic names below or a raw numeric opcode; a non-numeric,
// unrecognized argument makes std::stoi throw (same as before).
void ParseOpcodeOption(const std::string& argument) {
  struct OpcodeName {
    const char* name;
    int opcode;
  };
  static const OpcodeName kOpcodeNames[] = {
      {"read", BenchmarkOps::Read},
      {"write", BenchmarkOps::Write},
      {"echo", BenchmarkOps::Echo},
      {"writevec", BenchmarkOps::WriteVector},
      {"echovec", BenchmarkOps::EchoVector},
      {"quit", BenchmarkOps::Quit},
      {"nop", BenchmarkOps::Nop},
      {"stats", BenchmarkOps::Stats},
  };
  for (const auto& entry : kOpcodeNames) {
    if (argument == entry.name) {
      ProgramOptions.opcode = entry.opcode;
      return;
    }
  }
  // Not a symbolic name: treat the argument as a raw numeric opcode.
  ProgramOptions.opcode = std::stoi(argument);
}
// Implements the service side of the benchmark.
class BenchmarkService : public ServiceBase<BenchmarkService> {
 public:
  std::shared_ptr<Channel> OnChannelOpen(Message& message) override {
    // Fixed log label: this hook is OnChannelOpen (was "OnChannelCreate").
    VLOG(1) << "BenchmarkService::OnChannelOpen: cid="
            << message.GetChannelId();
    return nullptr;
  }

  void OnChannelClose(Message& message,
                      const std::shared_ptr<Channel>& /*channel*/) override {
    VLOG(1) << "BenchmarkService::OnChannelClose: cid="
            << message.GetChannelId();
  }

  // Handles one benchmark request. Raw ops (Nop/Read/Write/Echo) are served
  // with manual buffer reads/writes; Stats/WriteVector/EchoVector go through
  // the pdx RPC layer; Quit cancels the service's dispatch loop.
  Status<void> HandleMessage(Message& message) override {
    ATRACE_NAME("BenchmarkService::HandleMessage");
    switch (message.GetOp()) {
      case BenchmarkOps::Nop:
        VLOG(1) << "BenchmarkService::HandleMessage: op=nop";
        {
          ATRACE_NAME("Reply");
          CHECK(message.Reply(0));
        }
        return {};

      case BenchmarkOps::Write: {
        VLOG(1) << "BenchmarkService::HandleMessage: op=write send_length="
                << message.GetSendLength()
                << " receive_length=" << message.GetReceiveLength();
        Status<void> status;
        if (message.GetSendLength())
          status = message.ReadAll(send_buffer.data(), message.GetSendLength());
        {
          ATRACE_NAME("Reply");
          if (!status)
            CHECK(message.ReplyError(status.error()));
          else
            CHECK(message.Reply(message.GetSendLength()));
        }
        return {};
      }

      case BenchmarkOps::Read: {
        VLOG(1) << "BenchmarkService::HandleMessage: op=read send_length="
                << message.GetSendLength()
                << " receive_length=" << message.GetReceiveLength();
        Status<void> status;
        if (message.GetReceiveLength()) {
          status = message.WriteAll(receive_buffer.data(),
                                    message.GetReceiveLength());
        }
        {
          ATRACE_NAME("Reply");
          if (!status)
            CHECK(message.ReplyError(status.error()));
          else
            CHECK(message.Reply(message.GetReceiveLength()));
        }
        return {};
      }

      case BenchmarkOps::Echo: {
        VLOG(1) << "BenchmarkService::HandleMessage: op=echo send_length="
                << message.GetSendLength()
                << " receive_length=" << message.GetReceiveLength();
        Status<void> status;
        if (message.GetSendLength())
          status = message.ReadAll(send_buffer.data(), message.GetSendLength());
        if (!status) {
          CHECK(message.ReplyError(status.error()));
          return {};
        }
        // Echo back exactly the bytes that were received.
        if (message.GetSendLength()) {
          status =
              message.WriteAll(send_buffer.data(), message.GetSendLength());
        }
        {
          ATRACE_NAME("Reply");
          if (!status)
            CHECK(message.ReplyError(status.error()));
          else
            CHECK(message.Reply(message.GetSendLength()));
        }
        return {};
      }

      case BenchmarkOps::Stats: {
        // Fixed log label: this is the stats op (was "op=echo").
        VLOG(1) << "BenchmarkService::HandleMessage: op=stats send_length="
                << message.GetSendLength()
                << " receive_length=" << message.GetReceiveLength();
        // Snapshot the stats when the message is received.
        const uint64_t receive_time_ns = GetClockNs();
        sched_stats_.Update();
        // Use the RPC system to return the results.
        RemoteMethodReturn<BenchmarkRPC::Stats>(
            message, BenchmarkRPC::Stats::Return{receive_time_ns, GetClockNs(),
                                                 sched_stats_});
        return {};
      }

      case BenchmarkOps::WriteVector:
        VLOG(1) << "BenchmarkService::HandleMessage: op=writevec send_length="
                << message.GetSendLength()
                << " receive_length=" << message.GetReceiveLength();
        DispatchRemoteMethod<BenchmarkRPC::WriteVector>(
            *this, &BenchmarkService::OnWriteVector, message, kMaxMessageSize);
        return {};

      case BenchmarkOps::EchoVector:
        VLOG(1) << "BenchmarkService::HandleMessage: op=echovec send_length="
                << message.GetSendLength()
                << " receive_length=" << message.GetReceiveLength();
        DispatchRemoteMethod<BenchmarkRPC::EchoVector>(
            *this, &BenchmarkService::OnEchoVector, message, kMaxMessageSize);
        return {};

      case BenchmarkOps::Quit:
        // Cancel() stops the dispatch loops; ESHUTDOWN is the expected quit
        // signal observed by ServiceCommand's threads.
        Cancel();
        return ErrorStatus{ESHUTDOWN};

      default:
        VLOG(1) << "BenchmarkService::HandleMessage: default case; op="
                << message.GetOp();
        return Service::DefaultHandleMessage(message);
    }
  }

  // Updates the scheduler stats from procfs for this thread.
  void UpdateSchedStats() { sched_stats_.Update(); }

 private:
  friend BASE;

  BenchmarkService(std::unique_ptr<Endpoint> endpoint)
      : BASE("BenchmarkService", std::move(endpoint)),
        send_buffer(kMaxMessageSize),
        receive_buffer(kMaxMessageSize) {}

  // Scratch buffers sized to the maximum message. NOTE(review): these are
  // shared by all dispatch threads of an instance without locking; the
  // benchmark never validates payload contents, so this appears intentional.
  std::vector<uint8_t> send_buffer;
  std::vector<uint8_t> receive_buffer;

  // Each service thread has its own scheduler stats object.
  static thread_local SchedStats sched_stats_;

  using BufferType = BufferWrapper<
      std::vector<uint8_t, DefaultInitializationAllocator<uint8_t>>>;

  // RPC handlers for the vector ops: report received size / echo the buffer.
  int OnWriteVector(Message&, const BufferType& data) { return data.size(); }
  BufferType OnEchoVector(Message&, BufferType&& data) {
    return std::move(data);
  }

  BenchmarkService(const BenchmarkService&) = delete;
  void operator=(const BenchmarkService&) = delete;
};
// Out-of-line definition for BenchmarkService's per-thread SchedStats.
thread_local SchedStats BenchmarkService::sched_stats_;
// Implements the client side of the benchmark.
class BenchmarkClient : public ClientBase<BenchmarkClient> {
 public:
  // Sends an empty transaction; returns the service reply or -errno.
  int Nop() {
    ATRACE_NAME("BenchmarkClient::Nop");
    VLOG(1) << "BenchmarkClient::Nop";
    Transaction transaction{*this};
    return ReturnStatusOrError(transaction.Send<int>(BenchmarkOps::Nop));
  }

  // Sends |length| bytes from |buffer|; returns bytes accepted or -errno.
  int Write(const void* buffer, size_t length) {
    ATRACE_NAME("BenchmarkClient::Write");
    VLOG(1) << "BenchmarkClient::Write: buffer=" << buffer
            << " length=" << length;
    Transaction transaction{*this};
    return ReturnStatusOrError(
        transaction.Send<int>(BenchmarkOps::Write, buffer, length, nullptr, 0));
    // return write(endpoint_fd(), buffer, length);
  }

  // Receives up to |length| bytes into |buffer|; returns bytes or -errno.
  int Read(void* buffer, size_t length) {
    ATRACE_NAME("BenchmarkClient::Read");
    VLOG(1) << "BenchmarkClient::Read: buffer=" << buffer
            << " length=" << length;
    Transaction transaction{*this};
    return ReturnStatusOrError(
        transaction.Send<int>(BenchmarkOps::Read, nullptr, 0, buffer, length));
    // return read(endpoint_fd(), buffer, length);
  }

  // Sends |send_length| bytes and receives them echoed back.
  int Echo(const void* send_buffer, size_t send_length, void* receive_buffer,
           size_t receive_length) {
    ATRACE_NAME("BenchmarkClient::Echo");
    VLOG(1) << "BenchmarkClient::Echo: send_buffer=" << send_buffer
            << " send_length=" << send_length
            << " receive_buffer=" << receive_buffer
            << " receive_length=" << receive_length;
    Transaction transaction{*this};
    return ReturnStatusOrError(
        transaction.Send<int>(BenchmarkOps::Echo, send_buffer, send_length,
                              receive_buffer, receive_length));
  }

  // Fetches timing and scheduler stats from the service dispatch thread.
  int Stats(std::tuple<uint64_t, uint64_t, SchedStats>* stats_out) {
    ATRACE_NAME("BenchmarkClient::Stats");
    VLOG(1) << "BenchmarkClient::Stats";
    auto status = InvokeRemoteMethodInPlace<BenchmarkRPC::Stats>(stats_out);
    return status ? 0 : -status.error();
  }

  // Sends a buffer through the RPC serialization layer.
  int WriteVector(const BufferWrapper<std::vector<uint8_t>>& data) {
    // Fixed trace/log labels: these previously said "Stats" (copy/paste).
    ATRACE_NAME("BenchmarkClient::WriteVector");
    VLOG(1) << "BenchmarkClient::WriteVector";
    auto status = InvokeRemoteMethod<BenchmarkRPC::WriteVector>(data);
    return ReturnStatusOrError(status);
  }

  template <typename T>
  int WriteVector(const BufferWrapper<T>& data) {
    ATRACE_NAME("BenchmarkClient::WriteVector");
    VLOG(1) << "BenchmarkClient::WriteVector";
    auto status = InvokeRemoteMethod<BenchmarkRPC::WriteVector>(data);
    return ReturnStatusOrError(status);
  }

  // Round-trips a buffer through the RPC serialization layer.
  template <typename T, typename U>
  int EchoVector(const BufferWrapper<T>& data, BufferWrapper<U>* data_out) {
    ATRACE_NAME("BenchmarkClient::EchoVector");
    VLOG(1) << "BenchmarkClient::EchoVector";
    MessageBuffer<ReplyBuffer>::Reserve(kMaxMessageSize - 1);
    auto status =
        InvokeRemoteMethodInPlace<BenchmarkRPC::EchoVector>(data_out, data);
    return status ? 0 : -status.error();
  }

  // Asks the service to shut down.
  int Quit() {
    VLOG(1) << "BenchmarkClient::Quit";
    Transaction transaction{*this};
    // Fixed: this previously sent BenchmarkOps::Echo, so "--op quit" never
    // actually stopped the service.
    return ReturnStatusOrError(transaction.Send<int>(BenchmarkOps::Quit));
  }

 private:
  friend BASE;

  BenchmarkClient(const std::string& service_path)
      : BASE(ClientChannelFactory::Create(service_path),
             ProgramOptions.timeout) {}

  BenchmarkClient(const BenchmarkClient&) = delete;
  void operator=(const BenchmarkClient&) = delete;
};
// Creates a benchmark service at |path| and dispatches messages.
// Starts ProgramOptions.instances service instances, each with
// ProgramOptions.threads dispatch threads, then blocks until every dispatch
// thread exits. Returns 0 on success, -EINVAL if |path| is empty.
int ServiceCommand(const std::string& path) {
if (path.empty())
return -EINVAL;
// Start the requested number of dispatch threads.
std::vector<std::thread> dispatch_threads;
int service_count = ProgramOptions.instances;
int service_id_counter = 0;
int thread_id_counter = 0;
// Shared shutdown flag: any thread that hits an error or sees ESHUTDOWN
// sets this so the other dispatch loops stop after their current receive.
std::atomic<bool> done(false);
while (service_count--) {
std::cerr << "Starting service instance " << service_id_counter
<< std::endl;
auto service = BenchmarkService::Create(
android::pdx::default_transport::Endpoint::CreateAndBindSocket(
GetServicePath(path, service_id_counter),
android::pdx::default_transport::Endpoint::kBlocking));
if (!service) {
std::cerr << "Failed to create service instance!!" << std::endl;
done = true;
break;
}
int thread_count = ProgramOptions.threads;
while (thread_count--) {
std::cerr << "Starting dispatch thread " << thread_id_counter
<< " service " << service_id_counter << std::endl;
// The lambda captures |done| by reference; that is safe because every
// thread is joined before this function returns.
dispatch_threads.emplace_back(
[&](const int thread_id, const int service_id,
const std::shared_ptr<BenchmarkService>& local_service) {
SetThreadName("service" + std::to_string(service_id));
// Read the initial schedstats for this thread from procfs.
local_service->UpdateSchedStats();
ATRACE_NAME("BenchmarkService::Dispatch");
while (!done) {
auto ret = local_service->ReceiveAndDispatch();
if (!ret) {
// ESHUTDOWN is the expected quit signal (see BenchmarkOps::Quit);
// anything else is a genuine dispatch error. Either way this
// thread exits and signals the others to wind down.
if (ret.error() != ESHUTDOWN) {
std::cerr << "Error while dispatching message on thread "
<< thread_id << " service " << service_id << ": "
<< ret.GetErrorMessage() << std::endl;
} else {
std::cerr << "Quitting thread " << thread_id << " service "
<< service_id << std::endl;
}
done = true;
return;
}
}
},
thread_id_counter++, service_id_counter, service);
}
service_id_counter++;
}
// Wait for the dispatch threads to exit.
for (auto& thread : dispatch_threads) {
thread.join();
}
return 0;
}
// Runs the client side of the benchmark against the service at |path|:
// spawns ProgramOptions.instances * ProgramOptions.threads worker threads,
// runs ProgramOptions.count transactions on each, then aggregates and prints
// per-thread and overall throughput/latency statistics. Returns 0 on
// success.
int ClientCommand(const std::string& path) {
  // Start the requested number of client threads.
  std::vector<std::thread> client_threads;
  std::vector<std::future<BenchmarkResult>> client_results;
  int service_count = ProgramOptions.instances;
  int thread_id_counter = 0;
  int service_id_counter = 0;
  // Aggregate statistics, updated when worker threads exit.
  std::atomic<uint64_t> total_bytes(0);
  std::atomic<uint64_t> total_time_ns(0);
  // Samples for variance calculation. Each worker writes only its own
  // samples_per_thread slice, so no synchronization is needed.
  std::vector<uint64_t> latency_samples_ns(
      ProgramOptions.instances * ProgramOptions.threads * ProgramOptions.count);
  const size_t samples_per_thread = ProgramOptions.count;
  std::vector<uint8_t> send_buffer(ProgramOptions.blocksize);
  std::vector<uint8_t> receive_buffer(kMaxMessageSize);
  // Barriers for synchronizing thread start.
  std::vector<std::future<void>> ready_barrier_futures;
  std::promise<void> go_barrier_promise;
  std::future<void> go_barrier_future = go_barrier_promise.get_future();
  // Barrier for synchronizing thread tear down.
  std::promise<void> done_barrier_promise;
  std::future<void> done_barrier_future = done_barrier_promise.get_future();
  while (service_count--) {
    int thread_count = ProgramOptions.threads;
    while (thread_count--) {
      std::cerr << "Starting client thread " << thread_id_counter << " service "
                << service_id_counter << std::endl;
      std::promise<BenchmarkResult> result_promise;
      client_results.push_back(result_promise.get_future());
      std::promise<void> ready_barrier_promise;
      ready_barrier_futures.push_back(ready_barrier_promise.get_future());
      // NOTE(review): a worker that returns early on an error path below
      // never fulfills its result promise, so the result.wait() loop later
      // in this function would block forever -- confirm this is acceptable
      // for a benchmark tool.
      client_threads.emplace_back(
          [&](const int thread_id, const int service_id,
              std::promise<BenchmarkResult> result, std::promise<void> ready) {
            SetThreadName("client" + std::to_string(thread_id) + "/" +
                          std::to_string(service_id));
            ATRACE_NAME("BenchmarkClient::Dispatch");
            auto client =
                BenchmarkClient::Create(GetServicePath(path, service_id));
            if (!client) {
              std::cerr << "Failed to create client for service " << service_id
                        << std::endl;
              return -ENOMEM;
            }
            uint64_t* thread_samples =
                &latency_samples_ns[samples_per_thread * thread_id];
            // Per-thread statistics.
            uint64_t bytes_sent = 0;
            uint64_t time_start_ns;
            uint64_t time_end_ns;
            SchedStats sched_stats;
            // Signal ready and wait for go.
            ready.set_value();
            go_barrier_future.wait();
            // Warmup the scheduler.
            int warmup = ProgramOptions.warmup;
            while (warmup--) {
              for (int i = 0; i < 1000000; i++)
                ;
            }
            sched_stats.Update();
            time_start_ns = GetClockNs();
            int count = ProgramOptions.count;
            while (count--) {
              uint64_t iteration_start_ns = GetClockNs();
              switch (ProgramOptions.opcode) {
                case BenchmarkOps::Nop: {
                  const int ret = client->Nop();
                  if (ret < 0) {
                    std::cerr << "Failed to send nop: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else {
                    VLOG(1) << "Success";
                  }
                  break;
                }
                case BenchmarkOps::Read: {
                  const int ret = client->Read(receive_buffer.data(),
                                               ProgramOptions.blocksize);
                  if (ret < 0) {
                    std::cerr << "Failed to read: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else if (ret != ProgramOptions.blocksize) {
                    std::cerr << "Expected ret=" << ProgramOptions.blocksize
                              << "; actual ret=" << ret << std::endl;
                    return -EINVAL;
                  } else {
                    VLOG(1) << "Success";
                    bytes_sent += ret;
                  }
                  break;
                }
                case BenchmarkOps::Write: {
                  const int ret =
                      client->Write(send_buffer.data(), send_buffer.size());
                  if (ret < 0) {
                    std::cerr << "Failed to write: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else if (ret != ProgramOptions.blocksize) {
                    std::cerr << "Expected ret=" << ProgramOptions.blocksize
                              << "; actual ret=" << ret << std::endl;
                    return -EINVAL;
                  } else {
                    VLOG(1) << "Success";
                    bytes_sent += ret;
                  }
                  break;
                }
                case BenchmarkOps::Echo: {
                  const int ret = client->Echo(
                      send_buffer.data(), send_buffer.size(),
                      receive_buffer.data(), receive_buffer.size());
                  if (ret < 0) {
                    std::cerr << "Failed to echo: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else if (ret != ProgramOptions.blocksize) {
                    std::cerr << "Expected ret=" << ProgramOptions.blocksize
                              << "; actual ret=" << ret << std::endl;
                    return -EINVAL;
                  } else {
                    VLOG(1) << "Success";
                    // Echo moves the payload both directions.
                    bytes_sent += ret * 2;
                  }
                  break;
                }
                case BenchmarkOps::Stats: {
                  std::tuple<uint64_t, uint64_t, SchedStats> stats;
                  const int ret = client->Stats(&stats);
                  if (ret < 0) {
                    std::cerr << "Failed to get stats: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else {
                    VLOG(1) << "Success";
                    std::cerr
                        << "Round trip: receive_time_ns=" << std::get<0>(stats)
                        << " reply_time_ns=" << std::get<1>(stats)
                        << " cpu_time_s=" << std::get<2>(stats).cpu_time_s()
                        << " wait_s=" << std::get<2>(stats).wait_s()
                        << std::endl;
                  }
                  break;
                }
                case BenchmarkOps::WriteVector: {
                  const int ret = client->WriteVector(
                      WrapBuffer(send_buffer.data(), ProgramOptions.blocksize));
                  if (ret < 0) {
                    std::cerr << "Failed to write vector: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else {
                    VLOG(1) << "Success";
                    bytes_sent += ret;
                  }
                  break;
                }
                case BenchmarkOps::EchoVector: {
                  // Reuse one response buffer per thread to avoid a fresh
                  // allocation on every iteration.
                  thread_local BufferWrapper<std::vector<
                      uint8_t, DefaultInitializationAllocator<uint8_t>>>
                      response_buffer;
                  const int ret = client->EchoVector(
                      WrapBuffer(send_buffer.data(), ProgramOptions.blocksize),
                      &response_buffer);
                  if (ret < 0) {
                    std::cerr << "Failed to echo vector: " << strerror(-ret)
                              << std::endl;
                    return ret;
                  } else {
                    VLOG(1) << "Success";
                    bytes_sent += send_buffer.size() + response_buffer.size();
                  }
                  break;
                }
                case BenchmarkOps::Quit: {
                  const int ret = client->Quit();
                  if (ret < 0 && ret != -ESHUTDOWN) {
                    std::cerr << "Failed to send quit: " << strerror(-ret);
                    return ret;
                  } else {
                    VLOG(1) << "Success";
                  }
                  break;
                }
                default:
                  std::cerr
                      << "Invalid client operation: " << ProgramOptions.opcode
                      << std::endl;
                  return -EINVAL;
              }
              uint64_t iteration_end_ns = GetClockNs();
              uint64_t iteration_delta_ns =
                  iteration_end_ns - iteration_start_ns;
              // count runs down, so samples land in reverse order inside this
              // thread's slice; order is irrelevant to the statistics.
              thread_samples[count] = iteration_delta_ns;
              // Report iterations slower than 10ms along with fresh
              // scheduler deltas to help diagnose stalls.
              if (iteration_delta_ns > (kNanosPerSecond / 100)) {
                SchedStats stats = sched_stats;
                stats.Update();
                std::cerr << "Thread " << thread_id << " iteration_delta_s="
                          << (static_cast<double>(iteration_delta_ns) /
                              kNanosPerSecond)
                          << " " << stats.cpu_time_s() << " " << stats.wait_s()
                          << std::endl;
              }
            }
            time_end_ns = GetClockNs();
            sched_stats.Update();
            const double time_delta_s =
                static_cast<double>(time_end_ns - time_start_ns) /
                kNanosPerSecond;
            total_bytes += bytes_sent;
            total_time_ns += time_end_ns - time_start_ns;
            result.set_value(
                {thread_id, service_id, time_delta_s, bytes_sent, sched_stats});
            done_barrier_future.wait();
            return 0;
          },
          thread_id_counter++, service_id_counter, std::move(result_promise),
          std::move(ready_barrier_promise));
    }
    service_id_counter++;
  }
  // Wait for workers to be ready.
  std::cerr << "Waiting for workers to be ready..." << std::endl;
  for (auto& ready : ready_barrier_futures)
    ready.wait();
  // Signal workers to go.
  std::cerr << "Kicking off benchmark." << std::endl;
  go_barrier_promise.set_value();
  // Wait for all the worker threads to finish.
  for (auto& result : client_results)
    result.wait();
  // Report worker thread results.
  for (auto& result : client_results) {
    BenchmarkResult benchmark_result = result.get();
    std::cerr << std::fixed << "Thread " << benchmark_result.thread_id
              << " service " << benchmark_result.service_id << ":" << std::endl;
    std::cerr << "\t " << benchmark_result.bytes_sent << " bytes in "
              << benchmark_result.time_delta_s << " seconds ("
              << std::setprecision(0) << (benchmark_result.bytes_sent / 1024.0 /
                                          benchmark_result.time_delta_s)
              << " K/s; " << std::setprecision(3)
              << (ProgramOptions.count / benchmark_result.time_delta_s)
              << " txn/s; " << std::setprecision(9)
              << (benchmark_result.time_delta_s / ProgramOptions.count)
              << " s/txn)" << std::endl;
    std::cerr << "\tStats: " << benchmark_result.sched_stats.cpu_time_s() << " "
              << (benchmark_result.sched_stats.cpu_time_s() /
                  ProgramOptions.count)
              << " " << benchmark_result.sched_stats.wait_s() << " "
              << (benchmark_result.sched_stats.wait_s() / ProgramOptions.count)
              << " " << benchmark_result.sched_stats.timeslices() << std::endl;
  }
  // Signal worker threads to exit.
  done_barrier_promise.set_value();
  // Wait for the worker threads to exit.
  for (auto& thread : client_threads) {
    thread.join();
  }
  // Report aggregate results.
  const int total_threads = ProgramOptions.threads * ProgramOptions.instances;
  const int iterations = ProgramOptions.count;
  const double total_time_s =
      static_cast<double>(total_time_ns) / kNanosPerSecond;
  // This is about how much wall time it took to completely transfer all the
  // payloads.
  const double average_time_s = total_time_s / total_threads;
  const uint64_t min_sample_time_ns =
      *std::min_element(latency_samples_ns.begin(), latency_samples_ns.end());
  const double min_sample_time_s =
      static_cast<double>(min_sample_time_ns) / kNanosPerSecond;
  const uint64_t max_sample_time_ns =
      *std::max_element(latency_samples_ns.begin(), latency_samples_ns.end());
  const double max_sample_time_s =
      static_cast<double>(max_sample_time_ns) / kNanosPerSecond;
  const double total_sample_time_s =
      std::accumulate(latency_samples_ns.begin(), latency_samples_ns.end(), 0.0,
                      [](double s, uint64_t ns) {
                        return s + static_cast<double>(ns) / kNanosPerSecond;
                      });
  const double average_sample_time_s =
      total_sample_time_s / latency_samples_ns.size();
  const double sum_of_squared_deviations = std::accumulate(
      latency_samples_ns.begin(), latency_samples_ns.end(), 0.0,
      [&](double s, uint64_t ns) {
        const double delta =
            static_cast<double>(ns) / kNanosPerSecond - average_sample_time_s;
        return s + delta * delta;
      });
  const double variance = sum_of_squared_deviations / latency_samples_ns.size();
  const double standard_deviation = std::sqrt(variance);
  const int num_buckets = 200;
  const uint64_t sample_range_ns = max_sample_time_ns - min_sample_time_ns;
  const uint64_t ns_per_bucket = sample_range_ns / num_buckets;
  std::array<uint64_t, num_buckets> sample_buckets = {{0}};
  // Count samples in each bucket range. The +1 keeps the division safe when
  // all samples are identical (ns_per_bucket == 0) and keeps the index
  // strictly below num_buckets.
  for (uint64_t sample_ns : latency_samples_ns) {
    sample_buckets[(sample_ns - min_sample_time_ns) / (ns_per_bucket + 1)] += 1;
  }
  // Calculate population percentiles.
  const uint64_t percent_50 =
      static_cast<uint64_t>(latency_samples_ns.size() * 0.5);
  const uint64_t percent_90 =
      static_cast<uint64_t>(latency_samples_ns.size() * 0.9);
  const uint64_t percent_95 =
      static_cast<uint64_t>(latency_samples_ns.size() * 0.95);
  const uint64_t percent_99 =
      static_cast<uint64_t>(latency_samples_ns.size() * 0.99);
  uint64_t sample_count = 0;
  // Initialized to 0.0: with very few samples (e.g. the default --count 1)
  // the thresholds above can be zero and never crossed below, which
  // previously left these variables uninitialized (undefined behavior when
  // printed).
  double latency_50th_percentile_s = 0.0, latency_90th_percentile_s = 0.0,
         latency_95th_percentile_s = 0.0, latency_99th_percentile_s = 0.0;
  for (int i = 0; i < num_buckets; i++) {
    // Report the midpoint of the bucket range as the value of the
    // corresponding percentile.
    const double bucket_midpoint_time_s =
        (ns_per_bucket * i + 0.5 * ns_per_bucket + min_sample_time_ns) /
        kNanosPerSecond;
    if (sample_count < percent_50 &&
        (sample_count + sample_buckets[i]) >= percent_50) {
      latency_50th_percentile_s = bucket_midpoint_time_s;
    }
    if (sample_count < percent_90 &&
        (sample_count + sample_buckets[i]) >= percent_90) {
      latency_90th_percentile_s = bucket_midpoint_time_s;
    }
    if (sample_count < percent_95 &&
        (sample_count + sample_buckets[i]) >= percent_95) {
      latency_95th_percentile_s = bucket_midpoint_time_s;
    }
    if (sample_count < percent_99 &&
        (sample_count + sample_buckets[i]) >= percent_99) {
      latency_99th_percentile_s = bucket_midpoint_time_s;
    }
    sample_count += sample_buckets[i];
  }
  std::cerr << std::fixed << "Total throughput over " << total_threads
            << " threads:\n\t " << total_bytes << " bytes in " << average_time_s
            << " seconds (" << std::setprecision(0)
            << (total_bytes / 1024.0 / average_time_s) << " K/s; "
            << std::setprecision(3)
            << (iterations * total_threads / average_time_s)
            << std::setprecision(9) << " txn/s; "
            << (average_time_s / (iterations * total_threads)) << " s/txn)"
            << std::endl;
  std::cerr << "Sample statistics: " << std::endl;
  std::cerr << total_sample_time_s << " s total sample time" << std::endl;
  std::cerr << average_sample_time_s << " s avg" << std::endl;
  std::cerr << standard_deviation << " s std dev" << std::endl;
  std::cerr << min_sample_time_s << " s min" << std::endl;
  std::cerr << max_sample_time_s << " s max" << std::endl;
  std::cerr << "Latency percentiles:" << std::endl;
  std::cerr << "50th: " << latency_50th_percentile_s << " s" << std::endl;
  std::cerr << "90th: " << latency_90th_percentile_s << " s" << std::endl;
  std::cerr << "95th: " << latency_95th_percentile_s << " s" << std::endl;
  std::cerr << "99th: " << latency_99th_percentile_s << " s" << std::endl;
  std::cout << total_time_ns << " " << std::fixed << std::setprecision(9)
            << average_sample_time_s << " " << std::fixed
            << std::setprecision(9) << standard_deviation << std::endl;
  return 0;
}
// Prints command line usage help to stdout. Always returns -1 so callers can
// write `return Usage(argv[0]);` directly from main().
// Fixed the "Sepcify" typos in the user-facing help text.
int Usage(const std::string& command_name) {
  // clang-format off
  std::cout << "Usage: " << command_name << " [options]" << std::endl;
  std::cout << "\t--verbose : Use verbose messages." << std::endl;
  std::cout << "\t--service <endpoint path> : Start service at the given path." << std::endl;
  std::cout << "\t--client <endpoint path> : Start client to the given path." << std::endl;
  std::cout << "\t--op <read | write | echo> : Specify client operation mode." << std::endl;
  std::cout << "\t--bs <block size bytes> : Specify block size to use." << std::endl;
  std::cout << "\t--count <count> : Specify number of transactions to make." << std::endl;
  std::cout << "\t--instances <count> : Specify number of service instances." << std::endl;
  std::cout << "\t--threads <count> : Specify number of threads per instance." << std::endl;
  std::cout << "\t--timeout <timeout ms | -1> : Timeout to wait for services." << std::endl;
  std::cout << "\t--trace : Enable systrace logging." << std::endl;
  std::cout << "\t--warmup <iterations> : Busy loops before running benchmarks." << std::endl;
  // clang-format on
  return -1;
}
} // anonymous namespace
// Program entry point. Parses long options into the global ProgramOptions,
// then dispatches to ServiceCommand(), ClientCommand(), or Usage().
int main(int argc, char** argv) {
logging::LoggingSettings logging_settings;
logging_settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
logging::InitLogging(logging_settings);
int getopt_code;
int option_index;
std::string option = "";
std::string command = "";
std::string command_argument = "";
bool tracing_enabled = false;
// Process command line options.
while ((getopt_code =
getopt_long(argc, argv, "", long_options, &option_index)) != -1) {
// NOTE(review): option_index is only meaningful when getopt_code == 0;
// for unrecognized options ('?') this reads a stale/uninitialized index,
// though the switch below ignores those codes -- confirm.
option = long_options[option_index].name;
VLOG(1) << "option=" << option;
switch (getopt_code) {
case 0:
if (option == kOptionVerbose) {
ProgramOptions.verbose = true;
// Min level -1 enables the VLOG(1) messages used throughout this file.
logging::SetMinLogLevel(-1);
} else if (option == kOptionOpcode) {
ParseOpcodeOption(optarg);
} else if (option == kOptionBlocksize) {
ProgramOptions.blocksize = std::stoi(optarg);
if (ProgramOptions.blocksize < 0) {
std::cerr << "Invalid blocksize argument: "
<< ProgramOptions.blocksize << std::endl;
return -EINVAL;
}
} else if (option == kOptionCount) {
ProgramOptions.count = std::stoi(optarg);
if (ProgramOptions.count < 1) {
std::cerr << "Invalid count argument: " << ProgramOptions.count
<< std::endl;
return -EINVAL;
}
} else if (option == kOptionThreads) {
ProgramOptions.threads = std::stoi(optarg);
if (ProgramOptions.threads < 1) {
std::cerr << "Invalid threads argument: " << ProgramOptions.threads
<< std::endl;
return -EINVAL;
}
} else if (option == kOptionInstances) {
ProgramOptions.instances = std::stoi(optarg);
if (ProgramOptions.instances < 1) {
std::cerr << "Invalid instances argument: "
<< ProgramOptions.instances << std::endl;
return -EINVAL;
}
} else if (option == kOptionTimeout) {
ProgramOptions.timeout = std::stoi(optarg);
} else if (option == kOptionTrace) {
tracing_enabled = true;
} else if (option == kOptionWarmup) {
ProgramOptions.warmup = std::stoi(optarg);
} else {
// Any other long option selects the command mode (--service/--client);
// the last one on the command line wins.
command = option;
if (optarg)
command_argument = optarg;
}
break;
}
}
// Setup ATRACE/systrace based on command line.
atrace_setup();
atrace_set_tracing_enabled(tracing_enabled);
VLOG(1) << "command=" << command << " command_argument=" << command_argument;
if (command == "") {
return Usage(argv[0]);
} else if (command == kOptionService) {
return ServiceCommand(command_argument);
} else if (command == kOptionClient) {
return ClientCommand(command_argument);
} else {
return Usage(argv[0]);
}
}