[uapp][trace-benchmark] Commonize benchmark handling, improve fidelity

Treat "notrace" and "disabled" tests the same as "normal" tests.

Also rerun each benchmark multiple times and report min, max, and
average, to cope with excessive randomness in individual results.
Each run starts and stops the trace engine via per-run setup/teardown
thunks, and Stop() now waits on a trace observer event until the engine
reaches TRACE_STOPPED, so successive runs don't race engine shutdown.
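
For illustration, the per-benchmark output now looks roughly like the
following (the benchmark name and timing values here are invented):

* oneshot, 16MB buffer: TRACE_DURATION_BEGIN macro with 0 arguments ...
  - warm-up: 10 iterations in 123.456 us, 12.346 us per iteration
  - run: 10 test runs, 100000 iterations per run
  - total (usec): min: 10123.456, max: 11234.567, ave: 10678.901
  - per-iteration (usec): min: 0.101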

Tested: trace-benchmark

Change-Id: I0fadfd3845d2038fb0cfc93a9b9eff93d6c373e9
diff --git a/system/uapp/trace-benchmark/benchmarks.cpp b/system/uapp/trace-benchmark/benchmarks.cpp
index 97ade91..550f93c 100644
--- a/system/uapp/trace-benchmark/benchmarks.cpp
+++ b/system/uapp/trace-benchmark/benchmarks.cpp
@@ -34,36 +34,20 @@
             // is a stress test so all the app is doing is filling the trace
             // buffer. :-)
             async::Loop loop(&kAsyncLoopConfigNoAttachToThread);
-            BenchmarkHandler handler(&loop, spec_->name, spec_->mode,
-                                     spec_->buffer_size);
+            BenchmarkHandler handler(&loop, spec_->mode, spec_->buffer_size);
 
             loop.StartThread("trace-engine loop", nullptr);
-            handler.Start();
 
-            RunAndMeasure(name, spec_->num_iterations, benchmark);
+            RunAndMeasure(name, spec_->name, spec_->num_iterations, benchmark,
+                          [&handler] () { handler.Start(); },
+                          [&handler] () { handler.Stop(); });
 
-            // Acquire the context before we stop. We can't after we stop
-            // as the context has likely been released (no more
-            // references).
-            trace::internal::trace_buffer_header header;
-            {
-                auto context = trace::TraceProlongedContext::Acquire();
-                trace_stop_engine(ZX_OK);
-                trace_context_snapshot_buffer_header(context.get(), &header);
-            }
-            if (handler.mode() == TRACE_BUFFERING_MODE_ONESHOT) {
-                ZX_DEBUG_ASSERT(header.wrapped_count == 0);
-            } else {
-                printf("Trace buffer wrapped %u times, %" PRIu64 " records dropped\n",
-                       header.wrapped_count, header.num_records_dropped);
-            }
-
+            loop.Quit();
             loop.JoinThreads();
         } else {
             // For the disabled benchmarks we just use the default number
             // of iterations.
-            ZX_DEBUG_ASSERT(spec_ == nullptr);
-            RunAndMeasure(name, benchmark);
+            RunAndMeasure(name, spec_->name, benchmark, [](){}, [](){});
         }
     }
 
@@ -191,13 +175,15 @@
 } // namespace
 
 void RunTracingDisabledBenchmarks() {
-    puts("\nRunning benchmarks with tracing disabled...\n");
-    RunBenchmarks(false, nullptr);
+    static const BenchmarkSpec spec = {
+        "disabled",
+        TRACE_BUFFERING_MODE_ONESHOT, // unused
+        0,
+        kDefaultRunIterations,
+    };
+    RunBenchmarks(false, &spec);
 }
 
 void RunTracingEnabledBenchmarks(const BenchmarkSpec* spec) {
-    // No trailing \n on purpose. The extra blank line is provided by
-    // BenchmarkHandler.Start().
-    puts("\nRunning benchmarks with tracing enabled...");
     RunBenchmarks(true, spec);
 }
diff --git a/system/uapp/trace-benchmark/benchmarks_ntrace.cpp b/system/uapp/trace-benchmark/benchmarks_ntrace.cpp
index 8a47cda..3b37e74 100644
--- a/system/uapp/trace-benchmark/benchmarks_ntrace.cpp
+++ b/system/uapp/trace-benchmark/benchmarks_ntrace.cpp
@@ -11,34 +11,53 @@
 
 #include "runner.h"
 
+static void NullSetup() {}
+static void NullTeardown() {}
+
 void RunNoTraceBenchmarks() {
-    puts("Running benchmarks with NTRACE...\n");
+    RunAndMeasure(
+        "TRACE_ENABLED", "NTRACE",
+        [] {
+             ZX_DEBUG_ASSERT(!TRACE_ENABLED());
+        },
+        NullSetup, NullTeardown);
 
-    RunAndMeasure("TRACE_ENABLED with NTRACE", [] {
-        ZX_DEBUG_ASSERT(!TRACE_ENABLED());
-    });
+    RunAndMeasure(
+        "TRACE_CATEGORY_ENABLED", "NTRACE",
+        [] {
+             ZX_DEBUG_ASSERT(!TRACE_CATEGORY_ENABLED("+enabled"));
+        },
+        NullSetup, NullTeardown);
 
-    RunAndMeasure("TRACE_CATEGORY_ENABLED with NTRACE", [] {
-        ZX_DEBUG_ASSERT(!TRACE_CATEGORY_ENABLED("+enabled"));
-    });
+    RunAndMeasure(
+        "TRACE_DURATION_BEGIN macro with 0 arguments", "NTRACE",
+        [] {
+             TRACE_DURATION_BEGIN("+enabled", "name");
+        },
+        NullSetup, NullTeardown);
 
-    RunAndMeasure("TRACE_DURATION_BEGIN macro with 0 arguments with NTRACE", [] {
-        TRACE_DURATION_BEGIN("+enabled", "name");
-    });
+    RunAndMeasure(
+        "TRACE_DURATION_BEGIN macro with 1 int32 argument", "NTRACE",
+        [] {
+             TRACE_DURATION_BEGIN("+enabled", "name",
+                                  "k1", 1);
+        },
+        NullSetup, NullTeardown);
 
-    RunAndMeasure("TRACE_DURATION_BEGIN macro with 1 int32 argument with NTRACE", [] {
-        TRACE_DURATION_BEGIN("+enabled", "name",
-                             "k1", 1);
-    });
+    RunAndMeasure(
+        "TRACE_DURATION_BEGIN macro with 4 int32 arguments", "NTRACE",
+        [] {
+             TRACE_DURATION_BEGIN("+enabled", "name",
+                                  "k1", 1, "k2", 2, "k3", 3, "k4", 4);
+        },
+        NullSetup, NullTeardown);
 
-    RunAndMeasure("TRACE_DURATION_BEGIN macro with 4 int32 arguments with NTRACE", [] {
-        TRACE_DURATION_BEGIN("+enabled", "name",
-                             "k1", 1, "k2", 2, "k3", 3, "k4", 4);
-    });
-
-    RunAndMeasure("TRACE_DURATION_BEGIN macro with 8 int32 arguments with NTRACE", [] {
-        TRACE_DURATION_BEGIN("+enabled", "name",
-                             "k1", 1, "k2", 2, "k3", 3, "k4", 4,
-                             "k5", 5, "k6", 6, "k7", 7, "k8", 8);
-    });
+    RunAndMeasure(
+        "TRACE_DURATION_BEGIN macro with 8 int32 arguments", "NTRACE",
+        [] {
+             TRACE_DURATION_BEGIN("+enabled", "name",
+                                  "k1", 1, "k2", 2, "k3", 3, "k4", 4,
+                                  "k5", 5, "k6", 6, "k7", 7, "k8", 8);
+        },
+        NullSetup, NullTeardown);
 }
diff --git a/system/uapp/trace-benchmark/handler.h b/system/uapp/trace-benchmark/handler.h
index e325ff9..88f0781 100644
--- a/system/uapp/trace-benchmark/handler.h
+++ b/system/uapp/trace-benchmark/handler.h
@@ -11,17 +11,33 @@
 
 #include <fbl/array.h>
 #include <lib/async-loop/cpp/loop.h>
-
 #include <trace/handler.h>
+#include <lib/zx/event.h>
 
 class BenchmarkHandler : public trace::TraceHandler {
 public:
-    BenchmarkHandler(async::Loop* loop, const char* name,
-                     trace_buffering_mode_t mode, size_t buffer_size)
+    static constexpr int kWaitStoppedTimeoutSeconds = 10;
+
+    BenchmarkHandler(async::Loop* loop, trace_buffering_mode_t mode,
+                     size_t buffer_size)
         : loop_(loop),
-          name_(name),
           mode_(mode),
           buffer_(new uint8_t[buffer_size], buffer_size) {
+        auto status = zx::event::create(0u, &observer_event_);
+        ZX_DEBUG_ASSERT_MSG(status == ZX_OK,
+                            "zx::event::create returned %s\n",
+                            zx_status_get_string(status));
+        status = trace_register_observer(observer_event_.get());
+        ZX_DEBUG_ASSERT_MSG(status == ZX_OK,
+                            "trace_register_observer returned %s\n",
+                            zx_status_get_string(status));
+    }
+
+    ~BenchmarkHandler() {
+        auto status = trace_unregister_observer(observer_event_.get());
+        ZX_DEBUG_ASSERT_MSG(status == ZX_OK,
+                            "trace_unregister_observer returned %s\n",
+                            zx_status_get_string(status));
     }
 
     trace_buffering_mode_t mode() const { return mode_; }
@@ -30,9 +46,46 @@
         zx_status_t status = trace_start_engine(loop_->dispatcher(),
                                                 this, mode_,
                                                 buffer_.get(), buffer_.size());
-        ZX_DEBUG_ASSERT(status == ZX_OK);
+        ZX_DEBUG_ASSERT_MSG(status == ZX_OK,
+                            "trace_start_engine returned %s\n",
+                            zx_status_get_string(status));
+        ZX_DEBUG_ASSERT(trace_state() == TRACE_STARTED);
+        observer_event_.signal(ZX_EVENT_SIGNALED, 0u);
+        trace_notify_observer_updated(observer_event_.get());
+    }
 
-        printf("\nTrace with benchmark spec \"%s\" started\n", name_);
+    void Stop() {
+        // Acquire the context before we stop. We can't after we stop
+        // as the context has likely been released (no more
+        // references).
+        trace::internal::trace_buffer_header header;
+        {
+            auto context = trace::TraceProlongedContext::Acquire();
+            auto status = trace_stop_engine(ZX_OK);
+            ZX_DEBUG_ASSERT_MSG(status == ZX_OK,
+                                "trace_stop_engine returned %s\n",
+                                zx_status_get_string(status));
+            trace_context_snapshot_buffer_header(context.get(), &header);
+        }
+
+        // Tracing hasn't actually stopped yet. It's stopping, but that won't
+        // complete until all context references are gone (which they are),
+        // and the engine has processed that fact (which may not have
+        // happened yet).
+        while (trace_state() != TRACE_STOPPED) {
+            auto status = observer_event_.wait_one(
+                ZX_EVENT_SIGNALED,
+                zx::deadline_after(zx::sec(kWaitStoppedTimeoutSeconds)),
+                nullptr);
+            ZX_DEBUG_ASSERT_MSG(status == ZX_OK,
+                                "observer_event_.wait_one returned %s\n",
+                                zx_status_get_string(status));
+            observer_event_.signal(ZX_EVENT_SIGNALED, 0u);
+        }
+
+        if (mode_ == TRACE_BUFFERING_MODE_ONESHOT) {
+            ZX_DEBUG_ASSERT(header.wrapped_count == 0);
+        }
     }
 
 private:
@@ -44,8 +97,13 @@
     void TraceStopped(async_dispatcher_t* async,
                       zx_status_t disposition,
                       size_t buffer_bytes_written) override {
-        printf("Trace stopped, disposition = %s\n",
-               zx_status_get_string(disposition));
+        // This is noise when the status is ZX_OK, so only print on error.
+        // There's also no point in printing for ZX_ERR_NO_MEMORY, as that
+        // information can be determined from the number of records dropped.
+        if (disposition != ZX_OK && disposition != ZX_ERR_NO_MEMORY) {
+            printf("WARNING: Trace stopped, disposition = %s\n",
+                   zx_status_get_string(disposition));
+        }
 
         if (mode_ == TRACE_BUFFERING_MODE_STREAMING) {
             ZX_DEBUG_ASSERT(disposition == ZX_OK ||
@@ -57,8 +115,6 @@
             // any records.
             ZX_DEBUG_ASSERT(disposition == ZX_OK);
         }
-
-        loop_->Quit();
     }
 
     void NotifyBufferFull(uint32_t wrapped_count,
@@ -74,7 +130,7 @@
     }
 
     async::Loop* const loop_;
-    const char* const name_;
     const trace_buffering_mode_t mode_;
     fbl::Array<uint8_t> const buffer_;
+    zx::event observer_event_;
 };
diff --git a/system/uapp/trace-benchmark/main.cpp b/system/uapp/trace-benchmark/main.cpp
index afd92ff..c697884 100644
--- a/system/uapp/trace-benchmark/main.cpp
+++ b/system/uapp/trace-benchmark/main.cpp
@@ -26,9 +26,8 @@
 
     static const BenchmarkSpec specs[] = {
         {
-            // The buffer is not allowed to fill in oneshot mode, so there's
-            // no use in reporting the buffer size in the name here.
-            "oneshot",
+            // Note: The buffer is not allowed to fill in oneshot mode.
+            "oneshot, 16MB buffer",
             TRACE_BUFFERING_MODE_ONESHOT,
             kLargeBufferSizeBytes,
             kDefaultRunIterations,
diff --git a/system/uapp/trace-benchmark/runner.h b/system/uapp/trace-benchmark/runner.h
index 14b33e0..9274b0d 100644
--- a/system/uapp/trace-benchmark/runner.h
+++ b/system/uapp/trace-benchmark/runner.h
@@ -5,14 +5,22 @@
 #pragma once
 
 #include <stdio.h>
+#include <fbl/function.h>
 
 #include <zircon/syscalls.h>
+#include <lib/zx/time.h>
 
-static constexpr unsigned kWarmUpIterations = 10;
+constexpr unsigned kWarmUpIterations = 10;
 // N.B. This value can't be so large that the buffer fills in oneshot mode.
 // The benchmark will assert-fail if the buffer fills: Otherwise the benchmark
 // is invalid.
-static constexpr unsigned kDefaultRunIterations = 100000;
+constexpr unsigned kDefaultRunIterations = 100000;
+
+// The number of test runs to do.
+// We do this many runs and report the min, max, and average.
+// There's some variability across runs; reporting the spread helps
+// identify when that's happening and cope with it.
+constexpr unsigned kNumTestRuns = 10;
 
 // Measures how long it takes to run some number of iterations of a closure.
 // Returns a value in microseconds.
@@ -27,23 +35,52 @@
             static_cast<float>(zx_ticks_per_second()));
 }
 
+using thunk = fbl::Function<void ()>;
+
 // Runs a closure repeatedly and prints its timing.
 template <typename T>
-void RunAndMeasure(const char* test_name, unsigned iterations,
-                   const T& closure) {
-    printf("* %s...\n", test_name);
+void RunAndMeasure(const char* test_name, const char* spec_name,
+                   unsigned iterations, const T& closure,
+                   thunk setup, thunk teardown) {
+    printf("\n* %s: %s ...\n", spec_name, test_name);
 
+    setup();
     float warm_up_time = Measure(kWarmUpIterations, closure);
-    printf("  - warm-up: %u iterations in %.1f us, %.3f us per iteration\n",
+    teardown();
+    printf("  - warm-up: %u iterations in %.3f us, %.3f us per iteration\n",
            kWarmUpIterations, warm_up_time, warm_up_time / kWarmUpIterations);
 
-    float run_time = Measure(iterations, closure);
-    printf("  - run: %u iterations in %.1f us, %.3f us per iteration\n",
+    float run_times[kNumTestRuns];
+    for (unsigned i = 0; i < kNumTestRuns; ++i) {
+        setup();
+        run_times[i] = Measure(iterations, closure);
+        teardown();
+        zx::nanosleep(zx::deadline_after(zx::msec(10)));
+    }
+
+    float min = 0, max = 0;
+    float cumulative = 0;
+    for (const auto rt : run_times) {
+        if (min == 0 || min > rt)
+            min = rt;
+        if (max == 0 || max < rt)
+            max = rt;
+        cumulative += rt;
+    }
+    float average = cumulative / kNumTestRuns;
+
+    printf("  - run: %u test runs, %u iterations per run\n",
+           kNumTestRuns, iterations);
+    printf("  - total (usec): min: %.3f, max: %.3f, ave: %.3f\n",
+           min, max, average);
+    printf("  - per-iteration (usec): min: %.3f\n",
            // The static cast is to avoid a "may change value" warning.
-           iterations, run_time, run_time / static_cast<float>(iterations));
+           min / static_cast<float>(iterations));
 }
 
 template <typename T>
-void RunAndMeasure(const char* test_name, const T& closure) {
-    RunAndMeasure(test_name, kDefaultRunIterations, closure);
+void RunAndMeasure(const char* test_name, const char* spec_name,
+                   const T& closure, thunk setup, thunk teardown) {
+    RunAndMeasure(test_name, spec_name, kDefaultRunIterations, closure,
+                  fbl::move(setup), fbl::move(teardown));
 }