[counters] Support a "max" mode for kcounters

This adds a new mode for kcounters that treats the value as a maximum
across cores, rather than a sum. These are declared with KCOUNTER_MAX()
and modified with kcounter_max().
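
For example (a usage sketch; the counter name here is illustrative, not one
added by this change):

    KCOUNTER_MAX(max_foo_size, "kernel.foo.max_size");
    ...
    kcounter_max(max_foo_size, size);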

Additionally, adds two new max counters: one for the peak number of live
handles, and one for the largest allocation size requested in cmpct_alloc().

While implementing this, I noticed that `kernel.handles.live` probably
doesn't do quite what it intends to, and filed ZX-3015.

ZX-3092 #comment [counters] Support a "max" mode for kcounters
ZX-3105 #comment [counters] Support a "max" mode for kcounters

Test: CQ, manual inspection of `k counters view`
Change-Id: I556fe74dfcb54e1fd61421fb13b6be65d4687243
diff --git a/kernel/include/kernel/atomic.h b/kernel/include/kernel/atomic.h
index e998c30..feae36d 100644
--- a/kernel/include/kernel/atomic.h
+++ b/kernel/include/kernel/atomic.h
@@ -130,6 +130,10 @@
     return __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
 }
 
+static inline int64_t atomic_load_64_relaxed(volatile int64_t* ptr) {
+    return __atomic_load_n(ptr, __ATOMIC_RELAXED);
+}
+
 static inline void atomic_store_64(volatile int64_t* ptr, int64_t newval) {
     __atomic_store_n(ptr, newval, __ATOMIC_SEQ_CST);
 }
@@ -188,6 +192,12 @@
     return __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED);
 }
 
+static inline bool atomic_cmpxchg_64_relaxed(volatile int64_t* ptr, int64_t* oldval,
+                                             int64_t newval) {
+    return __atomic_compare_exchange_n(ptr, oldval, newval, false,
+                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+}
+
 static inline bool atomic_cmpxchg_u64_relaxed(volatile uint64_t* ptr, uint64_t* oldval,
                                               uint64_t newval) {
     return __atomic_compare_exchange_n(ptr, oldval, newval, false,
diff --git a/kernel/kernel.ld b/kernel/kernel.ld
index 5389416..7db2aa3 100644
--- a/kernel/kernel.ld
+++ b/kernel/kernel.ld
@@ -221,13 +221,12 @@
         KEEP(*(SORT_BY_NAME(.bss.kcounter.*)))
 
         /*
-         * Sanity check that the aggregate size of kcounters_arena
-         * SMP_MAX_CPUS slots for each counter.  The k_counter_desc structs
-         * in .kcounter.desc are 8 bytes each, which matches the size of a
-         * single counter.  (It's only for this sanity check that we need
-         * to care how big k_counter_desc is.)
+         * Sanity check that the aggregate size of kcounters_arena provides
+         * SMP_MAX_CPUS slots for each counter.  The k_counter_desc structs in
+         * .kcounter.desc are now 16 bytes each, twice the size of a single
+         * 8-byte counter slot; hence the 8/16 scaling in the ASSERT below.
          */
-        ASSERT(. - kcounters_arena == SIZEOF(.kcounter.desc) * SMP_MAX_CPUS,
+        ASSERT(. - kcounters_arena == SIZEOF(.kcounter.desc) * 8 * SMP_MAX_CPUS / 16,
                "kcounters_arena size mismatch");
 
         *(.bss*)
diff --git a/kernel/lib/counters/counters.cpp b/kernel/lib/counters/counters.cpp
index 39c6924..276ffb3 100644
--- a/kernel/lib/counters/counters.cpp
+++ b/kernel/lib/counters/counters.cpp
@@ -147,17 +147,23 @@
 static void dump_counter(const k_counter_desc* desc, bool verbose) {
     size_t counter_index = kcounter_index(desc);
 
-    uint64_t sum = 0;
+    uint64_t summary = 0;
     uint64_t values[SMP_MAX_CPUS];
     for (size_t ix = 0; ix != SMP_MAX_CPUS; ++ix) {
         // This value is not atomically consistent, therefore is just
         // an approximation. TODO(cpu): for ARM this might need some magic.
         values[ix] = percpu[ix].counters[counter_index];
-        sum += values[ix];
+        if (desc->type == k_counter_type::max) {
+            if (values[ix] > summary) {
+                summary = values[ix];
+            }
+        } else {
+            summary += values[ix];
+        }
     }
 
-    printf("[%.2zu] %s = %lu\n", counter_index, desc->name, sum);
-    if (sum == 0u) {
+    printf("[%.2zu] %s = %lu\n", counter_index, desc->name, summary);
+    if (summary == 0u) {
         return;
     }
 
diff --git a/kernel/lib/counters/include/lib/counters.h b/kernel/lib/counters/include/lib/counters.h
index 88618e8..04d4c63 100644
--- a/kernel/lib/counters/include/lib/counters.h
+++ b/kernel/lib/counters/include/lib/counters.h
@@ -24,9 +24,16 @@
 // Kernel counters public API:
 // 1- define a new counter.
 //      KCOUNTER(counter_name, "<counter name>");
+//      KCOUNTER_MAX(counter_name, "<counter name>");
 //
 // 2- counters start at zero, increment the counter:
 //      kcounter_add(counter_name, 1);
+//    or
+//      kcounter_max(counter_name, value);
+//
+// By default with KCOUNTER, the `k counters` presentation will calculate a
+// sum() of the per-core values. KCOUNTER_MAX() instead presents the max()
+// of the per-core values.
 //
 //
 // Naming the counters
@@ -40,12 +47,19 @@
 // operations are never used to set their value so they are both
 // imprecise and reflect only the operations on a particular core.
 
+enum class k_counter_type : uint64_t {
+    sum = 1,
+    min = 2,
+    max = 3,
+};
+
 struct k_counter_desc {
     const char* name;
+    k_counter_type type;
 };
-static_assert(sizeof(struct k_counter_desc) ==
-              sizeof(((struct percpu){}).counters[0]),
-              "the kernel.ld ASSERT knows that these sizes match");
+static_assert(
+    sizeof(struct k_counter_desc) == 16,
+    "kernel.ld uses this size to ASSERT that enough space has been reserved in the counters arena");
 
 // Define the descriptor and reserve the arena space for the counters.
 // Because of -fdata-sections, each kcounter_arena_* array will be
@@ -55,11 +69,15 @@
 // slots used for this particular counter (that would have terrible
 // cache effects); it just reserves enough space for counters_init() to
 // dole out in per-CPU chunks.
-#define KCOUNTER(var, name)                                         \
-    __USED int64_t kcounter_arena_##var[SMP_MAX_CPUS]               \
-        __asm__("kcounter." name);                                  \
-    __USED __SECTION("kcountdesc." name)                            \
-    static const struct k_counter_desc var[] = { { name } }
+#define KCOUNTER(var, name)                                                                        \
+    __USED int64_t kcounter_arena_##var[SMP_MAX_CPUS] __asm__("kcounter." name);                   \
+    __USED __SECTION("kcountdesc." name) static const struct k_counter_desc var[] = {              \
+        {name, k_counter_type::sum}}
+
+#define KCOUNTER_MAX(var, name)                                                                    \
+    __USED int64_t kcounter_arena_##var[SMP_MAX_CPUS] __asm__("kcounter." name);                   \
+    __USED __SECTION("kcountdesc." name) static const struct k_counter_desc var[] = {              \
+        {name, k_counter_type::max}}
 
 // Via magic in kernel.ld, all the descriptors wind up in a contiguous
 // array bounded by these two symbols, sorted by name.
@@ -89,3 +107,21 @@
     *kcounter_slot(var) += add;
 #endif
 }
+
+// TODO(travisg|scottmg): Revisit, consider more efficient arm-specific
+// instruction sequence here.
+static inline void kcounter_max(const struct k_counter_desc* var, int64_t value) {
+    int64_t prev_value = atomic_load_64_relaxed(kcounter_slot(var));
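+    // Try to install |value| if it exceeds the slot's current value. A failed
+    // compare-exchange reloads |prev_value| with the latest slot contents, so
+    // the loop exits once another writer has stored a value >= |value|.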
+    while (prev_value < value && !atomic_cmpxchg_64_relaxed(kcounter_slot(var), &prev_value, value))
+        ;
+}
+
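+// Raises |var| to the current value of |other_var|. Note that this reads only
+// the current CPU's slot of |other_var|, not an aggregate across cores.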
+static inline void kcounter_max_counter(const struct k_counter_desc* var,
+                                        const struct k_counter_desc* other_var) {
+    kcounter_max(var, *kcounter_slot(other_var));
+}
diff --git a/kernel/lib/heap/cmpctmalloc/cmpctmalloc.cpp b/kernel/lib/heap/cmpctmalloc/cmpctmalloc.cpp
index 62ace3c..596d32e 100644
--- a/kernel/lib/heap/cmpctmalloc/cmpctmalloc.cpp
+++ b/kernel/lib/heap/cmpctmalloc/cmpctmalloc.cpp
@@ -18,10 +18,11 @@
 #include <kernel/mutex.h>
 #include <kernel/spinlock.h>
 #include <kernel/thread.h>
-#include <vm/vm.h>
+#include <lib/counters.h>
 #include <lib/heap.h>
 #include <platform.h>
 #include <trace.h>
+#include <vm/vm.h>
 
 // Malloc implementation tuned for space.
 //
@@ -115,6 +116,8 @@
 
 #define LOCAL_TRACE 0
 
+KCOUNTER_MAX(max_allocation, "kernel.heap.max_allocation");
+
 // Use HEAP_ENABLE_TESTS to enable internal testing. The tests are not useful
 // when the target system is up. By that time we have done hundreds of allocations
 // already.
@@ -902,6 +905,10 @@
         return NULL;
     }
 
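+    // Track the largest size ever requested, including requests that are
+    // rejected just below for being too large.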
+    kcounter_max(max_allocation, size);
+
     // Large allocations are no longer allowed. See ZX-1318 for details.
     if (size > (HEAP_LARGE_ALLOC_BYTES - sizeof(header_t))) {
         return NULL;
diff --git a/kernel/object/handle.cpp b/kernel/object/handle.cpp
index a6c1cff..219307c 100644
--- a/kernel/object/handle.cpp
+++ b/kernel/object/handle.cpp
@@ -24,6 +24,7 @@
 KCOUNTER(handle_count_made, "kernel.handles.made");
 KCOUNTER(handle_count_duped, "kernel.handles.duped");
 KCOUNTER(handle_count_live, "kernel.handles.live");
+KCOUNTER_MAX(handle_count_max_live, "kernel.handles.max_live");
 
 // Masks for building a Handle's base_value, which ProcessDispatcher
 // uses to create zx_handle_t values.
@@ -119,6 +120,7 @@
         return nullptr;
     kcounter_add(handle_count_made, 1);
     kcounter_add(handle_count_live, 1);
+    kcounter_max_counter(handle_count_max_live, handle_count_live);
     return HandleOwner(new (addr) Handle(ktl::move(dispatcher),
                                          rights, base_value));
 }
@@ -139,6 +141,7 @@
         return nullptr;
     kcounter_add(handle_count_duped, 1);
     kcounter_add(handle_count_live, 1);
+    kcounter_max_counter(handle_count_max_live, handle_count_live);
     return HandleOwner(new (addr) Handle(source, rights, base_value));
 }