[kernel][vm] Keep per cpu counts of vm_page states

The purpose of this change is to improve the performance of
ZX_INFO_KMEM_STATS.  By maintaining counters of vm_page states we no
longer need to walk all pages to compute ZX_INFO_KMEM_STATS.

A previous version of this change used global atomic integers, but
that proved to be too expensive (ZX-3916).  Instead, we use per cpu
counters and access them with preemption disabled.

Move vm_page_state to its own file to prevent a circular dependency
between page and percpu.

Add a couple static helper methods to Percpus to facilitate accessing
per cpu counters like vm_page_counts.

Ran zircon_benchmarks on a NUC to ensure this doesn't significantly
regress performance.  Looked at Channel/WriteRead/64bytes as it seems
to be a bellwether.

Before --

      Mean    Std dev        Min        Max     Median Unit         Mean Mbytes/sec Test case
       562         33        542       6403        555 nanoseconds          108.552 Channel/WriteRead/64bytes
       277         19        263       4391        272 nanoseconds              N/A Channel/WriteRead/64bytes.write
       285         17        274       2680        282 nanoseconds              N/A Channel/WriteRead/64bytes.read

After --

      Mean    Std dev        Min        Max     Median Unit         Mean Mbytes/sec Test case
       571         13        558       6263        569 nanoseconds          106.880 Channel/WriteRead/64bytes
       284         10        275       4458        282 nanoseconds              N/A Channel/WriteRead/64bytes.write
       287          7        279       1805        286 nanoseconds              N/A Channel/WriteRead/64bytes.read

Bug: ZX-833 #comment per cpu counts
Test: booted and ran 'kstats -m'
Change-Id: I18083c4dbc3d0423cc59cf3700a93dc72de8d2e3
diff --git a/zircon/docs/syscalls/object_get_info.md b/zircon/docs/syscalls/object_get_info.md
index 2a18cfc..67410e6 100644
--- a/zircon/docs/syscalls/object_get_info.md
+++ b/zircon/docs/syscalls/object_get_info.md
@@ -608,11 +608,12 @@
 
 *buffer* type: `zx_info_kmem_stats_t[1]`
 
-Returns information about kernel memory usage. It can be expensive to gather.
+Returns information about kernel memory usage.
 
 ```
 typedef struct zx_info_kmem_stats {
     // The total amount of physical memory available to the system.
+    // Note, the values below may not exactly add up to this total.
     size_t total_bytes;
 
     // The amount of unallocated memory.
diff --git a/zircon/kernel/include/kernel/percpu.h b/zircon/kernel/include/kernel/percpu.h
index 7f58450..a6865f3 100644
--- a/zircon/kernel/include/kernel/percpu.h
+++ b/zircon/kernel/include/kernel/percpu.h
@@ -14,6 +14,7 @@
 #include <kernel/timer.h>
 #include <list.h>
 #include <sys/types.h>
+#include <vm/page_state.h>
 #include <zircon/compiler.h>
 
 struct percpu;
@@ -41,7 +42,19 @@
     // Unused argument is so this can be passed to LK_INIT_HOOK() directly.
     static void HeapInit(uint32_t);
 
- private:
+    // Call |func| with the current CPU's percpu struct while preemption is disabled.
+    //
+    // |Func| should accept a |percpu*| and should not block.
+    template <typename Func>
+    static void WithCurrentPreemptDisable(Func&& func);
+
+    // Call |func| once per CPU, passing each CPU's percpu struct, while preemption is disabled.
+    //
+    // |Func| should accept a |percpu*| and should not block.
+    template <typename Func>
+    static void ForEachPreemptDisable(Func&& func);
+
+private:
     // Number of percpu entries.
     static size_t count_;
 
@@ -102,6 +115,12 @@
     // each cpu has a dedicated thread for processing dpcs
     thread_t* dpc_thread;
 
+    // Page state counts are percpu because they change frequently and we don't want to pay for
+    // synchronization. When accessing, be sure to do so with preemption disabled.
+    //
+    // See |Percpus::WithCurrentPreemptDisable| and |Percpus::ForEachPreemptDisable|.
+    vm_page_counts_t vm_page_counts;
+
     // Initialize this percpu object, |cpu_num| will be used to initialize
     // embedded objects.
     void Init(cpu_num_t cpu_num);
@@ -128,3 +147,19 @@
     return &Percpus::Get(arch_curr_cpu_num());
 }
 
+template <typename Func>
+void Percpus::WithCurrentPreemptDisable(Func&& func) {
+    thread_preempt_disable();
+    func(&Percpus::Get(arch_curr_cpu_num()));
+    thread_preempt_reenable();
+}
+
+template <typename Func>
+void Percpus::ForEachPreemptDisable(Func&& func) {
+    thread_preempt_disable();
+    const size_t count = Percpus::Count();
+    for (cpu_num_t cpu_num = 0; cpu_num < count; ++cpu_num) {
+        func(&Percpus::Get(cpu_num));
+    }
+    thread_preempt_reenable();
+}
diff --git a/zircon/kernel/syscalls/object.cpp b/zircon/kernel/syscalls/object.cpp
index 92457fe..b009c50 100644
--- a/zircon/kernel/syscalls/object.cpp
+++ b/zircon/kernel/syscalls/object.cpp
@@ -509,46 +509,54 @@
         // TODO: figure out a better handle to hang this off to and push this copy code into
         // that dispatcher.
 
-        size_t state_count[VM_PAGE_STATE_COUNT_] = {};
-        pmm_count_total_states(state_count);
-
-        size_t total = 0;
+        // |get_count| returns an estimate so the sum of the counts may not equal the total.
+        uint64_t state_count[VM_PAGE_STATE_COUNT_] = {};
         for (uint32_t i = 0; i < VM_PAGE_STATE_COUNT_; i++) {
-            total += state_count[i];
+            state_count[i] = vm_page_t::get_count(vm_page_state(i));
         }
 
-        size_t unused_size = 0;
-        size_t free_heap_bytes = 0;
+        uint64_t unused_size = 0;
+        uint64_t free_heap_bytes = 0;
         heap_get_info(&unused_size, &free_heap_bytes);
 
         // Note that this intentionally uses uint64_t instead of
         // size_t in case we ever have a 32-bit userspace but more
         // than 4GB physical memory.
         zx_info_kmem_stats_t stats = {};
-        stats.total_bytes = total * PAGE_SIZE;
-        size_t other_bytes = stats.total_bytes;
+        stats.total_bytes = pmm_count_total_bytes();
+
+        // Holds the sum of bytes in the broken out states. This sum could be less than the total
+        // because we aren't counting all possible states (e.g. VM_PAGE_STATE_ALLOC). This sum could
+        // be greater than the total because per-state counts are approximate.
+        uint64_t sum_bytes = 0;
 
         stats.free_bytes = state_count[VM_PAGE_STATE_FREE] * PAGE_SIZE;
-        other_bytes -= stats.free_bytes;
+        sum_bytes += stats.free_bytes;
 
         stats.wired_bytes = state_count[VM_PAGE_STATE_WIRED] * PAGE_SIZE;
-        other_bytes -= stats.wired_bytes;
+        sum_bytes += stats.wired_bytes;
 
         stats.total_heap_bytes = state_count[VM_PAGE_STATE_HEAP] * PAGE_SIZE;
-        other_bytes -= stats.total_heap_bytes;
+        sum_bytes += stats.total_heap_bytes;
         stats.free_heap_bytes = free_heap_bytes;
 
         stats.vmo_bytes = state_count[VM_PAGE_STATE_OBJECT] * PAGE_SIZE;
-        other_bytes -= stats.vmo_bytes;
+        sum_bytes += stats.vmo_bytes;
 
         stats.mmu_overhead_bytes = state_count[VM_PAGE_STATE_MMU] * PAGE_SIZE;
-        other_bytes -= stats.mmu_overhead_bytes;
+        sum_bytes += stats.mmu_overhead_bytes;
 
         stats.ipc_bytes = state_count[VM_PAGE_STATE_IPC] * PAGE_SIZE;
-        other_bytes -= stats.ipc_bytes;
+        sum_bytes += stats.ipc_bytes;
 
-        // All other VM_PAGE_STATE_* counts get lumped into other_bytes.
-        stats.other_bytes = other_bytes;
+        // Is there unaccounted memory?
+        if (stats.total_bytes > sum_bytes) {
+            // Everything else gets counted as "other".
+            stats.other_bytes = stats.total_bytes - sum_bytes;
+        } else {
+            // One or more of our per-state counts may have been off. We'll ignore it.
+            stats.other_bytes = 0;
+        }
 
         return single_record_result(
             _buffer, buffer_size, _actual, _avail, &stats, sizeof(stats));
diff --git a/zircon/kernel/vm/include/vm/page.h b/zircon/kernel/vm/include/vm/page.h
index 482342b..326ebf5 100644
--- a/zircon/kernel/vm/include/vm/page.h
+++ b/zircon/kernel/vm/include/vm/page.h
@@ -7,28 +7,12 @@
 
 #pragma once
 
-#include <fbl/algorithm.h>
 #include <list.h>
 #include <stdint.h>
 #include <sys/types.h>
+#include <vm/page_state.h>
 #include <zircon/compiler.h>
 
-enum vm_page_state : uint32_t {
-    VM_PAGE_STATE_FREE = 0,
-    VM_PAGE_STATE_ALLOC,
-    VM_PAGE_STATE_OBJECT,
-    VM_PAGE_STATE_WIRED,
-    VM_PAGE_STATE_HEAP,
-    VM_PAGE_STATE_MMU,   // allocated to serve arch-specific mmu purposes
-    VM_PAGE_STATE_IOMMU, // allocated for platform-specific iommu structures
-    VM_PAGE_STATE_IPC,
-
-    VM_PAGE_STATE_COUNT_
-};
-
-#define VM_PAGE_STATE_BITS 3
-static_assert((1u << VM_PAGE_STATE_BITS) >= VM_PAGE_STATE_COUNT_, "");
-
 // core per page structure allocated at pmm arena creation time
 typedef struct vm_page {
     struct list_node queue_node;
@@ -66,6 +50,16 @@
 
     void set_state(vm_page_state new_state);
 
+    // Return the approximate number of pages in state |state|.
+    //
+    // When called concurrently with |set_state|, the count may be off by a small amount.
+    static uint64_t get_count(vm_page_state state);
+
+    // Add |n| to the count of pages in state |state|.
+    //
+    // Should be used when first constructing pages.
+    static void add_to_initial_count(vm_page_state state, uint64_t n);
+
 } vm_page_t;
 
 // assert that the page structure isn't growing uncontrollably
diff --git a/zircon/kernel/vm/include/vm/page_state.h b/zircon/kernel/vm/include/vm/page_state.h
new file mode 100644
index 0000000..61f1e50
--- /dev/null
+++ b/zircon/kernel/vm/include/vm/page_state.h
@@ -0,0 +1,31 @@
+// Copyright 2019 The Fuchsia Authors
+//
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT
+#pragma once
+
+#include <stdint.h>
+
+// Defines the state of a VM page (|vm_page_t|).
+//
+// Be sure to keep this enum in sync with the definition of |vm_page_t|.
+enum vm_page_state : uint32_t {
+    VM_PAGE_STATE_FREE = 0,
+    VM_PAGE_STATE_ALLOC,
+    VM_PAGE_STATE_OBJECT,
+    VM_PAGE_STATE_WIRED,
+    VM_PAGE_STATE_HEAP,
+    VM_PAGE_STATE_MMU,   // allocated to serve arch-specific mmu purposes
+    VM_PAGE_STATE_IOMMU, // allocated for platform-specific iommu structures
+    VM_PAGE_STATE_IPC,
+
+    VM_PAGE_STATE_COUNT_
+};
+
+#define VM_PAGE_STATE_BITS 3
+static_assert((1u << VM_PAGE_STATE_BITS) >= VM_PAGE_STATE_COUNT_, "");
+
+typedef struct vm_page_counts {
+    int64_t by_state[VM_PAGE_STATE_COUNT_];
+} vm_page_counts_t;
diff --git a/zircon/kernel/vm/include/vm/pmm.h b/zircon/kernel/vm/include/vm/pmm.h
index 480598d5..7ce8757 100644
--- a/zircon/kernel/vm/include/vm/pmm.h
+++ b/zircon/kernel/vm/include/vm/pmm.h
@@ -63,11 +63,6 @@
 // Return amount of physical memory in system, in bytes.
 uint64_t pmm_count_total_bytes();
 
-// Counts the number of pages in every state. For every page in every arena,
-// increments the corresponding VM_PAGE_STATE_*-indexed entry of
-// |state_count|. Does not zero out the entries first.
-void pmm_count_total_states(size_t state_count[VM_PAGE_STATE_COUNT_]) __NONNULL((1));
-
 // virtual to physical
 paddr_t vaddr_to_paddr(const void* va);
 
diff --git a/zircon/kernel/vm/page.cpp b/zircon/kernel/vm/page.cpp
index f456e25..7784f04 100644
--- a/zircon/kernel/vm/page.cpp
+++ b/zircon/kernel/vm/page.cpp
@@ -5,13 +5,15 @@
 // license that can be found in the LICENSE file or at
 // https://opensource.org/licenses/MIT
 
+#include <vm/page.h>
+
 #include <err.h>
 #include <inttypes.h>
+#include <kernel/percpu.h>
 #include <lib/console.h>
 #include <stdio.h>
 #include <string.h>
 #include <trace.h>
-#include <vm/page.h>
 #include <vm/physmap.h>
 #include <vm/pmm.h>
 #include <vm/vm.h>
@@ -48,7 +50,29 @@
     constexpr uint32_t kMask = (1 << VM_PAGE_STATE_BITS) - 1;
     DEBUG_ASSERT_MSG(new_state == (new_state & kMask), "invalid state %u\n", new_state);
 
+    const vm_page_state old_state = vm_page_state(state_priv);
     state_priv = (new_state & kMask);
+
+    Percpus::WithCurrentPreemptDisable(
+        [&old_state, &new_state](percpu* p) {
+            p->vm_page_counts.by_state[old_state] -= 1;
+            p->vm_page_counts.by_state[new_state] += 1;
+        });
+}
+
+uint64_t vm_page::get_count(vm_page_state state) {
+    int64_t result = 0;
+    Percpus::ForEachPreemptDisable([&state, &result](percpu* p) {
+        result += p->vm_page_counts.by_state[state];
+    });
+    return result >= 0 ? result : 0;
+}
+
+void vm_page::add_to_initial_count(vm_page_state state, uint64_t n) {
+    Percpus::WithCurrentPreemptDisable(
+        [&state, &n](percpu* p) {
+            p->vm_page_counts.by_state[state] += n;
+        });
 }
 
 static int cmd_vm_page(int argc, const cmd_args* argv, uint32_t flags) {
diff --git a/zircon/kernel/vm/pmm.cpp b/zircon/kernel/vm/pmm.cpp
index 5f5002f..7635266 100644
--- a/zircon/kernel/vm/pmm.cpp
+++ b/zircon/kernel/vm/pmm.cpp
@@ -107,10 +107,6 @@
     return pmm_node.CountTotalBytes();
 }
 
-void pmm_count_total_states(size_t state_count[VM_PAGE_STATE_COUNT_]) {
-    pmm_node.CountTotalStates(state_count);
-}
-
 static void pmm_dump_timer(struct timer* t, zx_time_t now, void*) {
     zx_time_t deadline = zx_time_add_duration(now, ZX_SEC(1));
     timer_set_oneshot(t, deadline, &pmm_dump_timer, nullptr);
diff --git a/zircon/kernel/vm/pmm_arena.cpp b/zircon/kernel/vm/pmm_arena.cpp
index b70d48a1..3d2ecaa 100644
--- a/zircon/kernel/vm/pmm_arena.cpp
+++ b/zircon/kernel/vm/pmm_arena.cpp
@@ -54,6 +54,9 @@
 
     page_array_ = (vm_page_t*)raw_page_array;
 
+    // we've just constructed |page_count| pages in the state VM_PAGE_STATE_FREE
+    vm_page::add_to_initial_count(VM_PAGE_STATE_FREE, page_count);
+
     // compute the range of the array that backs the array itself
     size_t array_start_index = (PAGE_ALIGN(range.pa) - info_.base) / PAGE_SIZE;
     size_t array_end_index = array_start_index + page_array_size / PAGE_SIZE;
@@ -73,7 +76,6 @@
         if (i >= array_start_index && i < array_end_index) {
             p.set_state(VM_PAGE_STATE_WIRED);
         } else {
-            p.set_state(VM_PAGE_STATE_FREE);
             list_add_tail(&list, &p.queue_node);
         }
     }
diff --git a/zircon/kernel/vm/pmm_node.cpp b/zircon/kernel/vm/pmm_node.cpp
index 24ea4c2..fe3696b 100644
--- a/zircon/kernel/vm/pmm_node.cpp
+++ b/zircon/kernel/vm/pmm_node.cpp
@@ -326,15 +326,6 @@
     return arena_cumulative_size_;
 }
 
-void PmmNode::CountTotalStates(uint64_t state_count[VM_PAGE_STATE_COUNT_]) const {
-    // TODO(MG-833): This is extremely expensive, holding a global lock
-    // and touching every page/arena. We should keep a running count instead.
-    Guard<fbl::Mutex> guard{&lock_};
-    for (auto& a : arena_list_) {
-        a.CountStates(state_count);
-    }
-}
-
 void PmmNode::DumpFree() const TA_NO_THREAD_SAFETY_ANALYSIS {
     auto megabytes_free = CountFreePages() / 256u;
     printf(" %zu free MBs\n", megabytes_free);
diff --git a/zircon/kernel/vm/pmm_node.h b/zircon/kernel/vm/pmm_node.h
index a57e7af..6897cfb 100644
--- a/zircon/kernel/vm/pmm_node.h
+++ b/zircon/kernel/vm/pmm_node.h
@@ -38,7 +38,6 @@
 
     uint64_t CountFreePages() const;
     uint64_t CountTotalBytes() const;
-    void CountTotalStates(uint64_t state_count[VM_PAGE_STATE_COUNT_]) const;
 
     // printf free and overall state of the internal arenas
     // NOTE: both functions skip mutexes and can be called inside timer or crash context