WIP: shows ~1% difference on i7-7567U

Fill with 0: 7804us
Fill with 1: 7863us
Fill with 2: 7864us
diff --git a/demos/CMakeLists.txt b/demos/CMakeLists.txt
index 39ed150..d84fedd 100644
--- a/demos/CMakeLists.txt
+++ b/demos/CMakeLists.txt
@@ -205,3 +205,5 @@
 
 # Meltdown DE -- speculative computation with division by zero remainder
 add_demo(meltdown_de SYSTEMS Linux PROCESSORS i686 x86_64)
+
+add_demo(fill_speed)
diff --git a/demos/fill_speed.cc b/demos/fill_speed.cc
new file mode 100644
index 0000000..1f02ad0
--- /dev/null
+++ b/demos/fill_speed.cc
@@ -0,0 +1,80 @@
+#include <algorithm>
+#include <chrono>
+#include <iostream>
+#include <functional>
+#include <vector>
+
+#include "compiler_specifics.h"
+
+using Clock = std::chrono::high_resolution_clock;
+
+void CompilerOpaqueUse(void* p) {
+  asm volatile ("" :: "r"(p) : "memory");
+}
+
+Clock::duration Runtime(std::function<void()> f) {
+  auto start = Clock::now();
+  f();
+  return Clock::now() - start;
+}
+
+int64_t Microseconds(const Clock::duration &d) {
+  return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
+}
+
+SAFESIDE_NEVER_INLINE
+Clock::duration TestOne(int buffer[], int size, int v) {
+  int warmup = 5;  // avoid transient effects, improve consistency
+  int rounds = 50;  // amplify result
+
+  Clock::duration d;
+
+  for (int i = 0; i < warmup + 1; ++i) {
+    d = Runtime([&]() {
+      for (int r = 0; r < rounds; ++r) {
+        std::fill(buffer, buffer + size, v);
+        CompilerOpaqueUse(buffer);
+      }
+    });
+  }
+
+  return d;
+}
+
+void Test() {
+  // target ~50% of L3 size
+  // see `sudo lshw` or /sys/devices/system/cpu/cpu0/cache/index3/size
+  int buffer_bytes = 2 * 1024 * 1024;
+
+  std::vector<int> buffer(buffer_bytes / sizeof(int));
+
+  std::vector<Clock::duration> d0s, d1s, d2s;
+
+  int samples = 30;  // reduce variation
+
+  for (int i = 0; i < samples; ++i) {
+    d0s.push_back(TestOne(buffer.data(), buffer.size(), 0));
+
+    // Use two non-zero values so we can also show the amount of variation/
+    // jitter between runs that we *always* expect to act the same.
+    d1s.push_back(TestOne(buffer.data(), buffer.size(), 1));
+    d2s.push_back(TestOne(buffer.data(), buffer.size(), 2));
+  }
+
+  std::sort(d0s.begin(), d0s.end());
+  std::sort(d1s.begin(), d1s.end());
+  std::sort(d2s.begin(), d2s.end());
+
+  std::cout << "Fill with 0: " << Microseconds(d0s[samples/2]) << "us"
+            << std::endl;
+  std::cout << "Fill with 1: " << Microseconds(d1s[samples/2]) << "us"
+            << std::endl;
+  std::cout << "Fill with 2: " << Microseconds(d2s[samples/2]) << "us"
+            << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+  Test();
+
+  return 0;
+}