Speculation over execution hardware breakpoint fault. (#72)

Also making ForceRead inlined to avoid the call and ret instructions in the speculation window.
diff --git a/demos/CMakeLists.txt b/demos/CMakeLists.txt
index 7322953..c6391be 100644
--- a/demos/CMakeLists.txt
+++ b/demos/CMakeLists.txt
@@ -102,10 +102,14 @@
 
 if((${CMAKE_SYSTEM_NAME} MATCHES "^(Linux)$") AND
    (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86)|(x86_64)$"))
-  # Speculation over hardware breakpoint trap
+  # Speculation over hardware breakpoint trap (read watcher)
   add_executable(speculation_over_hw_breakpoint speculation_over_hw_breakpoint.cc)
   target_link_libraries(speculation_over_hw_breakpoint safeside)
 
+  # Speculation over hardware breakpoint fault (execution watcher)
+  add_executable(speculation_over_exec_hw_breakpoint speculation_over_exec_hw_breakpoint.cc)
+  target_link_libraries(speculation_over_exec_hw_breakpoint safeside)
+
   # Meltdown AC -- speculative fetching of unaligned data
   add_executable(meltdown_ac meltdown_ac.cc)
   target_link_libraries(meltdown_ac safeside)
diff --git a/demos/speculation_over_exec_hw_breakpoint.cc b/demos/speculation_over_exec_hw_breakpoint.cc
new file mode 100644
index 0000000..3d90897
--- /dev/null
+++ b/demos/speculation_over_exec_hw_breakpoint.cc
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2019 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Demonstrates speculative execution over hardware breakpoint fault.
+ * That is a breakpoint that guards an instruction address and is triggered when
+ * that instruction is executed (not read nor written).
+ * We fork the process and run the demonstration in the child, while the parent
+ * takes care of setting up the breakpoint and moving the instruction pointer
+ * past the dead code after the fault.
+ **/
+
+#include "compiler_specifics.h"
+
+#if !SAFESIDE_LINUX
+#  error Unsupported OS. Linux required.
+#endif
+
+#if !SAFESIDE_IA32 && !SAFESIDE_X64
+#  error Unsupported CPU. X86/64 required.
+#endif
+
+#include <array>
+#include <cstring>
+#include <iostream>
+
+#include <signal.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "cache_sidechannel.h"
+#include "instr.h"
+#include "local_content.h"
+#include "utils.h"
+
+// Address of the "breakpoint" label defined in the inline assembly of
+// LeakByte, i.e. of the "nop" that is guarded by the execution breakpoint.
+extern char breakpoint[];
+
+// Leaks data[offset] through the cache side channel and returns the character
+// reported by RecomputeScores. The unsafe oracle entry is read only by dead
+// code run speculatively past the guarded nop. Exits if it does not converge.
+static char LeakByte(const char *data, size_t offset) {
+  CacheSideChannel sidechannel;
+  const std::array<BigByte, 256> &oracle = sidechannel.GetOracle();
+
+  for (int run = 0;; ++run) {
+    size_t safe_offset = run % strlen(public_data);
+    sidechannel.FlushOracle();
+
+    // Oracle addresses are precomputed: on Intel (not on AMD) the speculation
+    // window here is too small to compute the unsafe address speculatively.
+    const void *safe_address =
+        oracle.data() + static_cast<size_t>(data[safe_offset]);
+
+    // Architecturally dead variable and the only fetch of "data[offset]", so
+    // its value is architecturally isolated from the rest of the program.
+    const void *unsafe_address =
+        oracle.data() + static_cast<size_t>(data[offset]);
+
+    // Architectural (non-speculative) access to the safe oracle entry.
+    ForceRead(safe_address);
+
+    // The nop after the "breakpoint" label is guarded by the execution
+    // breakpoint. Unlike the read/write hardware watcher, this is a fault (not
+    // a trap); the tracer moves the instruction pointer to afterspeculation.
+    asm volatile(
+        "breakpoint:\n"
+        "nop\n");
+
+    // Dead code. Executed only speculatively.
+    ForceRead(unsafe_address);
+
+    std::cout << "Dead code. Must not be printed." << std::endl;
+
+    // The exit call must not be unconditional, otherwise clang would optimize
+    // out everything that follows it and the linking would fail.
+    if (strlen(public_data) != 0) {
+      exit(EXIT_FAILURE);
+    }
+
+    // The tracer resumes the child here by pointing its instruction pointer
+    asm volatile("afterspeculation:");
+
+    std::pair<bool, char> result =
+        sidechannel.RecomputeScores(public_data[safe_offset]);
+
+    if (result.first) {
+      return result.second;
+    }
+
+    if (run > 100000) {
+      std::cerr << "Does not converge " << result.second << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+}
+
+// Tracee: asks to be traced, stops until the tracer has armed the breakpoint,
+// then leaks private_data one character at a time and prints it.
+void ChildProcess() {
+  // Let the parent trace this process.
+  if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == -1) {
+    std::cerr << "PTRACE_TRACEME failed." << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // Stop and wait for the parent, which sets up the hardware breakpoint on
+  // the critical nop instruction before resuming us.
+  raise(SIGSTOP);
+  MemoryAndSpeculationBarrier();
+
+  std::cout << "Leaking the string: " << std::flush;
+  // LeakByte reads relative to public_data, so address private_data from it.
+  const size_t secret_distance = private_data - public_data;
+  for (size_t index = 0; index < strlen(private_data); ++index) {
+    std::cout << LeakByte(public_data, secret_distance + index) << std::flush;
+  }
+  std::cout << "\nDone!\n";
+}
+
+// Tracer loop. On the child's SIGSTOP it arms an execution breakpoint on the
+// "breakpoint" label; on SIGTRAP it moves the child's instruction pointer to
+// afterspeculation. Returns on any other wait event; exits if ptrace fails.
+void ParentProcess(pid_t child) {
+  while (true) {
+    int wstatus = 0;
+    long res = 0;
+    if (wait(&wstatus) == -1 || !WIFSTOPPED(wstatus)) {
+      break;  // wait() failed or the child is not stopped (e.g. it exited).
+    }
+
+    if (WSTOPSIG(wstatus) == SIGSTOP) {
+      // The child stopped itself with "raise(SIGSTOP)". Set the debug
+      // registers: put the breakpoint on the "nop" instruction marked by the
+      // "breakpoint" label and let the child continue.
+      res = ptrace(PTRACE_POKEUSER, child, offsetof(user, u_debugreg[0]),
+                   breakpoint);
+      if (res == -1) {
+        std::cerr << "PTRACE_POKEUSER on dr0 failed." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      // Setting the 0th bit in dr7.
+      // 0th bit means the active breakpoint is in local dr0.
+      // We leave the length bits set to 00 so that we get one-byte
+      // granularity. We also leave the mode bits set to 00, because it's an
+      // execution breakpoint.
+      res = ptrace(PTRACE_POKEUSER, child, offsetof(user, u_debugreg[7]), 0x1);
+      if (res == -1) {
+        std::cerr << "PTRACE_POKEUSER on dr7 failed: " << errno << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    } else if (WSTOPSIG(wstatus) == SIGTRAP) {
+      // The child was trapped by executing the hardware breakpoint. Move its
+      // instruction pointer to the afterspeculation label.
+      user_regs_struct regs;
+      // Read general purpose register values of the child.
+      res = ptrace(PTRACE_GETREGS, child, nullptr, &regs);
+      if (res == -1) {
+        std::cerr << "PTRACE_GETREGS failed." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      // The register holding the instruction pointer depends on the word size.
+#if SAFESIDE_X64
+      regs.rip = reinterpret_cast<size_t>(afterspeculation);
+#else
+      regs.eip = reinterpret_cast<size_t>(afterspeculation);
+#endif
+
+      // Write the registers back with the updated instruction pointer.
+      res = ptrace(PTRACE_SETREGS, child, nullptr, &regs);
+      if (res == -1) {
+        std::cerr << "PTRACE_SETREGS failed." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      // Unexpected signal: the child stopped with neither SIGSTOP nor SIGTRAP.
+      // Leave the tracer loop.
+      break;
+    }
+
+    // Wake up the child.
+    res = ptrace(PTRACE_CONT, child, nullptr, nullptr);
+    if (res == -1) {
+      std::cerr << "PTRACE_CONT after signal failed." << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+}
+
+// Forks: the child runs the demonstration, the parent traces it.
+int main() {
+  const pid_t pid = fork();
+  if (pid == -1) {  // No child was created, so there is nothing to trace.
+    std::cerr << "fork failed." << std::endl;
+    return EXIT_FAILURE;
+  }
+  if (pid == 0) ChildProcess();  // Tracee.
+  else ParentProcess(pid);       // Tracer.
+}
diff --git a/demos/utils.cc b/demos/utils.cc
index fb3264a..872cda9 100644
--- a/demos/utils.cc
+++ b/demos/utils.cc
@@ -21,13 +21,6 @@
 
 constexpr size_t kCacheLineSize = 64;
 
-// Forced memory load. Used during both real and speculative execution to create
-// a microarchitectural side effect in the cache. Also used for latency
-// measurement in the FLUSH+RELOAD technique.
-void ForceRead(const void *p) {
-  (void)*reinterpret_cast<const volatile char *>(p);
-}
-
 // Flush a memory interval from cache. Used to induce speculative execution on
 // flushed values until they are fetched back to the cache.
 void FlushFromCache(const char *start, const char *end) {
diff --git a/demos/utils.h b/demos/utils.h
index e961c37..914290f 100644
--- a/demos/utils.h
+++ b/demos/utils.h
@@ -14,11 +14,21 @@
  * limitations under the License.
  */
 
-// Forced memory load. Used during both real and speculative execution to create
-// a microarchitectural side effect in the cache. Also used for latency
-// measurement in the FLUSH+RELOAD technique.
-void ForceRead(const void *p);
+#ifndef DEMOS_UTILS_H
+#define DEMOS_UTILS_H
+
+#include "compiler_specifics.h"
+
+// Forced memory load: reads one byte at p through a volatile pointer so that
+// the compiler cannot drop the access. Used during both real and speculative
+// execution to load memory into the cache, and for FLUSH+RELOAD latency timing.
+// Marked SAFESIDE_ALWAYS_INLINE to keep call/ret out of the speculation window.
+SAFESIDE_ALWAYS_INLINE
+inline void ForceRead(const void *p) {
+  (void)*reinterpret_cast<const volatile char *>(p);
+}
 
 // Flush a memory interval from cache. Used to induce speculative execution on
 // flushed values until they are fetched back to the cache.
 void FlushFromCache(const char *start, const char *end);
+#endif  // DEMOS_UTILS_H