/*
* Copyright 2020 Google LLC
*
* Licensed under both the 3-Clause BSD License and the GPLv2, found in the
* LICENSE and LICENSE.GPL-2.0 files, respectively, in the root directory.
*
* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
*/
#include "timing_array.h"
#include <stdlib.h>
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>
#include "asm/measurereadlatency.h"
#include "instr.h"
#include "utils.h"

TimingArray::TimingArray() {
  // Explicitly initialize the elements of the array.
  //
  // It's not important what value we write as long as we force *something*
  // to be written to each element. Otherwise, the backing allocation could
  // be a range of zero-fill-on-demand (ZFOD), copy-on-write pages that all
  // start off mapped to the same physical page of zeros. Since the cache on
  // modern Intel CPUs is physically tagged, some elements might then map to
  // the same cache line, and we wouldn't observe a timing difference between
  // reading accessed and unaccessed elements.
  for (int i = 0; i < size(); ++i) {
    ElementAt(i) = -1;
  }

  // Compute the threshold the first time through; later instances reuse the
  // same value.
  static uint64_t threshold = FindCachedReadLatencyThreshold();
  cached_read_latency_threshold_ = threshold;
}
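
// A minimal sketch of the aliasing the initialization loop above guards
// against (illustrative only; the mappings and `kPageSize` are hypothetical
// and not part of this file). On Linux, untouched anonymous pages can all be
// backed by the kernel's shared zero page, so in a physically-tagged cache
// their lines alias:
//
//   #include <sys/mman.h>
//
//   char *a = (char *)mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
//                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
//   char *b = (char *)mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
//                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
//   (void)*(volatile char *)a;  // Read-faults in, and caches, the zero page.
//   // A timed read of `b` can now look "cached" even though `b` was never
//   // accessed: both reads hit the same physical page of zeros.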

void TimingArray::FlushFromCache() {
  // We only need to flush the cache lines with elements on them.
  for (int i = 0; i < size(); ++i) {
    FlushDataCacheLineNoBarrier(&ElementAt(i));
  }

  // Wait for the flushes to finish.
  MemoryAndSpeculationBarrier();
}
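
// For reference, a sketch of how a flush-then-barrier routine like the one
// above is often written with x86 compiler intrinsics. This is an assumed,
// typical lowering -- not a statement about what instr.h actually does --
// and `FlushRange` is a hypothetical helper:
//
//   #include <stddef.h>     // size_t
//   #include <emmintrin.h>  // _mm_clflush, _mm_mfence, _mm_lfence
//
//   void FlushRange(const void *p, size_t bytes) {
//     const size_t kLineSize = 64;  // Cache line size on current x86 CPUs.
//     for (size_t i = 0; i < bytes; i += kLineSize) {
//       _mm_clflush((const char *)p + i);  // Evict one line; no ordering.
//     }
//     _mm_mfence();  // Order the flushes against later loads and stores.
//     _mm_lfence();  // Keep later reads from starting until flushes retire.
//   }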

int TimingArray::FindFirstCachedElementIndexAfter(int start_after) {
  // Fail if `start_after` is out of bounds.
  if (start_after >= size()) {
    return -1;
  }

  // Start at the element after `start_after`, wrapping around until we've
  // found a cached element or tried every element.
  for (int i = 1; i <= size(); ++i) {
    int el = (start_after + i) % size();

    uint64_t read_latency = MeasureReadLatency(&ElementAt(el));
    if (read_latency <= cached_read_latency_threshold_) {
      return el;
    }
  }

  // Didn't find a cached element.
  return -1;
}

int TimingArray::FindFirstCachedElementIndex() {
  // Start "after" the last element, which means start at the first.
  return FindFirstCachedElementIndexAfter(size() - 1);
}
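
// Putting the pieces together: the receive side of the cache side channel
// typically looks like the sketch below. This mirrors how the demos in this
// repository use TimingArray, but `secret_byte` and the transient access are
// placeholders, not code from this project:
//
//   TimingArray timing_array;
//
//   timing_array.FlushFromCache();
//
//   // ...misspeculated or transient code reads timing_array[secret_byte],
//   // pulling exactly one element back into the cache...
//
//   int result = timing_array.FindFirstCachedElementIndex();
//   if (result != -1) {
//     // `result` is very likely the value of `secret_byte`.
//   }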

// Determines a threshold value (as returned from MeasureReadLatency) where:
//  - Reads _at or below_ the threshold _almost certainly_ came from cache
//  - Reads _above_ the threshold _probably_ came from memory
//
// There are a lot of ways to arrive at such a threshold value. Our current
// approach is, roughly:
//   1. Flush all array elements from cache.
//   2. Read all elements into cache, noting the fastest read.
//   3. Read all elements again, now from cache, noting the slowest read.
//      Take the midpoint of the "fastest uncached" and "slowest cached"
//      values and call it a "candidate threshold".
//   4. Repeat (1)-(3) many times to get a distribution of candidate
//      thresholds.
//   5. Choose a threshold from the low side of that distribution.
//
// In a previous attempt, we just collected the "slowest cached" distribution
// and sampled the threshold from there. That worked in many cases, but
// failed when the observed latency of cached reads could change over time --
// for example, on laptops with aggressive dynamic frequency scaling.
//
// Now we also measure "fastest uncached" and use it to bias the threshold
// upward. The hope is that we add margin to more reliably recognize true
// positives without meaningfully increasing the likelihood of false
// positives.
uint64_t TimingArray::FindCachedReadLatencyThreshold() {
  const int iterations = 10000;

  // A few considerations for choosing the percentile:
  //  - Lower means fewer false positives.
  //  - Too low and we see outliers from uncached reads that were "too fast",
  //    e.g. places where we failed to defeat the prefetcher.
  //  - Too high and we see outliers from slow cached *or* uncached reads,
  //    e.g. measurements that were interrupted by context switches, or
  //    where a co-scheduled hyperthread introduced cache contention.
  const int percentile = 10;

  // For testing, allow the threshold to be specified as an environment
  // variable.
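  //
  // A hypothetical invocation (the demo binary name is only an example):
  //
  //   CACHED_THRESHOLD=120 ./spectre_v1_pht_sa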
  const char *threshold_from_env = getenv("CACHED_THRESHOLD");
  if (threshold_from_env) {
    return atoi(threshold_from_env);
  }

  std::vector<uint64_t> candidate_thresholds;
  for (int n = 0; n < iterations; ++n) {
    FlushFromCache();

    // We have to measure reads across all elements of the array, since
    // that's what the code that later *uses* our computed threshold will be
    // doing. Certain measurement artifacts only appear when reading many
    // values:
    //  - TimingArray forces values onto different pages, which introduces
    //    TLB pressure. After re-reading ~64 elements of a 256-element array
    //    on an Intel Xeon processor, we see latency increase as we start
    //    hitting the L2 TLB. The 7-Zip LZMA benchmarks have some useful
    //    measurements on this: https://www.7-cpu.com/cpu/Skylake.html
    //  - Our first version of TimingArray didn't implement cache coloring,
    //    and all elements were contending for the same cache set. As a
    //    result, after reading 256 values, we had evicted all but the first
    //    ~8 from the L1 cache. Our threshold computation took this into
    //    account. If we had just looped over reading one memory value, the
    //    computed threshold would have been too low to classify reads from
    //    L2 or L3 cache.
    uint64_t fastest_uncached = std::numeric_limits<uint64_t>::max();
    for (int i = 0; i < size(); ++i) {
      fastest_uncached =
          std::min(fastest_uncached, MeasureReadLatency(&ElementAt(i)));
    }

    uint64_t slowest_cached = std::numeric_limits<uint64_t>::min();  // aka 0
    for (int i = 0; i < size(); ++i) {
      slowest_cached =
          std::max(slowest_cached, MeasureReadLatency(&ElementAt(i)));
    }

    // The candidate threshold is the midpoint between the fastest uncached
    // read and the slowest cached read.
    candidate_thresholds.push_back((fastest_uncached + slowest_cached) / 2);
  }

  // Return the candidate threshold at the chosen percentile.
  std::sort(candidate_thresholds.begin(), candidate_thresholds.end());
  int index = (percentile / 100.0) * (iterations - 1);
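  // With the constants above: index = (10 / 100.0) * (10000 - 1) = 999.9,
  // which truncates to 999 when converted to int -- just below the 10th
  // percentile of the sorted candidates.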
  return candidate_thresholds[index];
}