/*
* Copyright 2020 Google LLC
*
* Licensed under both the 3-Clause BSD License and the GPLv2, found in the
* LICENSE and LICENSE.GPL-2.0 files, respectively, in the root directory.
*
* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
*/
#include "timing_array.h"
#include <stdlib.h>
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>
#include "asm/measurereadlatency.h"
#include "instr.h"
#include "utils.h"

TimingArray::TimingArray() {
  // Explicitly initialize the elements of the array.
  //
  // It's not important what value we write as long as we force *something*
  // to be written to each element. Otherwise, the backing allocation could
  // be a range of zero-fill-on-demand (ZFOD), copy-on-write pages that all
  // start off mapped to the same physical page of zeros. Since the cache on
  // modern Intel CPUs is physically tagged, some elements might then map to
  // the same cache line, and we wouldn't observe a timing difference between
  // reading accessed and unaccessed elements.
  for (int i = 0; i < size(); ++i) {
    ElementAt(i) = -1;
  }

  // Compute the threshold the first time through; later instances reuse the
  // same value.
  static uint64_t threshold = FindCachedReadLatencyThreshold();
  cached_read_latency_threshold_ = threshold;
}
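
// A minimal sketch of the aliasing the initialization loop above guards
// against (illustrative only; the mappings and `kPageSize` are hypothetical
// and not part of this file). On Linux, untouched anonymous pages can all be
// backed by the kernel's shared zero page, so in a physically-tagged cache
// their lines alias:
//
//   #include <sys/mman.h>
//
//   char *a = (char *)mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
//                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
//   char *b = (char *)mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
//                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
//   (void)*(volatile char *)a;  // Read-faults in, and caches, the zero page.
//   // A timed read of `b` can now look "cached" even though `b` was never
//   // accessed: both reads hit the same physical page of zeros.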

void TimingArray::FlushFromCache() {
  // We only need to flush the cache lines with elements on them.
  for (int i = 0; i < size(); ++i) {
    FlushDataCacheLineNoBarrier(&ElementAt(i));
  }

  // Wait for the flushes to finish.
  MemoryAndSpeculationBarrier();
}
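
// For reference, a sketch of how a flush-then-barrier routine like the one
// above is often written with x86 compiler intrinsics. This is an assumed,
// typical lowering -- not a statement about what instr.h actually does --
// and `FlushRange` is a hypothetical helper:
//
//   #include <stddef.h>     // size_t
//   #include <emmintrin.h>  // _mm_clflush, _mm_mfence, _mm_lfence
//
//   void FlushRange(const void *p, size_t bytes) {
//     const size_t kLineSize = 64;  // Cache line size on current x86 CPUs.
//     for (size_t i = 0; i < bytes; i += kLineSize) {
//       _mm_clflush((const char *)p + i);  // Evict one line; no ordering.
//     }
//     _mm_mfence();  // Order the flushes against later loads and stores.
//     _mm_lfence();  // Keep later reads from starting until flushes retire.
//   }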

int TimingArray::FindFirstCachedElementIndexAfter(int start_after) {
  // Fail if `start_after` is out of bounds.
  if (start_after >= size()) {
    return -1;
  }

  // Start at the element after `start_after`, wrapping around until we've
  // found a cached element or tried every element.
  for (int i = 1; i <= size(); ++i) {
    int el = (start_after + i) % size();

    uint64_t read_latency = MeasureReadLatency(&ElementAt(el));
    if (read_latency <= cached_read_latency_threshold_) {
      return el;
    }
  }

  // Didn't find a cached element.
  return -1;
}

int TimingArray::FindFirstCachedElementIndex() {
  // Start "after" the last element, which means start at the first.
  return FindFirstCachedElementIndexAfter(size() - 1);
}
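
// Putting the pieces together: the receive side of the cache side channel
// typically looks like the sketch below. This mirrors how the demos in this
// repository use TimingArray, but `secret_byte` and the transient access are
// placeholders, not code from this project:
//
//   TimingArray timing_array;
//
//   timing_array.FlushFromCache();
//
//   // ...misspeculated or transient code reads timing_array[secret_byte],
//   // pulling exactly one element back into the cache...
//
//   int result = timing_array.FindFirstCachedElementIndex();
//   if (result != -1) {
//     // `result` is very likely the value of `secret_byte`.
//   }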

// Determines a threshold value (as returned from MeasureReadLatency) where:
//  - Reads _at or below_ the threshold _almost certainly_ came from cache
//  - Reads _above_ the threshold _probably_ came from memory
//
// There are a lot of ways to arrive at such a threshold value. Our current
// approach is, roughly:
//   1. Flush all array elements from cache.
//   2. Read all elements into cache, noting the fastest read.
//   3. Read all elements again, now from cache, noting the slowest read.
//      Take the midpoint of the "fastest uncached" and "slowest cached"
//      values and call it a "candidate threshold".
//   4. Repeat (1)-(3) many times to get a distribution of candidate
//      thresholds.
//   5. Choose a threshold from the low side of that distribution.
//
// In a previous attempt, we just collected the "slowest cached" distribution
// and sampled the threshold from there. That worked in many cases, but
// failed when the observed latency of cached reads could change over time --
// for example, on laptops with aggressive dynamic frequency scaling.
//
// Now we also measure "fastest uncached" and use it to bias the threshold
// upward. The hope is that we add margin to more reliably recognize true
// positives without meaningfully increasing the likelihood of false
// positives.
uint64_t TimingArray::FindCachedReadLatencyThreshold() {
  const int iterations = 10000;

  // A few considerations for choosing the percentile:
  //  - Lower means fewer false positives.
  //  - Too low and we see outliers from uncached reads that were "too fast",
  //    e.g. places where we failed to defeat the prefetcher.
  //  - Too high and we see outliers from slow cached *or* uncached reads,
  //    e.g. measurements that were interrupted by context switches, or
  //    where a co-scheduled hyperthread introduced cache contention.
  const int percentile = 10;

  // For testing, allow the threshold to be specified as an environment
  // variable.
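  //
  // A hypothetical invocation (the demo binary name is only an example):
  //
  //   CACHED_THRESHOLD=120 ./spectre_v1_pht_sa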
  const char *threshold_from_env = getenv("CACHED_THRESHOLD");
  if (threshold_from_env) {
    return atoi(threshold_from_env);
  }

  std::vector<uint64_t> candidate_thresholds;
  for (int n = 0; n < iterations; ++n) {
    FlushFromCache();

    // We have to measure reads across all elements of the array, since
    // that's what the code that later *uses* our computed threshold will be
    // doing. Certain measurement artifacts only appear when reading many
    // values:
    //  - TimingArray forces values onto different pages, which introduces
    //    TLB pressure. After re-reading ~64 elements of a 256-element array
    //    on an Intel Xeon processor, we see latency increase as we start
    //    hitting the L2 TLB. The 7-Zip LZMA benchmarks have some useful
    //    measurements on this: https://www.7-cpu.com/cpu/Skylake.html
    //  - Our first version of TimingArray didn't implement cache coloring,
    //    and all elements were contending for the same cache set. As a
    //    result, after reading 256 values, we had evicted all but the first
    //    ~8 from the L1 cache. Our threshold computation took this into
    //    account. If we had just looped over reading one memory value, the
    //    computed threshold would have been too low to classify reads from
    //    L2 or L3 cache.
    uint64_t fastest_uncached = std::numeric_limits<uint64_t>::max();
    for (int i = 0; i < size(); ++i) {
      fastest_uncached =
          std::min(fastest_uncached, MeasureReadLatency(&ElementAt(i)));
    }

    uint64_t slowest_cached = std::numeric_limits<uint64_t>::min();  // aka 0
    for (int i = 0; i < size(); ++i) {
      slowest_cached =
          std::max(slowest_cached, MeasureReadLatency(&ElementAt(i)));
    }

    // The candidate threshold is the midpoint between the fastest uncached
    // read and the slowest cached read.
    candidate_thresholds.push_back((fastest_uncached + slowest_cached) / 2);
  }

  // Return the candidate threshold at the chosen percentile.
  std::sort(candidate_thresholds.begin(), candidate_thresholds.end());
  int index = (percentile / 100.0) * (iterations - 1);
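  // With the constants above: index = (10 / 100.0) * (10000 - 1) = 999.9,
  // which truncates to 999 when converted to int -- just below the 10th
  // percentile of the sorted candidates.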
  return candidate_thresholds[index];
}