/*
 * Copyright 2020 Google LLC
 *
 * Licensed under both the 3-Clause BSD License and the GPLv2, found in the
 * LICENSE and LICENSE.GPL-2.0 files, respectively, in the root directory.
 *
 * SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
 */

#include "timing_array.h"

#include <stdlib.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

#include "asm/measurereadlatency.h"
#include "instr.h"
#include "utils.h"

TimingArray::TimingArray() {
  // Explicitly initialize the elements of the array.
  //
  // It's not important what value we write as long as we force *something* to
  // be written to each element. Otherwise, the backing allocation could be a
  // range of zero-fill-on-demand (ZFOD), copy-on-write pages that all start
  // off mapped to the same physical page of zeros. Since caches on modern
  // Intel CPUs are physically tagged, elements on different virtual pages
  // could then map to the same physical cache lines, and we wouldn't observe
  // a timing difference between reading accessed and unaccessed elements.
  for (int i = 0; i < size(); ++i) {
    ElementAt(i) = -1;
  }

  // Compute the threshold the first time a TimingArray is constructed; later
  // instances reuse the same value via the function-local static.
  static uint64_t threshold = FindCachedReadLatencyThreshold();
  cached_read_latency_threshold_ = threshold;
}

void TimingArray::FlushFromCache() {
  // We only need to flush the cache lines with elements on them.
  for (int i = 0; i < size(); ++i) {
    FlushDataCacheLineNoBarrier(&ElementAt(i));
  }

  // Wait for flushes to finish.
  MemoryAndSpeculationBarrier();
}

int TimingArray::FindFirstCachedElementIndexAfter(int start_after) {
  // Fail if the start index is out of bounds.
  if (start_after >= size()) {
    return -1;
  }

  // Start at the element after `start_after`, wrapping around until we've
  // found a cached element or tried every element.
  for (int i = 1; i <= size(); ++i) {
    int el = (start_after + i) % size();
    uint64_t read_latency = MeasureReadLatency(&ElementAt(el));
    if (read_latency <= cached_read_latency_threshold_) {
      return el;
    }
  }

  // Didn't find a cached element.
  return -1;
}

int TimingArray::FindFirstCachedElementIndex() {
  // Start "after" the last element, which means start at the first.
  return FindFirstCachedElementIndexAfter(size() - 1);
}

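// Example of how callers typically combine these methods in a cache
// side-channel measurement. This is only an illustrative sketch: `ForceRead`
// and the element accessor are assumed to come from elsewhere in this
// library, and the "code under test" is hypothetical.
//
//   TimingArray timing_array;
//   timing_array.FlushFromCache();
//
//   // Code under test (e.g. a speculative, out-of-bounds read) touches
//   // exactly one element:
//   //   ForceRead(&timing_array[secret_byte]);
//
//   int leaked = timing_array.FindFirstCachedElementIndex();
//   // `leaked` is the index of the element that was brought into the cache,
//   // or -1 if no element appears to be cached.
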
// Determines a threshold value (as returned from MeasureReadLatency) where:
//  - Reads _at or below_ the threshold _almost certainly_ came from cache
//  - Reads _above_ the threshold _probably_ came from memory
//
// There are a lot of ways to try to find such a threshold value. Our current
// approach is, roughly:
//   1. Flush all array elements from cache.
//   2. Read all elements into cache, noting the fastest read.
//   3. Read all elements again, now from cache, noting the slowest read.
//   4. Repeat (1)-(3) many times to get a lot of "fastest uncached" and
//      "slowest cached" datapoints.
//   5. For each repetition, take the midpoint of its "fastest uncached" and
//      "slowest cached" values, then return a low-percentile value from the
//      resulting distribution of midpoints.
//
// In a previous attempt, we just collected the "slowest cached" distribution
// and sampled the threshold from there. That worked in many cases, but failed
// when the observed latency of cached reads could change over time -- for
// example, on laptops with aggressive dynamic frequency scaling.
//
// Now we also track "fastest uncached" and choose a threshold that falls
// between the two distributions, adding some margin without sacrificing
// precision.
//
// We have to measure reads across all elements of the array, since that's what
// the code that later *uses* our computed threshold will be doing. Certain
// measurement artifacts only appear when reading a lot of values:
//  - TimingArray forces values onto different pages, which introduces TLB
//    pressure. After re-reading ~64 elements of a 256-element array on an
//    Intel Xeon processor, we see latency increase as we start hitting the L2
//    TLB. The 7-Zip LZMA benchmarks have some useful measurements on this:
//    https://www.7-cpu.com/cpu/Skylake.html
//  - Our first version of TimingArray didn't implement cache coloring and all
//    elements were contending for the same cache set. As a result, after
//    reading 256 values, we had evicted all but the first ~8 from the L1
//    cache. Our threshold computation took this into account. If we had just
//    looped over reading one memory value, the computed threshold would have
//    been too low to classify reads from L2 or L3 cache.
//
// Repeating the experiment many times also lets us tolerate occasional noisy
// measurements, for example:
//  - A context switch might happen right before a measurement, evicting array
//    elements from the cache; or one could happen *during* a measurement,
//    adding arbitrary extra time to the observed latency.
//  - A coscheduled hyperthread might introduce cache contention, forcing some
//    reads to go to memory.
//  - We might not always succeed in defeating the prefetcher, and a read we
//    expected to go to memory might instead come from cache.
//
// The idea of taking low-percentile values to reduce noise is inspired in part
// by observations from "Opportunities and Limits of Remote Timing Attacks"[1].
//
// [1] https://www.cs.rice.edu/~dwallach/pub/crosby-timing2009.pdf
uint64_t TimingArray::FindCachedReadLatencyThreshold() {
  const int iterations = 10000;

  // For testing, allow the threshold to be specified as an environment
  // variable.
  const char* threshold_from_env = getenv("CACHED_THRESHOLD");
  if (threshold_from_env) {
    return atoi(threshold_from_env);
  }

  std::vector<uint64_t> candidate_thresholds;
  for (int n = 0; n < iterations; ++n) {
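    // (1) Flush all array elements from the cache.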
    FlushFromCache();

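    // (2) Read every element, bringing it back into cache, and note the
    // fastest read. Immediately after a full flush, this approximates the
    // fastest uncached read.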
    uint64_t fastest_uncached = std::numeric_limits<uint64_t>::max();
    for (int i = 0; i < size(); ++i) {
      fastest_uncached =
          std::min(fastest_uncached, MeasureReadLatency(&ElementAt(i)));
    }

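    // (3) Read every element again, now from cache, and note the slowest
    // read.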
    uint64_t slowest_cached = std::numeric_limits<uint64_t>::min();  // aka 0
    for (int i = 0; i < size(); ++i) {
      slowest_cached =
          std::max(slowest_cached, MeasureReadLatency(&ElementAt(i)));
    }

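    // (5) This iteration's candidate threshold is the midpoint between its
    // fastest uncached read and slowest cached read.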
    candidate_thresholds.push_back((fastest_uncached + slowest_cached) / 2);
  }

  // Return a candidate threshold from the lower end of the distribution (the
  // 20th percentile). A lower threshold makes it less likely that a read from
  // memory will be misclassified as a cached read.
  std::sort(candidate_thresholds.begin(), candidate_thresholds.end());
  return candidate_thresholds[iterations * 0.2];
}