| /* |
| * Copyright 2020 Google LLC |
| * |
| * Licensed under both the 3-Clause BSD License and the GPLv2, found in the |
| * LICENSE and LICENSE.GPL-2.0 files, respectively, in the root directory. |
| * |
| * SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 |
| */ |
| |
| #ifndef DEMOS_TIMING_ARRAY_H_ |
| #define DEMOS_TIMING_ARRAY_H_ |
| |
| #include <array> |
| #include <cstddef> |
| #include <cstdint> |
| #include <vector> |
| |
| #include "hardware_constants.h" |
| |
| // TimingArray is an indexable container that makes it easy to induce and |
| // measure cache timing side-channels to leak the value of a single byte. |
| // |
| // TimingArray goes to some trouble to make sure that element accesses do not |
| // interfere with the cache presence of other elements. |
| // |
| // Specifically: |
| // - Each element is on its own physical page of memory, which usually |
| // prevents interference from hardware prefetchers.[1] |
| // - Elements' order in memory is different than their index order, which |
| // further frustrates hardware prefetchers by avoiding memory accesses at |
| // any repeated stride. |
| // - Elements are distributed across cache sets -- as opposed to e.g. always |
| // being at offset 0 within a page -- which reduces contention within cache |
| // sets, optimizing the use of L1 and L2 caches and therefore improving |
| // side-channel signal by increasing the timing difference between cached |
| // and uncached accesses. |
| // |
| // TimingArray also includes convenience functions for cache manipulation and |
| // timing measurement. |
| // |
| // Example use: |
| // |
| // TimingArray ta; |
| // int i = -1; |
| // |
| // // Loop until we're sure we saw an element come from cache |
| // while (i == -1) { |
| // ta.FlushFromCache(); |
| // ForceRead(&ta[4]); |
| // i = ta.FindFirstCachedElementIndex(); |
| // } |
| // std::cout << "item " << i << " was in cache" << std::endl; |
| // |
| // [1] See e.g. Intel's documentation at https://cpu.fyi/d/83c#G3.1121453, |
| // which says data is only prefetched if it is on the "same 4K byte page". |
| |
| class TimingArray { |
| public: |
| // ValueType is an alias for the element type of the array. Someday we might |
| // want to make TimingArray a template class and take this as a parameter, |
| // but for now the flexibility isn't worth the extra hassle. |
| using ValueType = int; |
| |
| // This could likewise be a parameter and, likewise, isn't. Making the array |
| // smaller doesn't improve the bandwidth of the side-channel, and trying to |
| // leak more than a byte at a time significantly increases noise due to |
| // greater cache contention. |
| // |
| // "Real" elements because we add buffer elements before and after. See |
| // comment on `elements_` below. |
| static const size_t kRealElements = 256; |
| |
| TimingArray(); |
| |
| TimingArray(TimingArray&) = delete; |
| TimingArray& operator=(TimingArray&) = delete; |
| |
| ValueType& operator[](size_t i) { |
| // Map index to element. |
| // |
| // As mentioned in the class comment, we try to frustrate hardware |
| // prefetchers by applying a permutation so elements don't appear in |
| // memory in index order. We do this by using a Linear Congruential |
| // Generator (LCG) where the size of the array is the modulus. |
| // To guarantee that this is a permutation, we just need to follow the |
| // requirements here: |
| // |
| // https://en.wikipedia.org/wiki/Linear_congruential_generator#c_%E2%89%A0_0 |
| // |
| // In this case, 113 makes a good choice: 113 is prime, 113-1 = 112 is |
| // divisible by 256's only prime factor (2), both are divisible by 4. |
| // If the array were dynamically sized, we would not be able to hardcode |
| // 113, and it would be difficult to produce a good multiplier. |
| // It may be desirable, instead, to switch to a different PRNG |
| // that also supports small periods and efficient seeking. |
| // (For example, a Permuted Congruential Generator.) |
| static_assert(kRealElements == 256, "consider changing 113"); |
| size_t el = (100 + i*113) % kRealElements; |
| |
| // Add 1 to skip leading buffer element. |
| return elements_[1 + el].cache_lines[0].value; |
| } |
| |
| // We intentionally omit the "const" accessor: |
| // const ValueType& operator[](size_t i) const { ... } |
| // |
| // At the level of abstraction we care about, accessing an element at all |
| // (even to read it) is not "morally const" since it mutates cache state. |
| |
| size_t size() const { return kRealElements; } |
| |
| // Flushes all elements of the array from the cache. |
| void FlushFromCache(); |
| |
| // Reads elements of the array in index order, starting with index 0, and |
| // looks for the first read that was fast enough to have come from the cache. |
| // |
| // Returns the index of the first "fast" element, or -1 if no element was |
| // obviously read from the cache. |
| // |
| // This function uses a heuristic that errs on the side of false *negatives*, |
| // so it is common to use it in a loop. Of course, measuring the time it |
| // takes to read an element has the side effect of bringing that element into |
| // the cache, so the loop must include a cache flush and re-attempt the |
| // side-channel leak. |
| int FindFirstCachedElementIndex(); |
| |
| // Just like `FindFirstCachedElementIndex`, except it begins right *after* |
| // the index `start_after`, wrapping around to try all array elements. That |
| // is, the first element read is `(start_after+1) % size` and the last |
| // element read before returning -1 is `start_after`. |
| int FindFirstCachedElementIndexAfter(int start_after); |
| |
| // Returns the threshold value used by FindFirstCachedElementIndex to |
| // identify reads that came from cache. |
| uint64_t cached_read_latency_threshold() const { |
| return cached_read_latency_threshold_; |
| } |
| |
| private: |
| // Convenience so we don't have (*this)[i] everywhere. |
| ValueType& ElementAt(size_t i) { return (*this)[i]; } |
| |
| uint64_t cached_read_latency_threshold_; |
| uint64_t FindCachedReadLatencyThreshold(); |
| |
| // Define a struct that occupies one full cache line. Some compilers may not |
| // support aligning at `kCacheLineBytes`, which is almost always greater than |
| // `sizeof(std::max_align_t)`. In those cases we'll get a compile error. |
| // See: https://timsong-cpp.github.io/cppwp/n3337/basic.align#9 |
| struct alignas(kCacheLineBytes) CacheLine { |
| ValueType value; |
| }; |
| static_assert(sizeof(CacheLine) == kCacheLineBytes, ""); |
| |
| // Define our "Element" struct, which takes up one page plus one cache line. |
| // When we allocate an array of these, we know that adjacent elements will |
| // start on different pages and in different cache sets. |
| static const int kCacheLinesPerPage = kPageBytes / kCacheLineBytes; |
| struct Element { |
| std::array<CacheLine, kCacheLinesPerPage + 1> cache_lines; |
| }; |
| static_assert(sizeof(Element) == kPageBytes + kCacheLineBytes, ""); |
| |
| // The actual backing store for the timing array, with buffer elements before |
| // and after to avoid interference from adjacent heap allocations. |
| // |
| // We use `vector` here instead of `array` to avoid problems where |
| // `TimingArray` is put on the stack and the class is so large it skips past |
| // the stack guard page. This is more likely on PowerPC where the page size |
| // (and therefore our element stride) is 64K. |
| std::vector<Element> elements_{1 + kRealElements + 1}; |
| }; |
| |
| #endif // DEMOS_TIMING_ARRAY_H_ |