blob: de1c629e7ed41a229f8b22660cbe45726966d175 [file] [log] [blame] [edit]
#include <iostream>
#include <memory>
#include <random>
#include "benchmark/benchmark.h"
#define ITERATIONS 100000
template <typename T>
using CSAFunc = T (*)(T *, T *, T *, T);
#define INIT_BLOCK
#define EXIT_BLOCK
#define _Args(...) __VA_ARGS__
#define _STRIP_PARENS(X) X
#define STRIP_PARENS(X) _STRIP_PARENS(_Args X)
#define DEF_CSA_LOOP(name, init, loop, result) \
template <typename T> \
__attribute__((noinline)) static T run_##name##_autovec(T *A, T *B, T *C, \
T Threshold) { \
STRIP_PARENS(init); \
STRIP_PARENS(loop); \
result; \
} \
template <typename T> \
__attribute__((noinline)) static T run_##name##_novec(T *A, T *B, T *C, \
T Threshold) { \
STRIP_PARENS(init); \
_Pragma("clang loop vectorize(disable) interleave(disable)") \
STRIP_PARENS(loop); \
result; \
}
#define DEF_SINGLE_CSA_LOOP(name, loop) \
DEF_CSA_LOOP(name, \
(/* Pick a default value that's out of range of the uniform \
* distribution created for 'A' in init_data below. */ \
T Result = 101;), \
loop, ({ return Result; }));
// Find the last element in A above the given threshold.
DEF_SINGLE_CSA_LOOP(single_csa_only,
(for (unsigned i = 0; i < ITERATIONS; i++) {
if (A[i] > Threshold)
Result = A[i];
}));
// Find the last element in B where the corresponding value in A is above the
// given threshold.
DEF_SINGLE_CSA_LOOP(single_csa_cond_load,
(for (unsigned i = 0; i < ITERATIONS; i++) {
if (A[i] > Threshold)
Result = B[i];
}));
// Find the last element in B where the corresponding value in C is below the
// given threshold and the corresponding in A is above the threshold.
DEF_SINGLE_CSA_LOOP(single_csa_nested_cond_load,
(for (unsigned i = 0; i < ITERATIONS; i++) {
if (C[i] < Threshold)
if (A[i] > Threshold)
Result = B[i];
}));
// Find the last element in A above the given threshold, and use it to perform
// some arith (that may need masking).
DEF_SINGLE_CSA_LOOP(csa_with_cond_arith,
(for (unsigned i = 0; i < ITERATIONS; i++) {
if (A[i] > Threshold)
Result = (A[i] * 13) / B[i];
}));
// Find the last element in A above the given threshold (+ some arith).
DEF_SINGLE_CSA_LOOP(csa_with_in_loop_arith,
(for (unsigned i = 0; i < ITERATIONS; i++) {
// Do some work to make the difference noticeable
C[i] = A[i] * 13 + B[i] * 5;
if (A[i] > Threshold)
Result = A[i];
}));
// Find the last elements in A, B, and C above the given threshold.
DEF_CSA_LOOP(
multi_csa_only,
INIT_BLOCK(
// Pick a default value that's out of range of the uniform
// distribution created for 'A', 'B', and 'C' in init_data below.
T ResultA = 101; T ResultB = 101; T ResultC = 101;),
(for (unsigned i = 0; i < ITERATIONS; i++) {
if (A[i] > Threshold)
ResultA = A[i];
if (B[i] > Threshold)
ResultB = B[i];
if (C[i] > Threshold)
ResultC = C[i];
}),
EXIT_BLOCK({ return ResultA ^ ResultB ^ ResultC; }));
// Find the last elements in A, B, and C above the given threshold. Sets the
// result to `N / the element`, which is UB if the element is zero.
DEF_CSA_LOOP(
multi_csa_with_cond_arith,
INIT_BLOCK(
// Pick a default value that's out of range of the uniform
// distribution created for 'A', 'B', and 'C' in init_data below.
T ResultA = 101; T ResultB = 101; T ResultC = 101;),
(for (unsigned i = 0; i < ITERATIONS; i++) {
if (A[i] > Threshold)
ResultA = 7 / A[i];
if (B[i] > Threshold)
ResultB = 4 / B[i];
if (C[i] > Threshold)
ResultC = 10 / C[i];
}),
EXIT_BLOCK({ return ResultA ^ ResultB ^ ResultC; }));
// Initialize arrays A, B, and C with random numbers
template <typename T> static void init_data(T *A, T* B, T *C) {
std::uniform_int_distribution<T> dist(0, 100);
std::mt19937 rng(12345);
for (unsigned i = 0; i < ITERATIONS; i++) {
A[i] = dist(rng);
B[i] = dist(rng);
C[i] = dist(rng);
}
}
// Benchmark auto-vectorized version.
template <typename T>
static void __attribute__((always_inline))
benchmark_csa_autovec(benchmark::State &state, CSAFunc<T> VecFn,
CSAFunc<T> NoVecFn, T Threshold) {
std::unique_ptr<T[]> A(new T[ITERATIONS]);
std::unique_ptr<T[]> B(new T[ITERATIONS]);
std::unique_ptr<T[]> C(new T[ITERATIONS]);
init_data(&A[0], &B[0], &C[0]);
#ifdef BENCH_AND_VERIFY
// Verify the vectorized and un-vectorized versions produce the same results.
{
T VecRes = VecFn(&A[0], &B[0], &C[0], Threshold);
T NoVecRes = NoVecFn(&A[0], &B[0], &C[0], Threshold);
// We're only interested in whether the conditional assignment results
// were the same.
if (VecRes != NoVecRes) {
std::cerr << "ERROR: autovec result different to scalar result; "
<< VecRes << " != " << NoVecRes << "\n";
exit(1);
}
}
#endif
for (auto _ : state) {
T Res = VecFn(&A[0], &B[0], &C[0], Threshold);
benchmark::DoNotOptimize(A);
benchmark::DoNotOptimize(B);
benchmark::DoNotOptimize(C);
benchmark::DoNotOptimize(Res);
benchmark::ClobberMemory();
}
}
// Benchmark version with vectorization disabled.
template <typename T>
static void __attribute__((always_inline))
benchmark_csa_novec(benchmark::State &state, CSAFunc<T> NoVecFn, T Threshold) {
std::unique_ptr<T[]> A(new T[ITERATIONS]);
std::unique_ptr<T[]> B(new T[ITERATIONS]);
std::unique_ptr<T[]> C(new T[ITERATIONS]);
init_data(&A[0], &B[0], &C[0]);
for (auto _ : state) {
T Res = NoVecFn(&A[0], &B[0], &C[0], Threshold);
benchmark::DoNotOptimize(A);
benchmark::DoNotOptimize(B);
benchmark::DoNotOptimize(C);
benchmark::DoNotOptimize(Res);
benchmark::ClobberMemory();
}
}
#define BENCHMARK_CASE(name, ty, threshold) \
void BENCHMARK_##name##_autovec_##ty##_(benchmark::State &state) { \
benchmark_csa_autovec<ty>(state, run_##name##_autovec, run_##name##_novec, \
threshold); \
} \
BENCHMARK(BENCHMARK_##name##_autovec_##ty##_)->Unit(benchmark::kNanosecond); \
\
void BENCHMARK_##name##_novec_##ty##_(benchmark::State &state) { \
benchmark_csa_novec<ty>(state, run_##name##_novec, threshold); \
} \
BENCHMARK(BENCHMARK_##name##_novec_##ty##_)->Unit(benchmark::kNanosecond);
// Add benchmarks with and without auto-vectorization
#define ADD_BENCHMARK(ty, threshold) \
BENCHMARK_CASE(single_csa_only, ty, threshold) \
BENCHMARK_CASE(multi_csa_only, ty, threshold) \
BENCHMARK_CASE(csa_with_in_loop_arith, ty, threshold) \
BENCHMARK_CASE(single_csa_cond_load, ty, threshold) \
BENCHMARK_CASE(single_csa_nested_cond_load, ty, threshold) \
BENCHMARK_CASE(csa_with_cond_arith, ty, threshold) \
BENCHMARK_CASE(multi_csa_with_cond_arith, ty, threshold)
ADD_BENCHMARK(int32_t, 75)
ADD_BENCHMARK(uint8_t, 90)
ADD_BENCHMARK(int64_t, 60)