| #include <functional> |
| #include <iostream> |
| #include <limits> |
| #include <memory> |
| #include <random> |
| #include <stdint.h> |
| |
| #include "common.h" |
| |
| |
| // Tests for memory runtime checks generated by the vectorizer. Runs scalar and |
| // vectorized versions of a loop requiring runtime checks on the same inputs |
| // with pointers to the same buffer using various offsets between reads and |
| // writes. Fails if they do not produce the same results. |
| |
| template <typename Ty> |
| static void check(const std::unique_ptr<Ty[]> &Reference, |
| const std::unique_ptr<Ty[]> &Tmp, unsigned NumElements, |
| int Offset) { |
| if (!std::equal(&Reference[0], &Reference[0] + NumElements, &Tmp[0])) { |
| std::cerr << "Miscompare with offset " << Offset << "\n"; |
| exit(1); |
| } |
| } |
| |
| // Helper to call \p f with \p args and acts as optimization barrier for \p f. |
| template <typename F, typename... Args> |
| __attribute__((optnone)) static void callThroughOptnone(F &&f, Args &&...args) { |
| f(std::forward<Args>(args)...); |
| } |
| |
| template <typename Ty> using Fn2Ty = std::function<void(Ty *, Ty *, unsigned)>; |
| |
| // Run both \p ScalarFn and \p VectorFn on the same inputs with pointers to the |
| // same buffer. Fail if they do not produce the same results. |
| template <typename Ty> |
| static void checkOverlappingMemoryOneRuntimeCheck(Fn2Ty<Ty> ScalarFn, |
| Fn2Ty<Ty> VectorFn, |
| const char *Name) { |
| std::cout << "Checking " << Name << "\n"; |
| |
| unsigned N = 100; |
| // Make sure we have enough extra elements so we can be liberal with offsets. |
| unsigned NumArrayElements = N * 8; |
| std::unique_ptr<Ty[]> Input1(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> Reference(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> ToCheck(new Ty[NumArrayElements]); |
| |
| auto CheckWithOffset = [&](int Offset) { |
| init_data(Input1, NumArrayElements); |
| for (unsigned i = 0; i < NumArrayElements; i++) { |
| Reference[i] = Input1[i]; |
| ToCheck[i] = Input1[i]; |
| } |
| |
| // Run scalar function to generate reference output. |
| Ty *ReferenceStart = &Reference[NumArrayElements / 2]; |
| ScalarFn(ReferenceStart + Offset, ReferenceStart, N); |
| |
| // Run vector function to generate output to check. |
| Ty *StartPtr = &ToCheck[NumArrayElements / 2]; |
| callThroughOptnone(VectorFn, StartPtr + Offset, StartPtr, N); |
| |
| // Compare scalar and vector output. |
| check(Reference, ToCheck, NumArrayElements, Offset); |
| }; |
| |
| for (int i = -100; i <= 100; i++) |
| CheckWithOffset(i); |
| } |
| |
| template <typename Ty> |
| using Fn3Ty = std::function<void(Ty *, Ty *, Ty *, unsigned)>; |
| template <typename Ty> |
| static void checkOverlappingMemoryTwoRuntimeChecks(Fn3Ty<Ty> ScalarFn, |
| Fn3Ty<Ty> VectorFn, |
| const char *Name) { |
| std::cout << "Checking " << Name << "\n"; |
| |
| unsigned N = 100; |
| // Make sure we have enough extra elements so we can be liberal with offsets. |
| unsigned NumArrayElements = N * 8; |
| std::unique_ptr<Ty[]> Input1(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> Input2(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> Reference(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> ToCheck(new Ty[NumArrayElements]); |
| |
| auto CheckWithOffsetSecond = [&](int Offset) { |
| init_data(Input1, NumArrayElements); |
| init_data(Input2, NumArrayElements); |
| for (unsigned i = 0; i < NumArrayElements; i++) { |
| Reference[i] = Input1[i]; |
| ToCheck[i] = Input1[i]; |
| } |
| |
| // Run scalar function to generate reference output. |
| Ty *ReferenceStart = &Reference[NumArrayElements / 2]; |
| ScalarFn(ReferenceStart + Offset, &Input2[0], ReferenceStart, N); |
| |
| // Run vector function to generate output to check. |
| Ty *StartPtr = &ToCheck[NumArrayElements / 2]; |
| callThroughOptnone(VectorFn, StartPtr + Offset, &Input2[0], StartPtr, N); |
| |
| // Compare scalar and vector output. |
| check(Reference, ToCheck, NumArrayElements, Offset); |
| }; |
| |
| for (int i = -100; i <= 100; i++) |
| CheckWithOffsetSecond(i); |
| } |
| |
| |
| |
| template <typename Ty> |
| using Fn4Ty = std::function<void(Ty *, Ty *, unsigned, unsigned)>; |
| template <typename Ty> |
| static void checkOverlappingMemoryTwoRuntimeChecksNested(Fn4Ty<Ty> ScalarFn, |
| Fn4Ty<Ty> VectorFn, |
| const int OuterTC, |
| const int InnerTC, |
| const char *Name) { |
| std::cout << "Checking " << Name << "\n"; |
| |
| // Make sure we have enough extra elements so we can be liberal with offsets. |
| const unsigned NumArrayElements = (InnerTC * (OuterTC + 1)) * 8; |
| std::unique_ptr<Ty[]> Input1(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> Reference(new Ty[NumArrayElements]); |
| std::unique_ptr<Ty[]> ToCheck(new Ty[NumArrayElements]); |
| |
| auto CheckWithOffsetSecond = [&](int Offset) { |
| init_data(Input1, NumArrayElements); |
| for (unsigned i = 0; i < NumArrayElements; i++) { |
| Reference[i] = Input1[i]; |
| ToCheck[i] = Input1[i]; |
| } |
| |
| // Run scalar function to generate reference output. |
| Ty *ReferenceStart = &Reference[NumArrayElements / 2]; |
| ScalarFn(ReferenceStart + Offset, ReferenceStart, OuterTC, InnerTC); |
| |
| // Run vector function to generate output to check. |
| Ty *StartPtr = &ToCheck[NumArrayElements / 2]; |
| callThroughOptnone(VectorFn, StartPtr + Offset, StartPtr, OuterTC, InnerTC); |
| |
| // Compare scalar and vector output. |
| check(Reference, ToCheck, NumArrayElements, Offset); |
| }; |
| |
| // With a nested loop, sometimes the runtime checks will fail and sometimes |
| // succeed. For example, with large offsets you'd expect for the first and |
| // last one or two executions of the inner loop there is no overlap. |
| for (int i = -(2 * (InnerTC + 1)); i <= (2 * (InnerTC + 1)); i++) |
| CheckWithOffsetSecond(i); |
| } |
| |
| |
| int main(void) { |
| rng = std::mt19937(15); |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = 0; i < TC; i++) |
| A[i] = B[i] + 10; |
| ); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, step 1, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>(ScalarFn, VectorFn, |
| "1 read, 1 write, step 1, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, step 1, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = 0; i < TC; i++) |
| A[i] = B[i + 3] + 10; |
| ); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, offset 3, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, offset 3, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, offset 3, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = 3; i < TC; i++) |
| A[i] = B[i - 3] + 10; |
| ); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, offset -3, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, offset -3, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, offset -3, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = TC; i > 0; i--) |
| A[i] = B[i] + 10;); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, index count down, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, index count down, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, index count down, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = TC; i > 2; i -= 2) |
| A[i] = B[i] + 10; |
| ); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, index count down 2, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, index count down 2, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, index count down 2, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = 0, j = 0; i < TC; i++) { |
| A[i] = B[j] + 10; |
| j += 2; |
| } |
| ); |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, |
| "1 read, 1 write, 2 inductions, different steps, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, |
| "1 read, 1 write, 2 inductions, different steps, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, |
| "1 read, 1 write, 2 inductions, different steps, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| _Pragma("clang loop unroll(disable)") |
| for (unsigned i = 0; i < TC; i += 2) |
| A[i] = B[i] + 10; |
| ); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, induction increment 2, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 1 write, induction increment 2, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, |
| "1 read, 1 write, induction increment 2, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN2(, |
| for (unsigned i = 0, sum = 0; i < TC; i++) { |
| sum += A[i] + 10; |
| B[TC / 2] = sum; |
| } |
| ); |
| |
| checkOverlappingMemoryOneRuntimeCheck<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 1 write to invariant address, step 1, uint8_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 1 write to invariant address, step 1, uint32_t"); |
| checkOverlappingMemoryOneRuntimeCheck<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 1 write to invariant address, step 1, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN3( |
| for (unsigned i = 0; i < TC; i++) |
| A[i] = B[i] + C[i] + 10; |
| ); |
| |
| checkOverlappingMemoryTwoRuntimeChecks<uint32_t>( |
| ScalarFn, VectorFn, "2 reads, 1 write, simple indices, uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecks<uint8_t>( |
| ScalarFn, VectorFn, "2 reads, 1 write, simple indices, uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecks<uint64_t>( |
| ScalarFn, VectorFn, "2 reads, 1 write, simple indices, uint64_t"); |
| } |
| |
| { |
| DEFINE_SCALAR_AND_VECTOR_FN3( |
| for (unsigned i = 0; i < TC; i++) { |
| auto X = C[i] + 10; |
| A[i] = X; |
| B[i] = X + 9; |
| } |
| ); |
| |
| checkOverlappingMemoryTwoRuntimeChecks<uint8_t>( |
| ScalarFn, VectorFn, "1 read, 2 writes, simple indices, uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecks<uint32_t>( |
| ScalarFn, VectorFn, "1 read, 2 writes, simple indices, uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecks<uint64_t>( |
| ScalarFn, VectorFn, "1 read, 2 writes, simple indices, uint64_t"); |
| } |
| |
| { |
| DEFINE_NESTED_SCALAR_AND_VECTOR_FN4( |
| auto X = B[(i * OuterTC) + j]; |
| A[(i * (OuterTC + 1)) + j] = X; |
| ); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>( |
| ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>( |
| ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>( |
| ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint64_t"); |
| |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>( |
| ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>( |
| ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>( |
| ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint64_t"); |
| } |
| |
| { |
| DEFINE_NESTED_SCALAR_AND_VECTOR_FN4( |
| auto X = B[(i * OuterTC) + j]; |
| A[(i * (OuterTC + 1)) + j] += X; |
| ); |
| |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>( |
| ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>( |
| ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>( |
| ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint64_t"); |
| |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>( |
| ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>( |
| ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>( |
| ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint64_t"); |
| } |
| |
| { |
| DEFINE_NESTED_SCALAR_AND_VECTOR_FN5( |
| auto X = B[(i * OuterTC) + j]; |
| A[(i * (OuterTC + 1)) + j] = X; |
| ); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>( |
| ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>( |
| ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>( |
| ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t"); |
| } |
| |
| { |
| DEFINE_NESTED_SCALAR_AND_VECTOR_FN5( |
| auto X = B[(i * OuterTC) + j]; |
| A[(i * (OuterTC + 1)) + j] += X; |
| ); |
| |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>( |
| ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>( |
| ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t"); |
| checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>( |
| ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t"); |
| } |
| |
| return 0; |
| } |