// MicroBenchmarks/LoopVectorization/VectorOperations.cpp - third_party/llvm-test-suite - Git at Google

 // This program tests vectorized truncates & zero-extends for performance and
 // correctness
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>
 #include <type_traits>

 #include "benchmark/benchmark.h"

 // Number of elements in every benchmark array; also passed to each kernel as
 // its trip count.
 #define ITERATIONS 10000

 // Pseudo-random generator shared by all init_data calls. It is
 // default-constructed and not reseeded in the visible code, so it starts from
 // the engine's default seed.
 static std::mt19937 rng;

 // Fill the first N elements of A with uniformly distributed random values
 // spanning the full range of Ty, drawn from the file-level generator `rng`.
 //
 // std::uniform_int_distribution is only specified for short, int, long,
 // long long and their unsigned counterparts; instantiating it with a
 // character-sized type such as uint8_t is undefined behaviour and is rejected
 // by some standard libraries. The values are therefore drawn from the widest
 // integer type of matching signedness and narrowed afterwards. The bounds are
 // Ty's own limits, so the narrowing conversion never changes a value.
 template <typename Ty>
 static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
   using DistTy = typename std::conditional<std::is_signed<Ty>::value, long long,
                                            unsigned long long>::type;
   std::uniform_int_distribution<DistTy> distrib(
       std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
   for (unsigned I = 0; I < N; I++)
     A[I] = static_cast<Ty>(distrib(rng));
 }

 // Reference kernel: store the first `Iterations` elements of A into B,
 // converting each one from Ty1 to Ty2 by plain assignment (a truncation or a
 // zero-extension for the unsigned type pairs used in this file). The pragma
 // asks clang not to vectorize the loop, so the output can serve as the
 // expected result for the vectorized kernels.
 template <typename Ty1, typename Ty2>
 static void truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize(disable)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] = A[Idx];
 }

 // Conversion kernel: B[i] = A[i] for the first `Iterations` elements, with the
 // Ty1 -> Ty2 conversion done by the assignment. The pragma requests a
 // vectorization width of 8 and an interleave count of 4 from clang.
 template <typename Ty1, typename Ty2>
 static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize_width(8) interleave_count(4)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] = A[Idx];
 }

 // Conversion kernel: B[i] = A[i] for the first `Iterations` elements, with the
 // Ty1 -> Ty2 conversion done by the assignment. The pragma requests a
 // vectorization width of 16 and an interleave count of 4 from clang.
 template <typename Ty1, typename Ty2>
 static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize_width(16) interleave_count(4)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] = A[Idx];
 }

 // Conversion kernel: B[i] = A[i] for the first `Iterations` elements, with the
 // Ty1 -> Ty2 conversion done by the assignment. The pragma enables
 // vectorization without fixing a width, leaving that choice to the compiler.
 template <typename Ty1, typename Ty2>
 static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize(enable)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] = A[Idx];
 }

 // Accumulating kernel: B[i] += A[i] for the first `Iterations` elements, so
 // each sum is converted back to Ty2 on store. The pragma requests a
 // vectorization width of 8 and an interleave count of 4 from clang.
 template <typename Ty1, typename Ty2>
 static void truncOrZextVecWithAddInLoopWithVW8(const Ty1 *A, Ty2 *B,
                                                int Iterations) {
 #pragma clang loop vectorize_width(8) interleave_count(4)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] += A[Idx];
 }

 // Accumulating kernel: B[i] += A[i] for the first `Iterations` elements, so
 // each sum is converted back to Ty2 on store. The pragma requests a
 // vectorization width of 16 and an interleave count of 4 from clang.
 template <typename Ty1, typename Ty2>
 static void truncOrZextVecWithAddInLoopWithVW16(const Ty1 *A, Ty2 *B,
                                                 int Iterations) {
 #pragma clang loop vectorize_width(16) interleave_count(4)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] += A[Idx];
 }

 // Accumulating kernel: B[i] += A[i] for the first `Iterations` elements, so
 // each sum is converted back to Ty2 on store. The pragma enables
 // vectorization without fixing a width, leaving that choice to the compiler.
 template <typename Ty1, typename Ty2>
 static void truncOrZextVecWithAddInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize(enable)
   for (unsigned Idx = 0; Idx < Iterations; ++Idx)
     B[Idx] += A[Idx];
 }

 // Correctness check and timing harness for the plain conversion kernels.
 //
 // Allocates three ITERATIONS-element arrays: A (random input of type Ty1),
 // B (output of the kernel under test) and C (output of the non-vectorized
 // reference kernel). Fn is first run once and compared element by element
 // against the reference; on the first mismatch a diagnostic is written to
 // std::cerr and the process exits with status 1. After that, Fn is invoked
 // once per iteration of the benchmark loop driven by `state`.
 //
 // state: Google Benchmark state object controlling the timed loop.
 // Fn:    kernel under test, called as Fn(input, output, element count).
 template <typename Ty1, typename Ty2>
 static void __attribute__((always_inline))
 benchForTruncOrZextVecInLoop(benchmark::State &state,
                              void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
   std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);

   init_data(A, ITERATIONS);

   // Check for correctness against the non-vectorized reference.
   truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
   Fn(&A[0], &B[0], ITERATIONS);
   for (int I = 0; I < ITERATIONS; I++) {
     if (B[I] != C[I]) {
       // Unary plus promotes uint8_t operands to int so they are printed as
       // numbers rather than as raw characters; wider types are unaffected.
       std::cerr << "ERROR: Trunc or ZExt operation on " << +A[I]
                 << " is showing result " << +B[I] << " instead of " << +C[I]
                 << "\n";
       std::exit(1);
     }
   }

   for (auto _ : state) {
     // Benchmark-library barriers placed before each call, intended to keep the
     // compiler from eliding or hoisting the kernel's stores to B.
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
     Fn(&A[0], &B[0], ITERATIONS);
   }
 }

 // Timing harness for the accumulating (B[i] += A[i]) kernels.
 //
 // Allocates ITERATIONS-element arrays A (Ty1) and B (Ty2), fills both with
 // random values once, and then invokes Fn once per iteration of the benchmark
 // loop driven by `state`. B is not reset between iterations, so it keeps
 // accumulating. Unlike the plain conversion harness, no correctness check is
 // performed here.
 //
 // state: Google Benchmark state object controlling the timed loop.
 // Fn:    kernel under test, called as Fn(input, accumulator, element count).
 template <typename Ty1, typename Ty2>
 static void __attribute__((always_inline))
 benchForTruncOrZextVecWithAddInLoop(benchmark::State &state,
                                     void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
   init_data(A, ITERATIONS);
   init_data(B, ITERATIONS);
   for (auto _ : state) {
     // Benchmark-library barriers placed before each call, intended to keep the
     // compiler from eliding or hoisting the kernel's updates to B.
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
     Fn(&A[0], &B[0], ITERATIONS);
   }
 }

 // Add vectorized truncate or zero-extend operation benchmarks for one pair of
 // element types.
 //
 // ADD_BENCHMARK(ty1, ty2) defines six wrapper functions taking a
 // benchmark::State& and registers each of them with BENCHMARK:
 //   - three run benchForTruncOrZextVecInLoop<ty1, ty2> with the VW8, VW16 and
 //     default-width conversion kernels;
 //   - three run benchForTruncOrZextVecWithAddInLoop<ty1, ty2> with the VW8,
 //     VW16 and default-width accumulating kernels.
 // ty1 is the source element type and ty2 the destination element type. Both
 // are token-pasted into the wrapper names (..._From_<ty1>_To_<ty2>_), so each
 // must be a single identifier and each (ty1, ty2) pair may be used only once.
 #define ADD_BENCHMARK(ty1, ty2)                                                \
   void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(            \
       benchmark::State &state) {                                               \
     benchForTruncOrZextVecInLoop<ty1, ty2>(state,                              \
                                            &truncOrZextVecInLoopWithVW8);      \
   }                                                                            \
   BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_);      \
   void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_(           \
       benchmark::State &state) {                                               \
     benchForTruncOrZextVecInLoop<ty1, ty2>(state,                              \
                                            &truncOrZextVecInLoopWithVW16);     \
   }                                                                            \
   BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_);     \
   void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(                   \
       benchmark::State &state) {                                               \
     benchForTruncOrZextVecInLoop<ty1, ty2>(state, &truncOrZextVecInLoop);      \
   }                                                                            \
   BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_);             \
   void benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_(     \
       benchmark::State &state) {                                               \
     benchForTruncOrZextVecWithAddInLoop<ty1, ty2>(                             \
         state, &truncOrZextVecWithAddInLoopWithVW8);                           \
   }                                                                            \
   BENCHMARK(                                                                   \
       benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_);     \
   void benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_(    \
       benchmark::State &state) {                                               \
     benchForTruncOrZextVecWithAddInLoop<ty1, ty2>(                             \
         state, &truncOrZextVecWithAddInLoopWithVW16);                          \
   }                                                                            \
   BENCHMARK(                                                                   \
       benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_);    \
   void benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_(            \
       benchmark::State &state) {                                               \
     benchForTruncOrZextVecWithAddInLoop<ty1, ty2>(                             \
         state, &truncOrZextVecWithAddInLoop);                                  \
   }                                                                            \
   BENCHMARK(benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_);

 /* Vectorized truncate operations: every source type (first argument) is wider
    than its destination type (second argument). */
 ADD_BENCHMARK(uint16_t, uint8_t)
 ADD_BENCHMARK(uint32_t, uint8_t)
 ADD_BENCHMARK(uint64_t, uint8_t)
 ADD_BENCHMARK(uint32_t, uint16_t)
 ADD_BENCHMARK(uint64_t, uint16_t)
 ADD_BENCHMARK(uint64_t, uint32_t)

 /* Vectorized zero extend operations: every source type is narrower than its
    destination type; all types are unsigned, so widening zero-extends. */
 ADD_BENCHMARK(uint8_t, uint16_t)
 ADD_BENCHMARK(uint8_t, uint32_t)
 ADD_BENCHMARK(uint8_t, uint64_t)
 ADD_BENCHMARK(uint16_t, uint32_t)
 ADD_BENCHMARK(uint16_t, uint64_t)
 ADD_BENCHMARK(uint32_t, uint64_t)
	// This program tests vectorized truncates & zero-extends for performance and
	// correctness
	#include <iostream>
	#include <memory>
	#include <random>

	#include "benchmark/benchmark.h"

	// Number of elements in every benchmark array; also passed to each kernel as
	// its trip count.
	#define ITERATIONS 10000

	// Pseudo-random generator shared by all init_data calls. It is
	// default-constructed and not reseeded in the visible code, so it starts from
	// the engine's default seed.
	static std::mt19937 rng;

	// Fill the first N elements of A with uniformly distributed random values
	// spanning the full range of Ty, drawn from the file-level generator `rng`.
	//
	// std::uniform_int_distribution is only specified for short, int, long,
	// long long and their unsigned counterparts; instantiating it with a
	// character-sized type such as uint8_t is undefined behaviour and is rejected
	// by some standard libraries. The values are therefore drawn from the widest
	// integer type of matching signedness and narrowed afterwards. The bounds are
	// Ty's own limits, so the narrowing conversion never changes a value.
	template <typename Ty>
	static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
	  using DistTy = typename std::conditional<std::is_signed<Ty>::value, long long,
	                                           unsigned long long>::type;
	  std::uniform_int_distribution<DistTy> distrib(
	      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
	  for (unsigned I = 0; I < N; I++)
	    A[I] = static_cast<Ty>(distrib(rng));
	}

	// Reference kernel: store the first `Iterations` elements of A into B,
	// converting each one from Ty1 to Ty2 by plain assignment. The pragma asks
	// clang not to vectorize the loop, so the output can serve as the expected
	// result for the vectorized kernels.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements. Ty1 and Ty2 must name
	// the element types, because the benchmark harnesses take these kernels
	// through a `void (*)(const Ty1 *, Ty2 *, int)` function pointer.
	template <typename Ty1, typename Ty2>
	static void truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
	#pragma clang loop vectorize(disable)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] = A[I];
	  }
	}

	// Conversion kernel: B[i] = A[i] for the first `Iterations` elements, with the
	// Ty1 -> Ty2 conversion done by the assignment. The pragma requests a
	// vectorization width of 8 and an interleave count of 4 from clang.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements, matching the
	// `void (*)(const Ty1 *, Ty2 *, int)` kernel type used by the harnesses.
	template <typename Ty1, typename Ty2>
	static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
	#pragma clang loop vectorize_width(8) interleave_count(4)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] = A[I];
	  }
	}

	// Conversion kernel: B[i] = A[i] for the first `Iterations` elements, with the
	// Ty1 -> Ty2 conversion done by the assignment. The pragma requests a
	// vectorization width of 16 and an interleave count of 4 from clang.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements, matching the
	// `void (*)(const Ty1 *, Ty2 *, int)` kernel type used by the harnesses.
	template <typename Ty1, typename Ty2>
	static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
	#pragma clang loop vectorize_width(16) interleave_count(4)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] = A[I];
	  }
	}

	// Conversion kernel: B[i] = A[i] for the first `Iterations` elements, with the
	// Ty1 -> Ty2 conversion done by the assignment. The pragma enables
	// vectorization without fixing a width, leaving that choice to the compiler.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements, matching the
	// `void (*)(const Ty1 *, Ty2 *, int)` kernel type used by the harnesses.
	template <typename Ty1, typename Ty2>
	static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
	#pragma clang loop vectorize(enable)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] = A[I];
	  }
	}

	// Accumulating kernel: B[i] += A[i] for the first `Iterations` elements, so
	// each sum is converted back to Ty2 on store. The pragma requests a
	// vectorization width of 8 and an interleave count of 4 from clang.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements, matching the
	// `void (*)(const Ty1 *, Ty2 *, int)` kernel type used by the harnesses.
	template <typename Ty1, typename Ty2>
	static void truncOrZextVecWithAddInLoopWithVW8(const Ty1 *A, Ty2 *B,
	                                               int Iterations) {
	#pragma clang loop vectorize_width(8) interleave_count(4)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] += A[I];
	  }
	}

	// Accumulating kernel: B[i] += A[i] for the first `Iterations` elements, so
	// each sum is converted back to Ty2 on store. The pragma requests a
	// vectorization width of 16 and an interleave count of 4 from clang.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements, matching the
	// `void (*)(const Ty1 *, Ty2 *, int)` kernel type used by the harnesses.
	template <typename Ty1, typename Ty2>
	static void truncOrZextVecWithAddInLoopWithVW16(const Ty1 *A, Ty2 *B,
	                                                int Iterations) {
	#pragma clang loop vectorize_width(16) interleave_count(4)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] += A[I];
	  }
	}

	// Accumulating kernel: B[i] += A[i] for the first `Iterations` elements, so
	// each sum is converted back to Ty2 on store. The pragma enables
	// vectorization without fixing a width, leaving that choice to the compiler.
	//
	// A and B are pointers to arrays of Ty1 / Ty2 elements, matching the
	// `void (*)(const Ty1 *, Ty2 *, int)` kernel type used by the harnesses.
	template <typename Ty1, typename Ty2>
	static void truncOrZextVecWithAddInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
	#pragma clang loop vectorize(enable)
	  for (unsigned I = 0; I < Iterations; I++) {
	    B[I] += A[I];
	  }
	}

	// Correctness check and timing harness for the plain conversion kernels.
	//
	// Allocates three ITERATIONS-element arrays: A (random input of type Ty1),
	// B (output of the kernel under test) and C (output of the non-vectorized
	// reference kernel). Fn is first run once and compared element by element
	// against the reference; on the first mismatch a diagnostic is written to
	// std::cerr and the process exits with status 1. After that, Fn is invoked
	// once per iteration of the benchmark loop driven by `state`.
	//
	// state: Google Benchmark state object controlling the timed loop.
	// Fn:    pointer to the kernel under test, called as
	//        Fn(input, output, element count).
	template <typename Ty1, typename Ty2>
	static void __attribute__((always_inline))
	benchForTruncOrZextVecInLoop(benchmark::State &state,
	                             void (*Fn)(const Ty1 *, Ty2 *, int)) {
	  std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
	  std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
	  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);

	  init_data(A, ITERATIONS);

	  // Check for correctness against the non-vectorized reference.
	  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
	  Fn(&A[0], &B[0], ITERATIONS);
	  for (int I = 0; I < ITERATIONS; I++) {
	    if (B[I] != C[I]) {
	      // Unary plus promotes uint8_t operands to int so they are printed as
	      // numbers rather than as raw characters; wider types are unaffected.
	      std::cerr << "ERROR: Trunc or ZExt operation on " << +A[I]
	                << " is showing result " << +B[I] << " instead of " << +C[I]
	                << "\n";
	      std::exit(1);
	    }
	  }

	  for (auto _ : state) {
	    // Benchmark-library barriers placed before each call, intended to keep the
	    // compiler from eliding or hoisting the kernel's stores to B.
	    benchmark::DoNotOptimize(B);
	    benchmark::ClobberMemory();
	    Fn(&A[0], &B[0], ITERATIONS);
	  }
	}

	// Timing harness for the accumulating (B[i] += A[i]) kernels.
	//
	// Allocates ITERATIONS-element arrays A (Ty1) and B (Ty2), fills both with
	// random values once, and then invokes Fn once per iteration of the benchmark
	// loop driven by `state`. B is not reset between iterations, so it keeps
	// accumulating. Unlike the plain conversion harness, no correctness check is
	// performed here.
	//
	// state: Google Benchmark state object controlling the timed loop.
	// Fn:    pointer to the kernel under test, called as
	//        Fn(input, accumulator, element count).
	template <typename Ty1, typename Ty2>
	static void __attribute__((always_inline))
	benchForTruncOrZextVecWithAddInLoop(benchmark::State &state,
	                                    void (*Fn)(const Ty1 *, Ty2 *, int)) {
	  std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
	  std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
	  init_data(A, ITERATIONS);
	  init_data(B, ITERATIONS);
	  for (auto _ : state) {
	    // Benchmark-library barriers placed before each call, intended to keep the
	    // compiler from eliding or hoisting the kernel's updates to B.
	    benchmark::DoNotOptimize(B);
	    benchmark::ClobberMemory();
	    Fn(&A[0], &B[0], ITERATIONS);
	  }
	}

	// Add vectorized truncate or zero-extend operation benchmarks for one pair of
	// element types.
	//
	// ADD_BENCHMARK(ty1, ty2) defines six wrapper functions taking a
	// benchmark::State& and registers each of them with BENCHMARK:
	//   - three run benchForTruncOrZextVecInLoop<ty1, ty2> with the VW8, VW16 and
	//     default-width conversion kernels;
	//   - three run benchForTruncOrZextVecWithAddInLoop<ty1, ty2> with the VW8,
	//     VW16 and default-width accumulating kernels.
	// ty1 is the source element type and ty2 the destination element type. Both
	// are token-pasted into the wrapper names (..._From_<ty1>_To_<ty2>_), so each
	// must be a single identifier and each (ty1, ty2) pair may be used only once.
	#define ADD_BENCHMARK(ty1, ty2) \
	void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_( \
	benchmark::State &state) { \
	benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
	&truncOrZextVecInLoopWithVW8); \
	} \
	BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
	void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_( \
	benchmark::State &state) { \
	benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
	&truncOrZextVecInLoopWithVW16); \
	} \
	BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_); \
	void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \
	benchmark::State &state) { \
	benchForTruncOrZextVecInLoop<ty1, ty2>(state, &truncOrZextVecInLoop); \
	} \
	BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
	void benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_( \
	benchmark::State &state) { \
	benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
	state, &truncOrZextVecWithAddInLoopWithVW8); \
	} \
	BENCHMARK( \
	benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_); \
	void benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_( \
	benchmark::State &state) { \
	benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
	state, &truncOrZextVecWithAddInLoopWithVW16); \
	} \
	BENCHMARK( \
	benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_); \
	void benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_( \
	benchmark::State &state) { \
	benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
	state, &truncOrZextVecWithAddInLoop); \
	} \
	BENCHMARK(benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_);

	/* Vectorized truncate operations: every source type (first argument) is wider
	   than its destination type (second argument). */
	ADD_BENCHMARK(uint16_t, uint8_t)
	ADD_BENCHMARK(uint32_t, uint8_t)
	ADD_BENCHMARK(uint64_t, uint8_t)
	ADD_BENCHMARK(uint32_t, uint16_t)
	ADD_BENCHMARK(uint64_t, uint16_t)
	ADD_BENCHMARK(uint64_t, uint32_t)

	/* Vectorized zero extend operations: every source type is narrower than its
	   destination type; all types are unsigned, so widening zero-extends. */
	ADD_BENCHMARK(uint8_t, uint16_t)
	ADD_BENCHMARK(uint8_t, uint32_t)
	ADD_BENCHMARK(uint8_t, uint64_t)
	ADD_BENCHMARK(uint16_t, uint32_t)
	ADD_BENCHMARK(uint16_t, uint64_t)
	ADD_BENCHMARK(uint32_t, uint64_t)