MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp - third_party/llvm-test-suite - Git at Google

 // This program tests performance impact of Interleaving Count with varying loop
 // iteration count for different types of loops, such as loops with or
 // without reductions inside it, loops with different vectorization widths.
 #include <iostream>
 #include <limits>
 #include <memory>
 #include <random>

 #include "benchmark/benchmark.h"

 // Number of ints in each of the global work arrays declared below.
 #define ELEMENTS 2048
 // Requests 16-byte alignment for the declaration it is attached to.
 #define ALIGNED16 __attribute__((aligned(16)))

 // Default-constructed random engine used by init_data() to fill the arrays.
 static std::mt19937 rng;
 // Accumulates the value returned by every benchmarked kernel call (see
 // runBenchForLoopInterleaving()).
 unsigned int g_sum = 0;

 // Global work arrays read and written by the benchmarked loops.
 int A[ELEMENTS] ALIGNED16;
 int B[ELEMENTS] ALIGNED16;
 int C[ELEMENTS] ALIGNED16;
 int D[ELEMENTS] ALIGNED16;
 int E[ELEMENTS] ALIGNED16;
 int F[ELEMENTS] ALIGNED16;

 // Fills the first N elements of each global array (A through F) with ints
 // drawn uniformly from the full range of int. Values come from the file-level
 // `rng`; for each index they are drawn in the order A, B, C, D, E, F.
 static void init_data(unsigned N) {
   using IntLimits = std::numeric_limits<int>;
   std::uniform_int_distribution<int> FullRange(IntLimits::min(),
                                                IntLimits::max());
   // Listing the destinations in draw order keeps the random sequence each
   // array receives unchanged.
   int *const Targets[] = {A, B, C, D, E, F};
   for (unsigned Idx = 0; Idx != N; ++Idx)
     for (int *Array : Targets)
       Array[Idx] = FullRange(rng);
 }

 // Benchmark driver shared by every benchmark registered in this file.
 // Re-initializes all ELEMENTS entries of the global arrays, then on each
 // benchmark iteration calls Fn(Iterations) and adds its result to g_sum.
 //   state      - Google Benchmark state object driving the timing loop.
 //   Fn         - loop kernel under test; it receives the trip count.
 //   Iterations - trip count forwarded to Fn.
 static void __attribute__((always_inline))
 runBenchForLoopInterleaving(benchmark::State &state, int (*Fn)(int),
                             int Iterations) {
   init_data(ELEMENTS);
   for (auto _ : state) {
     // Google Benchmark's optimization barriers: presumably used here so the
     // compiler cannot carry assumptions about the array contents from one
     // timed iteration to the next.
     benchmark::DoNotOptimize(A);
     benchmark::DoNotOptimize(B);
     benchmark::DoNotOptimize(C);
     benchmark::DoNotOptimize(D);
     benchmark::DoNotOptimize(E);
     benchmark::DoNotOptimize(F);
     benchmark::ClobberMemory();
     // Fold the kernel's return value into a global so it is actually used.
     g_sum += Fn(Iterations);
   }
 }

 // Turns its argument into a string literal. The argument is stringized as
 // written (it is not macro-expanded first); the kernel macros below use it to
 // build the operand of _Pragma.
 #define STRINGIFY(Tokens) #Tokens

 // Loops without Reduction with different vectorization configurations

 // Small kernel without a reduction: A[i] = B[i] + C[i] for the first
 // `Iterations` elements. No vectorize/interleave hint is given; only
 // unrolling is disabled through the pragma. Always returns 0.
 static int __attribute__((noinline))
 loopNoReductionAutoVec(int Iterations) {
 #pragma clang loop unroll(disable)
   for (int Idx = 0; Idx < Iterations; ++Idx)
     A[Idx] = B[Idx] + C[Idx];
   return 0;
 }

 // Larger kernel without a reduction. For each of the first `Iterations`
 // elements it stores B + C into A, increments D, doubles E and divides F by 5.
 // No vectorize/interleave hint is given; only unrolling is disabled through
 // the pragma. Always returns 0.
 static int __attribute__((noinline))
 bigLoopNoReductionAutoVec(int Iterations) {
 #pragma clang loop unroll(disable)
   for (int Idx = 0; Idx < Iterations; ++Idx) {
     A[Idx] = B[Idx] + C[Idx];
     ++D[Idx];
     E[Idx] = E[Idx] * 2;
     F[Idx] = F[Idx] / 5;
   }
   return 0;
 }

 // Defines loopWithVW<vw>IC<ic>(int Iterations): the small no-reduction kernel
 // (A = B + C) whose loop carries a `clang loop` pragma requesting
 // vectorize_width(vw) and interleave_count(ic). The function returns 0.
 #define loopNoReductionWithVecHint(vw, ic)                                     \
   static int __attribute__((noinline))                                         \
   loopWithVW##vw##IC##ic(int Iterations) {                                     \
     _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))    \
     for (int Idx = 0; Idx < Iterations; ++Idx)                                 \
       A[Idx] = B[Idx] + C[Idx];                                                \
     return 0;                                                                  \
   }

 // Defines bigLoopWithVW<vw>IC<ic>(int Iterations): the larger no-reduction
 // kernel (A = B + C, D incremented, E doubled, F divided by 5) whose loop
 // carries a `clang loop` pragma requesting vectorize_width(vw) and
 // interleave_count(ic). The function returns 0.
 #define bigLoopNoReductionWithVecHint(vw, ic)                                  \
   static int __attribute__((noinline))                                         \
   bigLoopWithVW##vw##IC##ic(int Iterations) {                                  \
     _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))    \
     for (int Idx = 0; Idx < Iterations; ++Idx) {                               \
       A[Idx] = B[Idx] + C[Idx];                                                \
       ++D[Idx];                                                                \
       E[Idx] = E[Idx] * 2;                                                     \
       F[Idx] = F[Idx] / 5;                                                     \
     }                                                                          \
     return 0;                                                                  \
   }

 // Loops with Reduction with different vectorization configurations

 // Small reduction kernel: sums the first `Iterations` elements of A in an
 // unsigned accumulator and returns that sum converted to int. No
 // vectorize/interleave hint is given; only unrolling is disabled.
 static int __attribute__((noinline))
 loopWithReductionAutoVec(int Iterations) {
   unsigned Total = 0;
 #pragma clang loop unroll(disable)
   for (int Idx = 0; Idx < Iterations; ++Idx)
     Total += A[Idx];
   return Total;
 }

 // Larger reduction kernel. For each of the first `Iterations` elements it
 // adds A to an unsigned accumulator, increments D, doubles E and divides F by
 // 5; the accumulated sum is returned converted to int. No
 // vectorize/interleave hint is given; only unrolling is disabled.
 __attribute__((noinline)) static int
 bigLoopWithReductionAutoVec(int Iterations) {
   unsigned Total = 0;
 #pragma clang loop unroll(disable)
   for (int Idx = 0; Idx < Iterations; ++Idx) {
     Total += A[Idx];
     ++D[Idx];
     E[Idx] = E[Idx] * 2;
     F[Idx] = F[Idx] / 5;
   }
   return Total;
 }

 // Defines loopWithReductionWithVW<vw>IC<ic>(int Iterations): the small
 // reduction kernel (sum of A) whose loop carries a `clang loop` pragma
 // requesting vectorize_width(vw) and interleave_count(ic). The function
 // returns the unsigned sum converted to int.
 #define loopWithReductionWithVecHint(vw, ic)                                   \
   static int __attribute__((noinline))                                         \
   loopWithReductionWithVW##vw##IC##ic(int Iterations) {                        \
     unsigned Total = 0;                                                        \
     _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))    \
     for (int Idx = 0; Idx < Iterations; ++Idx)                                 \
       Total += A[Idx];                                                         \
     return Total;                                                              \
   }

 // Defines bigLoopWithReductionWithVW<vw>IC<ic>(int Iterations): the larger
 // reduction kernel (sum of A, D incremented, E doubled, F divided by 5) whose
 // loop carries a `clang loop` pragma requesting vectorize_width(vw) and
 // interleave_count(ic). The function returns the unsigned sum converted to
 // int.
 #define bigLoopWithReductionWithVecHint(vw, ic)                                \
   static int __attribute__((noinline))                                         \
   bigLoopWithReductionWithVW##vw##IC##ic(int Iterations) {                     \
     unsigned Total = 0;                                                        \
     _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))    \
     for (int Idx = 0; Idx < Iterations; ++Idx) {                               \
       Total += A[Idx];                                                         \
       ++D[Idx];                                                                \
       E[Idx] = E[Idx] * 2;                                                     \
       F[Idx] = F[Idx] / 5;                                                     \
     }                                                                          \
     return Total;                                                              \
   }

 // We are evaluating 4 types of loops for different vectorization configurations
 // 1) Loops without reductions
 // 2) Loops with reductions
 // 3) Bigger loop bodies without reductions
 // 4) Bigger loop bodies with some reductions
 // For each, we are evaluating the following vectorization configurations of
 // vectorization width (VW), interleaving count (IC):
 // 1) automatically selected by the compiler (without vectorization hint)
 // 2) VW=4, IC=1
 // 3) VW=4, IC=2
 // 4) VW=4, IC=4
 // 5) VW=1, IC=1
 // 6) VW=1, IC=2
 // 7) VW=1, IC=4
 // Of these, configurations 5-7 are skipped for loop types 1 and 3.
 // Creating a function for the above configurations with different Vectorization
 // Hints:
 // Small loop, no reduction: VW=4 with IC=1, 2, 4.
 loopNoReductionWithVecHint(4, 1);
 loopNoReductionWithVecHint(4, 2);
 loopNoReductionWithVecHint(4, 4);
 // Small loop with reduction: VW=4 and VW=1, each with IC=1, 2, 4.
 loopWithReductionWithVecHint(4, 1);
 loopWithReductionWithVecHint(4, 2);
 loopWithReductionWithVecHint(4, 4);
 loopWithReductionWithVecHint(1, 1);
 loopWithReductionWithVecHint(1, 2);
 loopWithReductionWithVecHint(1, 4);
 // Big loop, no reduction: VW=4 with IC=1, 2, 4.
 bigLoopNoReductionWithVecHint(4, 1);
 bigLoopNoReductionWithVecHint(4, 2);
 bigLoopNoReductionWithVecHint(4, 4);
 // Big loop with reduction: VW=4 and VW=1, each with IC=1, 2, 4.
 bigLoopWithReductionWithVecHint(4, 1);
 bigLoopWithReductionWithVecHint(4, 2);
 bigLoopWithReductionWithVecHint(4, 4);
 bigLoopWithReductionWithVecHint(1, 1);
 bigLoopWithReductionWithVecHint(1, 2);
 bigLoopWithReductionWithVecHint(1, 4);

 // Defines and registers (via BENCHMARK) the 22 benchmark functions for trip
 // count `Itr`. `Itr` is pasted into every function name as the suffix
 // TC<Itr> and is passed as the trip count to runBenchForLoopInterleaving().
 // Per trip count this covers:
 //   - small loop, no reduction:  auto-vectorized, VW4 with IC1/IC2/IC4
 //   - small loop with reduction: auto-vectorized, VW4 and VW1 with IC1/IC2/IC4
 //   - big loop, no reduction:    auto-vectorized, VW4 with IC1/IC2/IC4
 //   - big loop with reduction:   auto-vectorized, VW4 and VW1 with IC1/IC2/IC4
 // The big-loop auto-vectorized baseline runs bigLoopNoReductionAutoVec so it
 // measures the same loop body as the hinted big-loop variants.
 #define ADD_BENCHMARK(Itr)                                                     \
   void benchAutoVecForLoopTC##Itr(benchmark::State &state) {                   \
     runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr);          \
   }                                                                            \
   BENCHMARK(benchAutoVecForLoopTC##Itr);                                       \
   void benchForIC1VW4LoopTC##Itr(benchmark::State &state) {                    \
     runBenchForLoopInterleaving(state, &loopWithVW4IC1, Itr);                  \
   }                                                                            \
   BENCHMARK(benchForIC1VW4LoopTC##Itr);                                        \
   void benchForIC2VW4LoopTC##Itr(benchmark::State &state) {                    \
     runBenchForLoopInterleaving(state, &loopWithVW4IC2, Itr);                  \
   }                                                                            \
   BENCHMARK(benchForIC2VW4LoopTC##Itr);                                        \
   void benchForIC4VW4LoopTC##Itr(benchmark::State &state) {                    \
     runBenchForLoopInterleaving(state, &loopWithVW4IC4, Itr);                  \
   }                                                                            \
   BENCHMARK(benchForIC4VW4LoopTC##Itr);                                        \
   void benchForLoopWithReductionAutoVecTC##Itr(benchmark::State &state) {      \
     runBenchForLoopInterleaving(state, &loopWithReductionAutoVec, Itr);        \
   }                                                                            \
   BENCHMARK(benchForLoopWithReductionAutoVecTC##Itr);                          \
   void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) {       \
     runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC1, Itr);     \
   }                                                                            \
   BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr);                           \
   void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) {       \
     runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC2, Itr);     \
   }                                                                            \
   BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr);                           \
   void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) {       \
     runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC4, Itr);     \
   }                                                                            \
   BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr);                           \
   void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) {       \
     runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC1, Itr);     \
   }                                                                            \
   BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr);                           \
   void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) {       \
     runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC2, Itr);     \
   }                                                                            \
   BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr);                           \
   void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) {       \
     runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC4, Itr);     \
   }                                                                            \
   BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr);                           \
   void benchAutoVecForBigLoopTC##Itr(benchmark::State &state) {                \
     runBenchForLoopInterleaving(state, &bigLoopNoReductionAutoVec, Itr);       \
   }                                                                            \
   BENCHMARK(benchAutoVecForBigLoopTC##Itr);                                    \
   void benchForIC1VW4BigLoopTC##Itr(benchmark::State &state) {                 \
     runBenchForLoopInterleaving(state, &bigLoopWithVW4IC1, Itr);               \
   }                                                                            \
   BENCHMARK(benchForIC1VW4BigLoopTC##Itr);                                     \
   void benchForIC2VW4BigLoopTC##Itr(benchmark::State &state) {                 \
     runBenchForLoopInterleaving(state, &bigLoopWithVW4IC2, Itr);               \
   }                                                                            \
   BENCHMARK(benchForIC2VW4BigLoopTC##Itr);                                     \
   void benchForIC4VW4BigLoopTC##Itr(benchmark::State &state) {                 \
     runBenchForLoopInterleaving(state, &bigLoopWithVW4IC4, Itr);               \
   }                                                                            \
   BENCHMARK(benchForIC4VW4BigLoopTC##Itr);                                     \
   void benchForBigLoopWithReductionAutoVecTC##Itr(benchmark::State &state) {   \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionAutoVec, Itr);     \
   }                                                                            \
   BENCHMARK(benchForBigLoopWithReductionAutoVecTC##Itr);                       \
   void benchForIC1VW4BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC1, Itr);  \
   }                                                                            \
   BENCHMARK(benchForIC1VW4BigLoopWithReductionTC##Itr);                        \
   void benchForIC2VW4BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC2, Itr);  \
   }                                                                            \
   BENCHMARK(benchForIC2VW4BigLoopWithReductionTC##Itr);                        \
   void benchForIC4VW4BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC4, Itr);  \
   }                                                                            \
   BENCHMARK(benchForIC4VW4BigLoopWithReductionTC##Itr);                        \
   void benchForIC1VW1BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC1, Itr);  \
   }                                                                            \
   BENCHMARK(benchForIC1VW1BigLoopWithReductionTC##Itr);                        \
   void benchForIC2VW1BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC2, Itr);  \
   }                                                                            \
   BENCHMARK(benchForIC2VW1BigLoopWithReductionTC##Itr);                        \
   void benchForIC4VW1BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
     runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC4, Itr);  \
   }                                                                            \
   BENCHMARK(benchForIC4VW1BigLoopWithReductionTC##Itr);

 // Instantiate the benchmark set once per trip count. When ALL_LOOP_IC_TESTS
 // is defined, every trip count from 1 to 128 is covered; otherwise only the
 // subset 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 is registered.
 #ifdef ALL_LOOP_IC_TESTS
 ADD_BENCHMARK(1)
 ADD_BENCHMARK(2)
 ADD_BENCHMARK(3)
 ADD_BENCHMARK(4)
 ADD_BENCHMARK(5)
 ADD_BENCHMARK(6)
 ADD_BENCHMARK(7)
 ADD_BENCHMARK(8)
 ADD_BENCHMARK(9)
 ADD_BENCHMARK(10)
 ADD_BENCHMARK(11)
 ADD_BENCHMARK(12)
 ADD_BENCHMARK(13)
 ADD_BENCHMARK(14)
 ADD_BENCHMARK(15)
 ADD_BENCHMARK(16)
 ADD_BENCHMARK(17)
 ADD_BENCHMARK(18)
 ADD_BENCHMARK(19)
 ADD_BENCHMARK(20)
 ADD_BENCHMARK(21)
 ADD_BENCHMARK(22)
 ADD_BENCHMARK(23)
 ADD_BENCHMARK(24)
 ADD_BENCHMARK(25)
 ADD_BENCHMARK(26)
 ADD_BENCHMARK(27)
 ADD_BENCHMARK(28)
 ADD_BENCHMARK(29)
 ADD_BENCHMARK(30)
 ADD_BENCHMARK(31)
 ADD_BENCHMARK(32)
 ADD_BENCHMARK(33)
 ADD_BENCHMARK(34)
 ADD_BENCHMARK(35)
 ADD_BENCHMARK(36)
 ADD_BENCHMARK(37)
 ADD_BENCHMARK(38)
 ADD_BENCHMARK(39)
 ADD_BENCHMARK(40)
 ADD_BENCHMARK(41)
 ADD_BENCHMARK(42)
 ADD_BENCHMARK(43)
 ADD_BENCHMARK(44)
 ADD_BENCHMARK(45)
 ADD_BENCHMARK(46)
 ADD_BENCHMARK(47)
 ADD_BENCHMARK(48)
 ADD_BENCHMARK(49)
 ADD_BENCHMARK(50)
 ADD_BENCHMARK(51)
 ADD_BENCHMARK(52)
 ADD_BENCHMARK(53)
 ADD_BENCHMARK(54)
 ADD_BENCHMARK(55)
 ADD_BENCHMARK(56)
 ADD_BENCHMARK(57)
 ADD_BENCHMARK(58)
 ADD_BENCHMARK(59)
 ADD_BENCHMARK(60)
 ADD_BENCHMARK(61)
 ADD_BENCHMARK(62)
 ADD_BENCHMARK(63)
 ADD_BENCHMARK(64)
 ADD_BENCHMARK(65)
 ADD_BENCHMARK(66)
 ADD_BENCHMARK(67)
 ADD_BENCHMARK(68)
 ADD_BENCHMARK(69)
 ADD_BENCHMARK(70)
 ADD_BENCHMARK(71)
 ADD_BENCHMARK(72)
 ADD_BENCHMARK(73)
 ADD_BENCHMARK(74)
 ADD_BENCHMARK(75)
 ADD_BENCHMARK(76)
 ADD_BENCHMARK(77)
 ADD_BENCHMARK(78)
 ADD_BENCHMARK(79)
 ADD_BENCHMARK(80)
 ADD_BENCHMARK(81)
 ADD_BENCHMARK(82)
 ADD_BENCHMARK(83)
 ADD_BENCHMARK(84)
 ADD_BENCHMARK(85)
 ADD_BENCHMARK(86)
 ADD_BENCHMARK(87)
 ADD_BENCHMARK(88)
 ADD_BENCHMARK(89)
 ADD_BENCHMARK(90)
 ADD_BENCHMARK(91)
 ADD_BENCHMARK(92)
 ADD_BENCHMARK(93)
 ADD_BENCHMARK(94)
 ADD_BENCHMARK(95)
 ADD_BENCHMARK(96)
 ADD_BENCHMARK(97)
 ADD_BENCHMARK(98)
 ADD_BENCHMARK(99)
 ADD_BENCHMARK(100)
 ADD_BENCHMARK(101)
 ADD_BENCHMARK(102)
 ADD_BENCHMARK(103)
 ADD_BENCHMARK(104)
 ADD_BENCHMARK(105)
 ADD_BENCHMARK(106)
 ADD_BENCHMARK(107)
 ADD_BENCHMARK(108)
 ADD_BENCHMARK(109)
 ADD_BENCHMARK(110)
 ADD_BENCHMARK(111)
 ADD_BENCHMARK(112)
 ADD_BENCHMARK(113)
 ADD_BENCHMARK(114)
 ADD_BENCHMARK(115)
 ADD_BENCHMARK(116)
 ADD_BENCHMARK(117)
 ADD_BENCHMARK(118)
 ADD_BENCHMARK(119)
 ADD_BENCHMARK(120)
 ADD_BENCHMARK(121)
 ADD_BENCHMARK(122)
 ADD_BENCHMARK(123)
 ADD_BENCHMARK(124)
 ADD_BENCHMARK(125)
 ADD_BENCHMARK(126)
 ADD_BENCHMARK(127)
 ADD_BENCHMARK(128)
 #else
 ADD_BENCHMARK(1)
 ADD_BENCHMARK(2)
 ADD_BENCHMARK(3)
 ADD_BENCHMARK(4)
 ADD_BENCHMARK(7)
 ADD_BENCHMARK(8)
 ADD_BENCHMARK(15)
 ADD_BENCHMARK(16)
 ADD_BENCHMARK(31)
 ADD_BENCHMARK(32)
 ADD_BENCHMARK(63)
 ADD_BENCHMARK(64)
 ADD_BENCHMARK(127)
 ADD_BENCHMARK(128)
 #endif
	// This program tests performance impact of Interleaving Count with varying loop
	// iteration count for different types of loops, such as loops with or
	// without reductions inside it, loops with different vectorization widths.
	#include <iostream>
	#include <memory>
	#include <random>

	#include "benchmark/benchmark.h"

	// Number of ints in each of the global work arrays declared below.
	#define ELEMENTS 2048
	// Requests 16-byte alignment for the declaration it is attached to.
	#define ALIGNED16 __attribute__((aligned(16)))

	// Default-constructed random engine used by init_data() to fill the arrays.
	static std::mt19937 rng;
	// Accumulates the value returned by every benchmarked kernel call (see
	// runBenchForLoopInterleaving()).
	unsigned int g_sum = 0;

	// Global work arrays read and written by the benchmarked loops.
	int A[ELEMENTS] ALIGNED16;
	int B[ELEMENTS] ALIGNED16;
	int C[ELEMENTS] ALIGNED16;
	int D[ELEMENTS] ALIGNED16;
	int E[ELEMENTS] ALIGNED16;
	int F[ELEMENTS] ALIGNED16;

	// Fills the first N elements of each global array (A through F) with ints
	// drawn uniformly from the full range of int. Values come from the
	// file-level `rng`; for each index they are drawn in the order A, B, C, D,
	// E, F.
	static void init_data(unsigned N) {
	  using IntLimits = std::numeric_limits<int>;
	  std::uniform_int_distribution<int> FullRange(IntLimits::min(),
	                                               IntLimits::max());
	  // Listing the destinations in draw order keeps the random sequence each
	  // array receives unchanged.
	  int *const Targets[] = {A, B, C, D, E, F};
	  for (unsigned Idx = 0; Idx != N; ++Idx)
	    for (int *Array : Targets)
	      Array[Idx] = FullRange(rng);
	}

	// Benchmark driver shared by every benchmark registered in this file.
	// Re-initializes all ELEMENTS entries of the global arrays, then on each
	// benchmark iteration calls Fn(Iterations) and adds its result to g_sum.
	//   state      - Google Benchmark state object driving the timing loop.
	//   Fn         - loop kernel under test; it receives the trip count.
	//   Iterations - trip count forwarded to Fn.
	static void __attribute__((always_inline))
	runBenchForLoopInterleaving(benchmark::State &state, int (*Fn)(int),
	                            int Iterations) {
	  init_data(ELEMENTS);
	  for (auto _ : state) {
	    // Google Benchmark's optimization barriers: presumably used here so the
	    // compiler cannot carry assumptions about the array contents from one
	    // timed iteration to the next.
	    benchmark::DoNotOptimize(A);
	    benchmark::DoNotOptimize(B);
	    benchmark::DoNotOptimize(C);
	    benchmark::DoNotOptimize(D);
	    benchmark::DoNotOptimize(E);
	    benchmark::DoNotOptimize(F);
	    benchmark::ClobberMemory();
	    // Fold the kernel's return value into a global so it is actually used.
	    g_sum += Fn(Iterations);
	  }
	}

	// Turns its argument into a string literal. The argument is stringized as
	// written (it is not macro-expanded first); the kernel macros below use it
	// to build the operand of _Pragma.
	#define STRINGIFY(Tokens) #Tokens

	// Loops without Reduction with different vectorization configurations

	// Small kernel without a reduction: A[i] = B[i] + C[i] for the first
	// `Iterations` elements. No vectorize/interleave hint is given; only
	// unrolling is disabled through the pragma. Always returns 0.
	static int __attribute__((noinline))
	loopNoReductionAutoVec(int Iterations) {
	#pragma clang loop unroll(disable)
	  for (int Idx = 0; Idx < Iterations; ++Idx)
	    A[Idx] = B[Idx] + C[Idx];
	  return 0;
	}

	// Larger kernel without a reduction. For each of the first `Iterations`
	// elements it stores B + C into A, increments D, doubles E and divides F by
	// 5. No vectorize/interleave hint is given; only unrolling is disabled
	// through the pragma. Always returns 0.
	static int __attribute__((noinline))
	bigLoopNoReductionAutoVec(int Iterations) {
	#pragma clang loop unroll(disable)
	  for (int Idx = 0; Idx < Iterations; ++Idx) {
	    A[Idx] = B[Idx] + C[Idx];
	    ++D[Idx];
	    E[Idx] = E[Idx] * 2;
	    F[Idx] = F[Idx] / 5;
	  }
	  return 0;
	}

	// Defines loopWithVW<vw>IC<ic>(int Iterations): the small no-reduction
	// kernel (A = B + C) whose loop carries a `clang loop` pragma requesting
	// vectorize_width(vw) and interleave_count(ic). The function returns 0.
	#define loopNoReductionWithVecHint(vw, ic)                                   \
	  static int __attribute__((noinline))                                       \
	  loopWithVW##vw##IC##ic(int Iterations) {                                   \
	    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))  \
	    for (int Idx = 0; Idx < Iterations; ++Idx)                               \
	      A[Idx] = B[Idx] + C[Idx];                                              \
	    return 0;                                                                \
	  }

	// Defines bigLoopWithVW<vw>IC<ic>(int Iterations): the larger no-reduction
	// kernel (A = B + C, D incremented, E doubled, F divided by 5) whose loop
	// carries a `clang loop` pragma requesting vectorize_width(vw) and
	// interleave_count(ic). The function returns 0.
	#define bigLoopNoReductionWithVecHint(vw, ic)                                \
	  static int __attribute__((noinline))                                       \
	  bigLoopWithVW##vw##IC##ic(int Iterations) {                                \
	    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))  \
	    for (int Idx = 0; Idx < Iterations; ++Idx) {                             \
	      A[Idx] = B[Idx] + C[Idx];                                              \
	      ++D[Idx];                                                              \
	      E[Idx] = E[Idx] * 2;                                                   \
	      F[Idx] = F[Idx] / 5;                                                   \
	    }                                                                        \
	    return 0;                                                                \
	  }

	// Loops with Reduction with different vectorization configurations

	// Small reduction kernel: sums the first `Iterations` elements of A in an
	// unsigned accumulator and returns that sum converted to int. No
	// vectorize/interleave hint is given; only unrolling is disabled.
	static int __attribute__((noinline))
	loopWithReductionAutoVec(int Iterations) {
	  unsigned Total = 0;
	#pragma clang loop unroll(disable)
	  for (int Idx = 0; Idx < Iterations; ++Idx)
	    Total += A[Idx];
	  return Total;
	}

	// Larger reduction kernel. For each of the first `Iterations` elements it
	// adds A to an unsigned accumulator, increments D, doubles E and divides F
	// by 5; the accumulated sum is returned converted to int. No
	// vectorize/interleave hint is given; only unrolling is disabled.
	__attribute__((noinline)) static int
	bigLoopWithReductionAutoVec(int Iterations) {
	  unsigned Total = 0;
	#pragma clang loop unroll(disable)
	  for (int Idx = 0; Idx < Iterations; ++Idx) {
	    Total += A[Idx];
	    ++D[Idx];
	    E[Idx] = E[Idx] * 2;
	    F[Idx] = F[Idx] / 5;
	  }
	  return Total;
	}

	// Defines loopWithReductionWithVW<vw>IC<ic>(int Iterations): the small
	// reduction kernel (sum of A) whose loop carries a `clang loop` pragma
	// requesting vectorize_width(vw) and interleave_count(ic). The function
	// returns the unsigned sum converted to int.
	#define loopWithReductionWithVecHint(vw, ic)                                 \
	  static int __attribute__((noinline))                                       \
	  loopWithReductionWithVW##vw##IC##ic(int Iterations) {                      \
	    unsigned Total = 0;                                                      \
	    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))  \
	    for (int Idx = 0; Idx < Iterations; ++Idx)                               \
	      Total += A[Idx];                                                       \
	    return Total;                                                            \
	  }

	// Defines bigLoopWithReductionWithVW<vw>IC<ic>(int Iterations): the larger
	// reduction kernel (sum of A, D incremented, E doubled, F divided by 5)
	// whose loop carries a `clang loop` pragma requesting vectorize_width(vw)
	// and interleave_count(ic). The function returns the unsigned sum converted
	// to int.
	#define bigLoopWithReductionWithVecHint(vw, ic)                              \
	  static int __attribute__((noinline))                                       \
	  bigLoopWithReductionWithVW##vw##IC##ic(int Iterations) {                   \
	    unsigned Total = 0;                                                      \
	    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(ic)))  \
	    for (int Idx = 0; Idx < Iterations; ++Idx) {                             \
	      Total += A[Idx];                                                       \
	      ++D[Idx];                                                              \
	      E[Idx] = E[Idx] * 2;                                                   \
	      F[Idx] = F[Idx] / 5;                                                   \
	    }                                                                        \
	    return Total;                                                            \
	  }

	// We are evaluating 4 types of loops for different vectorization configurations
	// 1) Loops without reductions
	// 2) Loops with reductions
	// 3) Bigger loop bodies without reductions
	// 4) Bigger loop bodies with some reductions
	// For each, we are evaluating the following vectorization configurations of
	// vectorization width (VW), interleaving count (IC):
	// 1) automatically selected by the compiler (without vectorization hint)
	// 2) VW=4, IC=1
	// 3) VW=4, IC=2
	// 4) VW=4, IC=4
	// 5) VW=1, IC=1
	// 6) VW=1, IC=2
	// 7) VW=1, IC=4
	// Of these, configurations 5-7 are skipped for loop types 1 and 3.
	// Creating a function for the above configurations with different Vectorization
	// Hints:
	// Small loop, no reduction: VW=4 with IC=1, 2, 4.
	loopNoReductionWithVecHint(4, 1);
	loopNoReductionWithVecHint(4, 2);
	loopNoReductionWithVecHint(4, 4);
	// Small loop with reduction: VW=4 and VW=1, each with IC=1, 2, 4.
	loopWithReductionWithVecHint(4, 1);
	loopWithReductionWithVecHint(4, 2);
	loopWithReductionWithVecHint(4, 4);
	loopWithReductionWithVecHint(1, 1);
	loopWithReductionWithVecHint(1, 2);
	loopWithReductionWithVecHint(1, 4);
	// Big loop, no reduction: VW=4 with IC=1, 2, 4.
	bigLoopNoReductionWithVecHint(4, 1);
	bigLoopNoReductionWithVecHint(4, 2);
	bigLoopNoReductionWithVecHint(4, 4);
	// Big loop with reduction: VW=4 and VW=1, each with IC=1, 2, 4.
	bigLoopWithReductionWithVecHint(4, 1);
	bigLoopWithReductionWithVecHint(4, 2);
	bigLoopWithReductionWithVecHint(4, 4);
	bigLoopWithReductionWithVecHint(1, 1);
	bigLoopWithReductionWithVecHint(1, 2);
	bigLoopWithReductionWithVecHint(1, 4);

	// Defines and registers (via BENCHMARK) the 22 benchmark functions for trip
	// count `Itr`. `Itr` is pasted into every function name as the suffix
	// TC<Itr> and is passed as the trip count to runBenchForLoopInterleaving().
	// Per trip count this covers:
	//   - small loop, no reduction:  auto-vectorized, VW4 with IC1/IC2/IC4
	//   - small loop with reduction: auto-vectorized, VW4 and VW1 with IC1/IC2/IC4
	//   - big loop, no reduction:    auto-vectorized, VW4 with IC1/IC2/IC4
	//   - big loop with reduction:   auto-vectorized, VW4 and VW1 with IC1/IC2/IC4
	// The big-loop auto-vectorized baseline runs bigLoopNoReductionAutoVec so it
	// measures the same loop body as the hinted big-loop variants.
	#define ADD_BENCHMARK(Itr) \
	  void benchAutoVecForLoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \
	  } \
	  BENCHMARK(benchAutoVecForLoopTC##Itr); \
	  void benchForIC1VW4LoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithVW4IC1, Itr); \
	  } \
	  BENCHMARK(benchForIC1VW4LoopTC##Itr); \
	  void benchForIC2VW4LoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithVW4IC2, Itr); \
	  } \
	  BENCHMARK(benchForIC2VW4LoopTC##Itr); \
	  void benchForIC4VW4LoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithVW4IC4, Itr); \
	  } \
	  BENCHMARK(benchForIC4VW4LoopTC##Itr); \
	  void benchForLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionAutoVec, Itr); \
	  } \
	  BENCHMARK(benchForLoopWithReductionAutoVecTC##Itr); \
	  void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC1, Itr); \
	  } \
	  BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr); \
	  void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC2, Itr); \
	  } \
	  BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr); \
	  void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC4, Itr); \
	  } \
	  BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr); \
	  void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC1, Itr); \
	  } \
	  BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr); \
	  void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC2, Itr); \
	  } \
	  BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr); \
	  void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC4, Itr); \
	  } \
	  BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr); \
	  void benchAutoVecForBigLoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopNoReductionAutoVec, Itr); \
	  } \
	  BENCHMARK(benchAutoVecForBigLoopTC##Itr); \
	  void benchForIC1VW4BigLoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithVW4IC1, Itr); \
	  } \
	  BENCHMARK(benchForIC1VW4BigLoopTC##Itr); \
	  void benchForIC2VW4BigLoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithVW4IC2, Itr); \
	  } \
	  BENCHMARK(benchForIC2VW4BigLoopTC##Itr); \
	  void benchForIC4VW4BigLoopTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithVW4IC4, Itr); \
	  } \
	  BENCHMARK(benchForIC4VW4BigLoopTC##Itr); \
	  void benchForBigLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionAutoVec, Itr); \
	  } \
	  BENCHMARK(benchForBigLoopWithReductionAutoVecTC##Itr); \
	  void benchForIC1VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC1, Itr); \
	  } \
	  BENCHMARK(benchForIC1VW4BigLoopWithReductionTC##Itr); \
	  void benchForIC2VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC2, Itr); \
	  } \
	  BENCHMARK(benchForIC2VW4BigLoopWithReductionTC##Itr); \
	  void benchForIC4VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC4, Itr); \
	  } \
	  BENCHMARK(benchForIC4VW4BigLoopWithReductionTC##Itr); \
	  void benchForIC1VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC1, Itr); \
	  } \
	  BENCHMARK(benchForIC1VW1BigLoopWithReductionTC##Itr); \
	  void benchForIC2VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC2, Itr); \
	  } \
	  BENCHMARK(benchForIC2VW1BigLoopWithReductionTC##Itr); \
	  void benchForIC4VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \
	    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC4, Itr); \
	  } \
	  BENCHMARK(benchForIC4VW1BigLoopWithReductionTC##Itr);

	// Instantiate the benchmark set once per trip count. When ALL_LOOP_IC_TESTS
	// is defined, every trip count from 1 to 128 is covered; otherwise only the
	// subset 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 is registered.
	#ifdef ALL_LOOP_IC_TESTS
	ADD_BENCHMARK(1)
	ADD_BENCHMARK(2)
	ADD_BENCHMARK(3)
	ADD_BENCHMARK(4)
	ADD_BENCHMARK(5)
	ADD_BENCHMARK(6)
	ADD_BENCHMARK(7)
	ADD_BENCHMARK(8)
	ADD_BENCHMARK(9)
	ADD_BENCHMARK(10)
	ADD_BENCHMARK(11)
	ADD_BENCHMARK(12)
	ADD_BENCHMARK(13)
	ADD_BENCHMARK(14)
	ADD_BENCHMARK(15)
	ADD_BENCHMARK(16)
	ADD_BENCHMARK(17)
	ADD_BENCHMARK(18)
	ADD_BENCHMARK(19)
	ADD_BENCHMARK(20)
	ADD_BENCHMARK(21)
	ADD_BENCHMARK(22)
	ADD_BENCHMARK(23)
	ADD_BENCHMARK(24)
	ADD_BENCHMARK(25)
	ADD_BENCHMARK(26)
	ADD_BENCHMARK(27)
	ADD_BENCHMARK(28)
	ADD_BENCHMARK(29)
	ADD_BENCHMARK(30)
	ADD_BENCHMARK(31)
	ADD_BENCHMARK(32)
	ADD_BENCHMARK(33)
	ADD_BENCHMARK(34)
	ADD_BENCHMARK(35)
	ADD_BENCHMARK(36)
	ADD_BENCHMARK(37)
	ADD_BENCHMARK(38)
	ADD_BENCHMARK(39)
	ADD_BENCHMARK(40)
	ADD_BENCHMARK(41)
	ADD_BENCHMARK(42)
	ADD_BENCHMARK(43)
	ADD_BENCHMARK(44)
	ADD_BENCHMARK(45)
	ADD_BENCHMARK(46)
	ADD_BENCHMARK(47)
	ADD_BENCHMARK(48)
	ADD_BENCHMARK(49)
	ADD_BENCHMARK(50)
	ADD_BENCHMARK(51)
	ADD_BENCHMARK(52)
	ADD_BENCHMARK(53)
	ADD_BENCHMARK(54)
	ADD_BENCHMARK(55)
	ADD_BENCHMARK(56)
	ADD_BENCHMARK(57)
	ADD_BENCHMARK(58)
	ADD_BENCHMARK(59)
	ADD_BENCHMARK(60)
	ADD_BENCHMARK(61)
	ADD_BENCHMARK(62)
	ADD_BENCHMARK(63)
	ADD_BENCHMARK(64)
	ADD_BENCHMARK(65)
	ADD_BENCHMARK(66)
	ADD_BENCHMARK(67)
	ADD_BENCHMARK(68)
	ADD_BENCHMARK(69)
	ADD_BENCHMARK(70)
	ADD_BENCHMARK(71)
	ADD_BENCHMARK(72)
	ADD_BENCHMARK(73)
	ADD_BENCHMARK(74)
	ADD_BENCHMARK(75)
	ADD_BENCHMARK(76)
	ADD_BENCHMARK(77)
	ADD_BENCHMARK(78)
	ADD_BENCHMARK(79)
	ADD_BENCHMARK(80)
	ADD_BENCHMARK(81)
	ADD_BENCHMARK(82)
	ADD_BENCHMARK(83)
	ADD_BENCHMARK(84)
	ADD_BENCHMARK(85)
	ADD_BENCHMARK(86)
	ADD_BENCHMARK(87)
	ADD_BENCHMARK(88)
	ADD_BENCHMARK(89)
	ADD_BENCHMARK(90)
	ADD_BENCHMARK(91)
	ADD_BENCHMARK(92)
	ADD_BENCHMARK(93)
	ADD_BENCHMARK(94)
	ADD_BENCHMARK(95)
	ADD_BENCHMARK(96)
	ADD_BENCHMARK(97)
	ADD_BENCHMARK(98)
	ADD_BENCHMARK(99)
	ADD_BENCHMARK(100)
	ADD_BENCHMARK(101)
	ADD_BENCHMARK(102)
	ADD_BENCHMARK(103)
	ADD_BENCHMARK(104)
	ADD_BENCHMARK(105)
	ADD_BENCHMARK(106)
	ADD_BENCHMARK(107)
	ADD_BENCHMARK(108)
	ADD_BENCHMARK(109)
	ADD_BENCHMARK(110)
	ADD_BENCHMARK(111)
	ADD_BENCHMARK(112)
	ADD_BENCHMARK(113)
	ADD_BENCHMARK(114)
	ADD_BENCHMARK(115)
	ADD_BENCHMARK(116)
	ADD_BENCHMARK(117)
	ADD_BENCHMARK(118)
	ADD_BENCHMARK(119)
	ADD_BENCHMARK(120)
	ADD_BENCHMARK(121)
	ADD_BENCHMARK(122)
	ADD_BENCHMARK(123)
	ADD_BENCHMARK(124)
	ADD_BENCHMARK(125)
	ADD_BENCHMARK(126)
	ADD_BENCHMARK(127)
	ADD_BENCHMARK(128)
	#else
	ADD_BENCHMARK(1)
	ADD_BENCHMARK(2)
	ADD_BENCHMARK(3)
	ADD_BENCHMARK(4)
	ADD_BENCHMARK(7)
	ADD_BENCHMARK(8)
	ADD_BENCHMARK(15)
	ADD_BENCHMARK(16)
	ADD_BENCHMARK(31)
	ADD_BENCHMARK(32)
	ADD_BENCHMARK(63)
	ADD_BENCHMARK(64)
	ADD_BENCHMARK(127)
	ADD_BENCHMARK(128)
	#endif