diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -14,6 +14,7 @@ MathFunctions.cpp RuntimeChecks.cpp VectorOperations.cpp + VectorInterleaving.cpp ) target_link_libraries(LoopVectorizationBenchmarks benchmark) diff --git a/MicroBenchmarks/LoopVectorization/VectorInterleaving.cpp b/MicroBenchmarks/LoopVectorization/VectorInterleaving.cpp new file mode 100644 --- /dev/null +++ b/MicroBenchmarks/LoopVectorization/VectorInterleaving.cpp @@ -0,0 +1,209 @@ +// This program measures the performance impact of the loop-vectorization +// interleave count across varying loop iteration counts. +#include +#include +#include + +#include "benchmark/benchmark.h" + +#define ELEMENTS 2048 +#define ALIGNED16 __attribute__((aligned(16))) + +static std::mt19937 rng; + +int A[ELEMENTS] ALIGNED16; +int B[ELEMENTS] ALIGNED16; +int C[ELEMENTS] ALIGNED16; + +// Initialize arrays A, B and C with random numbers. 
+static void init_data(unsigned N) {
+  // Fill the first N elements of A, B and C. Values are drawn from half of
+  // the int range so that B[J] + C[J] in the measured loops can never
+  // overflow (signed overflow is undefined behaviour).
+  std::uniform_int_distribution<int> distrib(
+      std::numeric_limits<int>::min() / 2, std::numeric_limits<int>::max() / 2);
+  for (unsigned I = 0; I < N; I++) {
+    A[I] = distrib(rng);
+    B[I] = distrib(rng);
+    C[I] = distrib(rng);
+  }
+}
+
+// A[J] = B[J] + C[J] for J in [0, Iterations), requesting vector width 4 with
+// interleaving disabled. Marked noinline so it stays a separate function.
+static void __attribute__((noinline)) loopWithoutInterleaving(int Iterations) {
+#pragma clang loop vectorize_width(4) interleave(disable)
+  for (int J = 0; J < Iterations; J++) {
+    A[J] = B[J] + C[J];
+  }
+}
+
+// Same loop as loopWithoutInterleaving, requesting vector width 4 and an
+// interleave count of 2.
+static void __attribute__((noinline)) loopWithInterleaving(int Iterations) {
+#pragma clang loop vectorize_width(4) interleave_count(2)
+  for (int J = 0; J < Iterations; J++) {
+    A[J] = B[J] + C[J];
+  }
+}
+
+// Same loop as loopWithoutInterleaving, requesting vector width 4 and an
+// interleave count of 4.
+static void __attribute__((noinline)) loopWithInterleaving4(int Iterations) {
+#pragma clang loop vectorize_width(4) interleave_count(4)
+  for (int J = 0; J < Iterations; J++) {
+    A[J] = B[J] + C[J];
+  }
+}
+
+// Shared benchmark driver: re-initializes all ELEMENTS entries of A, B and C,
+// then calls Fn(Iterations) once per benchmark iteration.
+//   state      - google-benchmark state object driving the timing loop.
+//   Fn         - loop variant to measure.
+//   Iterations - trip count passed through to Fn.
+static void __attribute__((always_inline))
+benchForLoopInterleaveThreshold(benchmark::State &state, void (*Fn)(int),
+                                int Iterations) {
+  init_data(ELEMENTS);
+  for (auto _ : state) {
+    // Mark the arrays as used and clobber memory before every call so the
+    // loads and stores performed by Fn cannot be optimized away.
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+    benchmark::ClobberMemory();
+    Fn(Iterations);
+  }
+}
+
+// Defines and registers three benchmarks for the trip count Itr, one per loop
+// variant: no interleaving, interleave count 2 and interleave count 4.
+#define ADD_BENCHMARK(Itr)                                                     \
+  void benchForNoInterleaveLoopTestTC##Itr(benchmark::State &state) {          \
+    benchForLoopInterleaveThreshold(state, &loopWithoutInterleaving, Itr);     \
+  }                                                                            \
+  BENCHMARK(benchForNoInterleaveLoopTestTC##Itr);                              \
+  void benchForInterleaveLoopTestTC##Itr(benchmark::State &state) {            \
+    benchForLoopInterleaveThreshold(state, &loopWithInterleaving, Itr);        \
+  }                                                                            \
+  BENCHMARK(benchForInterleaveLoopTestTC##Itr);                                \
+  void benchForInterleave4LoopTestTC##Itr(benchmark::State &state) {           \
+    benchForLoopInterleaveThreshold(state, &loopWithInterleaving4, Itr);       \
+  }                                                                            \
+  BENCHMARK(benchForInterleave4LoopTestTC##Itr);
+
+ADD_BENCHMARK(1)
+ADD_BENCHMARK(2)
+// Remaining trip counts: every value from 3 through 128, followed by the
+// larger trip counts 156, 200 and 256.
+ADD_BENCHMARK(3)
+ADD_BENCHMARK(4)
+ADD_BENCHMARK(5)
+ADD_BENCHMARK(6)
+ADD_BENCHMARK(7)
+ADD_BENCHMARK(8)
+ADD_BENCHMARK(9)
+ADD_BENCHMARK(10)
+ADD_BENCHMARK(11)
+ADD_BENCHMARK(12)
+ADD_BENCHMARK(13)
+ADD_BENCHMARK(14)
+ADD_BENCHMARK(15)
+ADD_BENCHMARK(16)
+ADD_BENCHMARK(17)
+ADD_BENCHMARK(18)
+ADD_BENCHMARK(19)
+ADD_BENCHMARK(20)
+ADD_BENCHMARK(21)
+ADD_BENCHMARK(22)
+ADD_BENCHMARK(23)
+ADD_BENCHMARK(24)
+ADD_BENCHMARK(25)
+ADD_BENCHMARK(26)
+ADD_BENCHMARK(27)
+ADD_BENCHMARK(28)
+ADD_BENCHMARK(29)
+ADD_BENCHMARK(30)
+ADD_BENCHMARK(31)
+ADD_BENCHMARK(32)
+ADD_BENCHMARK(33)
+ADD_BENCHMARK(34)
+ADD_BENCHMARK(35)
+ADD_BENCHMARK(36)
+ADD_BENCHMARK(37)
+ADD_BENCHMARK(38)
+ADD_BENCHMARK(39)
+ADD_BENCHMARK(40)
+ADD_BENCHMARK(41)
+ADD_BENCHMARK(42)
+ADD_BENCHMARK(43)
+ADD_BENCHMARK(44)
+ADD_BENCHMARK(45)
+ADD_BENCHMARK(46)
+ADD_BENCHMARK(47)
+ADD_BENCHMARK(48)
+ADD_BENCHMARK(49)
+ADD_BENCHMARK(50)
+ADD_BENCHMARK(51)
+ADD_BENCHMARK(52)
+ADD_BENCHMARK(53)
+ADD_BENCHMARK(54)
+ADD_BENCHMARK(55)
+ADD_BENCHMARK(56)
+ADD_BENCHMARK(57)
+ADD_BENCHMARK(58)
+ADD_BENCHMARK(59)
+ADD_BENCHMARK(60)
+ADD_BENCHMARK(61)
+ADD_BENCHMARK(62)
+ADD_BENCHMARK(63)
+ADD_BENCHMARK(64)
+ADD_BENCHMARK(65)
+ADD_BENCHMARK(66)
+ADD_BENCHMARK(67)
+ADD_BENCHMARK(68)
+ADD_BENCHMARK(69)
+ADD_BENCHMARK(70)
+ADD_BENCHMARK(71)
+ADD_BENCHMARK(72)
+ADD_BENCHMARK(73)
+ADD_BENCHMARK(74)
+ADD_BENCHMARK(75)
+ADD_BENCHMARK(76)
+ADD_BENCHMARK(77)
+ADD_BENCHMARK(78)
+ADD_BENCHMARK(79)
+ADD_BENCHMARK(80)
+ADD_BENCHMARK(81)
+ADD_BENCHMARK(82)
+ADD_BENCHMARK(83)
+ADD_BENCHMARK(84)
+ADD_BENCHMARK(85)
+ADD_BENCHMARK(86)
+ADD_BENCHMARK(87)
+ADD_BENCHMARK(88)
+ADD_BENCHMARK(89)
+ADD_BENCHMARK(90)
+ADD_BENCHMARK(91)
+ADD_BENCHMARK(92)
+ADD_BENCHMARK(93)
+ADD_BENCHMARK(94)
+ADD_BENCHMARK(95)
+ADD_BENCHMARK(96)
+ADD_BENCHMARK(97)
+ADD_BENCHMARK(98)
+ADD_BENCHMARK(99)
+ADD_BENCHMARK(100)
+ADD_BENCHMARK(101)
+ADD_BENCHMARK(102)
+ADD_BENCHMARK(103)
+ADD_BENCHMARK(104)
+ADD_BENCHMARK(105)
+ADD_BENCHMARK(106)
+ADD_BENCHMARK(107)
+ADD_BENCHMARK(108)
+ADD_BENCHMARK(109)
+ADD_BENCHMARK(110)
+ADD_BENCHMARK(111)
+ADD_BENCHMARK(112)
+ADD_BENCHMARK(113)
+ADD_BENCHMARK(114)
+ADD_BENCHMARK(115)
+ADD_BENCHMARK(116)
+ADD_BENCHMARK(117)
+ADD_BENCHMARK(118)
+ADD_BENCHMARK(119)
+ADD_BENCHMARK(120)
+ADD_BENCHMARK(121)
+ADD_BENCHMARK(122)
+ADD_BENCHMARK(123)
+ADD_BENCHMARK(124)
+ADD_BENCHMARK(125)
+ADD_BENCHMARK(126)
+ADD_BENCHMARK(127)
+ADD_BENCHMARK(128)
+ADD_BENCHMARK(156)
+ADD_BENCHMARK(200)
+ADD_BENCHMARK(256)