diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -14,6 +14,7 @@ MathFunctions.cpp RuntimeChecks.cpp VectorOperations.cpp + VectorInterleaving.cpp ) target_link_libraries(LoopVectorizationBenchmarks benchmark) diff --git a/MicroBenchmarks/LoopVectorization/VectorInterleaving.cpp b/MicroBenchmarks/LoopVectorization/VectorInterleaving.cpp new file mode 100644 --- /dev/null +++ b/MicroBenchmarks/LoopVectorization/VectorInterleaving.cpp @@ -0,0 +1,441 @@ +// This program tests performance impact of Interleaving Count with varying loop +// iteration count for different types of loop, for example loops with or +// without reduction inside it, loops with different vectorization widths etc. +#include +#include +#include + +#include "benchmark/benchmark.h" + +#define ELEMENTS 2048 +#define ALIGNED16 __attribute__((aligned(16))) + +static std::mt19937 rng; +unsigned int g_sum = 0; + +int A[ELEMENTS] ALIGNED16; +int B[ELEMENTS] ALIGNED16; +int C[ELEMENTS] ALIGNED16; + +// Initialize arrays with random numbers. +static void init_data(unsigned N) { + std::uniform_int_distribution distrib(std::numeric_limits::min(), + std::numeric_limits::max()); + for (unsigned I = 0; I < N; I++) { + A[I] = distrib(rng); + B[I] = distrib(rng); + C[I] = distrib(rng); + } +} + +static void __attribute__((noinline)) loopWithVW4IC1(int Iterations) { +#pragma clang loop vectorize_width(4) interleave(disable) + for (int J = 0; J < Iterations; J++) { + A[J] = B[J] + C[J]; + } +} + +static void __attribute__((noinline)) loopWithVW4IC2(int Iterations) { +#pragma clang loop vectorize_width(4) interleave_count(2) + for (int J = 0; J < Iterations; J++) { + A[J] = B[J] + C[J]; + } +} + +static void __attribute__((noinline)) loopWithVW4IC4(int Iterations) { +#pragma clang loop vectorize_width(4) interleave_count(4) + for (int J = 0; J < Iterations; J++) { + A[J] = B[J] + C[J]; + } +} + +static int __attribute__((noinline)) +loopWithReductionWithVW4IC1(int Iterations) { + unsigned sum = 0; +#pragma clang loop vectorize_width(4) interleave(disable) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static int __attribute__((noinline)) +loopWithReductionWithVW4IC2(int Iterations) { + unsigned sum = 0; +#pragma clang loop vectorize_width(4) interleave_count(2) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static int __attribute__((noinline)) +loopWithReductionWithVW4IC4(int Iterations) { + unsigned sum = 0; +#pragma clang loop vectorize_width(4) interleave_count(4) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static int __attribute__((noinline)) +loopWithReductionWithVW1IC1(int Iterations) { + unsigned sum = 0; +#pragma clang loop vectorize_width(1) interleave_count(1) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static int __attribute__((noinline)) +loopWithReductionWithVW1IC2(int Iterations) { + unsigned sum = 0; +#pragma clang loop vectorize_width(1) interleave_count(2) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static int __attribute__((noinline)) +loopWithReductionWithVW1IC4(int Iterations) { + unsigned sum = 0; +#pragma clang loop vectorize_width(1) interleave_count(4) + for (int J = 0; J < Iterations; J++) { + sum += A[J]; + } + return sum; +} + +static void __attribute__((always_inline)) +benchForLoopInterleaveThreshold(benchmark::State &state, void (*Fn)(int), + int Iterations) { + std::uniform_int_distribution distrib(std::numeric_limits::min(), + std::numeric_limits::max()); + init_data(ELEMENTS); + for (auto _ : state) { + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + benchmark::ClobberMemory(); + Fn(Iterations); + } +} + +static void __attribute__((always_inline)) +benchForWithReductionLoopInterleaveThreshold(benchmark::State &state, + int (*Fn)(int), int Iterations) { + std::uniform_int_distribution distrib(std::numeric_limits::min(), + std::numeric_limits::max()); + init_data(ELEMENTS); + for (auto _ : state) { + benchmark::DoNotOptimize(A); + benchmark::DoNotOptimize(B); + benchmark::DoNotOptimize(C); + benchmark::ClobberMemory(); + g_sum += Fn(Iterations); + } +} + +#define ADD_BENCHMARK(Itr) \ + void benchForIC1VW4LoopTC##Itr(benchmark::State &state) { \ + benchForLoopInterleaveThreshold(state, &loopWithVW4IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW4LoopTC##Itr); \ + void benchForIC2VW4LoopTC##Itr(benchmark::State &state) { \ + benchForLoopInterleaveThreshold(state, &loopWithVW4IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW4LoopTC##Itr); \ + void benchForIC4VW4LoopTC##Itr(benchmark::State &state) { \ + benchForLoopInterleaveThreshold(state, &loopWithVW4IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW4LoopTC##Itr); \ + void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ + benchForWithReductionLoopInterleaveThreshold( \ + state, &loopWithReductionWithVW4IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr); \ + void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ + benchForWithReductionLoopInterleaveThreshold( \ + state, &loopWithReductionWithVW4IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr); \ + void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ + benchForWithReductionLoopInterleaveThreshold( \ + state, &loopWithReductionWithVW4IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr); \ + void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ + benchForWithReductionLoopInterleaveThreshold( \ + state, &loopWithReductionWithVW1IC1, Itr); \ + } \ + BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr); \ + void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ + benchForWithReductionLoopInterleaveThreshold( \ + state, &loopWithReductionWithVW1IC2, Itr); \ + } \ + BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr); \ + void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ + benchForWithReductionLoopInterleaveThreshold( \ + state, &loopWithReductionWithVW1IC4, Itr); \ + } \ + BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr); + +ADD_BENCHMARK(1) +ADD_BENCHMARK(2) +ADD_BENCHMARK(3) +ADD_BENCHMARK(4) +ADD_BENCHMARK(5) +ADD_BENCHMARK(6) +ADD_BENCHMARK(7) +ADD_BENCHMARK(8) +ADD_BENCHMARK(9) +ADD_BENCHMARK(10) +ADD_BENCHMARK(11) +ADD_BENCHMARK(12) +ADD_BENCHMARK(13) +ADD_BENCHMARK(14) +ADD_BENCHMARK(15) +ADD_BENCHMARK(16) +ADD_BENCHMARK(17) +ADD_BENCHMARK(18) +ADD_BENCHMARK(19) +ADD_BENCHMARK(20) +ADD_BENCHMARK(21) +ADD_BENCHMARK(22) +ADD_BENCHMARK(23) +ADD_BENCHMARK(24) +ADD_BENCHMARK(25) +ADD_BENCHMARK(26) +ADD_BENCHMARK(27) +ADD_BENCHMARK(28) +ADD_BENCHMARK(29) +ADD_BENCHMARK(30) +ADD_BENCHMARK(31) +ADD_BENCHMARK(32) +ADD_BENCHMARK(33) +ADD_BENCHMARK(34) +ADD_BENCHMARK(35) +ADD_BENCHMARK(36) +ADD_BENCHMARK(37) +ADD_BENCHMARK(38) +ADD_BENCHMARK(39) +ADD_BENCHMARK(40) +ADD_BENCHMARK(41) +ADD_BENCHMARK(42) +ADD_BENCHMARK(43) +ADD_BENCHMARK(44) +ADD_BENCHMARK(45) +ADD_BENCHMARK(46) +ADD_BENCHMARK(47) +ADD_BENCHMARK(48) +ADD_BENCHMARK(49) +ADD_BENCHMARK(50) +ADD_BENCHMARK(51) +ADD_BENCHMARK(52) +ADD_BENCHMARK(53) +ADD_BENCHMARK(54) +ADD_BENCHMARK(55) +ADD_BENCHMARK(56) +ADD_BENCHMARK(57) +ADD_BENCHMARK(58) +ADD_BENCHMARK(59) +ADD_BENCHMARK(60) +ADD_BENCHMARK(61) +ADD_BENCHMARK(62) +ADD_BENCHMARK(63) +ADD_BENCHMARK(64) +ADD_BENCHMARK(65) +ADD_BENCHMARK(66) +ADD_BENCHMARK(67) +ADD_BENCHMARK(68) +ADD_BENCHMARK(69) +ADD_BENCHMARK(70) +ADD_BENCHMARK(71) +ADD_BENCHMARK(72) +ADD_BENCHMARK(73) +ADD_BENCHMARK(74) +ADD_BENCHMARK(75) +ADD_BENCHMARK(76) +ADD_BENCHMARK(77) +ADD_BENCHMARK(78) +ADD_BENCHMARK(79) +ADD_BENCHMARK(80) +ADD_BENCHMARK(81) +ADD_BENCHMARK(82) +ADD_BENCHMARK(83) +ADD_BENCHMARK(84) +ADD_BENCHMARK(85) +ADD_BENCHMARK(86) +ADD_BENCHMARK(87) +ADD_BENCHMARK(88) +ADD_BENCHMARK(89) +ADD_BENCHMARK(90) +ADD_BENCHMARK(91) +ADD_BENCHMARK(92) +ADD_BENCHMARK(93) +ADD_BENCHMARK(94) +ADD_BENCHMARK(95) +ADD_BENCHMARK(96) +ADD_BENCHMARK(97) +ADD_BENCHMARK(98) +ADD_BENCHMARK(99) +ADD_BENCHMARK(100) +ADD_BENCHMARK(101) +ADD_BENCHMARK(102) +ADD_BENCHMARK(103) +ADD_BENCHMARK(104) +ADD_BENCHMARK(105) +ADD_BENCHMARK(106) +ADD_BENCHMARK(107) +ADD_BENCHMARK(108) +ADD_BENCHMARK(109) +ADD_BENCHMARK(110) +ADD_BENCHMARK(111) +ADD_BENCHMARK(112) +ADD_BENCHMARK(113) +ADD_BENCHMARK(114) +ADD_BENCHMARK(115) +ADD_BENCHMARK(116) +ADD_BENCHMARK(117) +ADD_BENCHMARK(118) +ADD_BENCHMARK(119) +ADD_BENCHMARK(120) +ADD_BENCHMARK(121) +ADD_BENCHMARK(122) +ADD_BENCHMARK(123) +ADD_BENCHMARK(124) +ADD_BENCHMARK(125) +ADD_BENCHMARK(126) +ADD_BENCHMARK(127) +ADD_BENCHMARK(128) +ADD_BENCHMARK(129) +ADD_BENCHMARK(130) +ADD_BENCHMARK(131) +ADD_BENCHMARK(132) +ADD_BENCHMARK(133) +ADD_BENCHMARK(134) +ADD_BENCHMARK(135) +ADD_BENCHMARK(136) +ADD_BENCHMARK(137) +ADD_BENCHMARK(138) +ADD_BENCHMARK(139) +ADD_BENCHMARK(140) +ADD_BENCHMARK(141) +ADD_BENCHMARK(142) +ADD_BENCHMARK(143) +ADD_BENCHMARK(144) +ADD_BENCHMARK(145) +ADD_BENCHMARK(146) +ADD_BENCHMARK(147) +ADD_BENCHMARK(148) +ADD_BENCHMARK(149) +ADD_BENCHMARK(150) +ADD_BENCHMARK(151) +ADD_BENCHMARK(152) +ADD_BENCHMARK(153) +ADD_BENCHMARK(154) +ADD_BENCHMARK(155) +ADD_BENCHMARK(156) +ADD_BENCHMARK(157) +ADD_BENCHMARK(158) +ADD_BENCHMARK(159) +ADD_BENCHMARK(160) +ADD_BENCHMARK(161) +ADD_BENCHMARK(162) +ADD_BENCHMARK(163) +ADD_BENCHMARK(164) +ADD_BENCHMARK(165) +ADD_BENCHMARK(166) +ADD_BENCHMARK(167) +ADD_BENCHMARK(168) +ADD_BENCHMARK(169) +ADD_BENCHMARK(170) +ADD_BENCHMARK(171) +ADD_BENCHMARK(172) +ADD_BENCHMARK(173) +ADD_BENCHMARK(174) +ADD_BENCHMARK(175) +ADD_BENCHMARK(176) +ADD_BENCHMARK(177) +ADD_BENCHMARK(178) +ADD_BENCHMARK(179) +ADD_BENCHMARK(180) +ADD_BENCHMARK(181) +ADD_BENCHMARK(182) +ADD_BENCHMARK(183) +ADD_BENCHMARK(184) +ADD_BENCHMARK(185) +ADD_BENCHMARK(186) +ADD_BENCHMARK(187) +ADD_BENCHMARK(188) +ADD_BENCHMARK(189) +ADD_BENCHMARK(190) +ADD_BENCHMARK(191) +ADD_BENCHMARK(192) +ADD_BENCHMARK(193) +ADD_BENCHMARK(194) +ADD_BENCHMARK(195) +ADD_BENCHMARK(196) +ADD_BENCHMARK(197) +ADD_BENCHMARK(198) +ADD_BENCHMARK(199) +ADD_BENCHMARK(200) +ADD_BENCHMARK(201) +ADD_BENCHMARK(202) +ADD_BENCHMARK(203) +ADD_BENCHMARK(204) +ADD_BENCHMARK(205) +ADD_BENCHMARK(206) +ADD_BENCHMARK(207) +ADD_BENCHMARK(208) +ADD_BENCHMARK(209) +ADD_BENCHMARK(210) +ADD_BENCHMARK(211) +ADD_BENCHMARK(212) +ADD_BENCHMARK(213) +ADD_BENCHMARK(214) +ADD_BENCHMARK(215) +ADD_BENCHMARK(216) +ADD_BENCHMARK(217) +ADD_BENCHMARK(218) +ADD_BENCHMARK(219) +ADD_BENCHMARK(220) +ADD_BENCHMARK(221) +ADD_BENCHMARK(222) +ADD_BENCHMARK(223) +ADD_BENCHMARK(224) +ADD_BENCHMARK(225) +ADD_BENCHMARK(226) +ADD_BENCHMARK(227) +ADD_BENCHMARK(228) +ADD_BENCHMARK(229) +ADD_BENCHMARK(230) +ADD_BENCHMARK(231) +ADD_BENCHMARK(232) +ADD_BENCHMARK(233) +ADD_BENCHMARK(234) +ADD_BENCHMARK(235) +ADD_BENCHMARK(236) +ADD_BENCHMARK(237) +ADD_BENCHMARK(238) +ADD_BENCHMARK(239) +ADD_BENCHMARK(240) +ADD_BENCHMARK(241) +ADD_BENCHMARK(242) +ADD_BENCHMARK(243) +ADD_BENCHMARK(244) +ADD_BENCHMARK(245) +ADD_BENCHMARK(246) +ADD_BENCHMARK(247) +ADD_BENCHMARK(248) +ADD_BENCHMARK(249) +ADD_BENCHMARK(250) +ADD_BENCHMARK(251) +ADD_BENCHMARK(252) +ADD_BENCHMARK(253) +ADD_BENCHMARK(254) +ADD_BENCHMARK(255) +ADD_BENCHMARK(256)