Index: MicroBenchmarks/LoopVectorization/CMakeLists.txt
===================================================================
--- MicroBenchmarks/LoopVectorization/CMakeLists.txt
+++ MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -13,6 +13,7 @@
   main.cpp
   MathFunctions.cpp
   RuntimeChecks.cpp
+  VectorOperations.cpp
 )
 
 target_link_libraries(LoopVectorizationBenchmarks benchmark)
Index: MicroBenchmarks/LoopVectorization/VectorOperations.cpp
===================================================================
--- /dev/null
+++ MicroBenchmarks/LoopVectorization/VectorOperations.cpp
@@ -0,0 +1,87 @@
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <random>
+
+#include "benchmark/benchmark.h"
+
+#define ITERATIONS 10000
+
+static std::mt19937 rng;
+
+// Initialize the first N elements of array A with random numbers covering the
+// full range of Ty. The distribution is instantiated with uint64_t because
+// std::uniform_int_distribution is not specified for 8-bit integer types.
+template <typename Ty>
+static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
+  std::uniform_int_distribution<uint64_t> distrib(
+      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
+  for (unsigned i = 0; i < N; i++)
+    A[i] = static_cast<Ty>(distrib(rng));
+}
+
+// Truncate each element of A from T to uint8_t and add it to the matching
+// element of B, in a vectorized loop with vectorization width 8.
+template <typename T>
+static void truncVecInLoopWithVW8(const T *A, uint8_t *B, int iterations) {
+#pragma clang loop vectorize_width(8) interleave_count(4)
+  for (unsigned i = 0; i < iterations; i++) {
+    B[i] += A[i];
+  }
+}
+
+// Run truncVecInLoopWithVW8 on ITERATIONS random elements per benchmark pass.
+template <typename Ty>
+static void __attribute__((always_inline))
+benchForTruncVecInLoopWithVW8(benchmark::State &state) {
+  std::unique_ptr<Ty[]> A(new Ty[ITERATIONS]);
+  std::unique_ptr<uint8_t[]> B(new uint8_t[ITERATIONS]);
+  init_data(A, ITERATIONS);
+  init_data(B, ITERATIONS);
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+    truncVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
+  }
+}
+
+// Truncate each element of A from T to uint8_t and add it to the matching
+// element of B, in a vectorized loop with vectorization width 16.
+template <typename T>
+static void truncVecInLoopWithVW16(const T *A, uint8_t *B, int iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned i = 0; i < iterations; i++) {
+    B[i] += A[i];
+  }
+}
+
+// Run truncVecInLoopWithVW16 on ITERATIONS random elements per benchmark pass.
+template <typename Ty>
+static void __attribute__((always_inline))
+benchForTruncVecInLoopWithVW16(benchmark::State &state) {
+  std::unique_ptr<Ty[]> A(new Ty[ITERATIONS]);
+  std::unique_ptr<uint8_t[]> B(new uint8_t[ITERATIONS]);
+  init_data(A, ITERATIONS);
+  init_data(B, ITERATIONS);
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+    truncVecInLoopWithVW16(&A[0], &B[0], ITERATIONS);
+  }
+}
+
+// Add vectorized truncate operation benchmarks for different types. Each use
+// defines and registers one wrapper per vectorization width (8 and 16).
+#define ADD_BENCHMARK(ty) \
+  void benchForTruncVecInLoopFor_##ty##_(benchmark::State &state) { \
+    benchForTruncVecInLoopWithVW8<ty>(state); \
+  } \
+  BENCHMARK(benchForTruncVecInLoopFor_##ty##_); \
+  void benchForTruncVecInLoopWithVW16For_##ty##_(benchmark::State &state) { \
+    benchForTruncVecInLoopWithVW16<ty>(state); \
+  } \
+  BENCHMARK(benchForTruncVecInLoopWithVW16For_##ty##_);
+
+ADD_BENCHMARK(uint64_t)
+ADD_BENCHMARK(uint32_t)
+ADD_BENCHMARK(uint16_t)