diff --git a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
--- a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
+++ b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
@@ -1,20 +1,17 @@
+// This program tests vectorized truncates & zero-extends for performance and
+// correctness
 #include <iostream>
 #include <memory>
-#include <random>
 
 #include "benchmark/benchmark.h"
 
 #define ITERATIONS 10000
 
-static std::mt19937 rng;
-
-// Initialize array A with random numbers.
+// Initialize array A with the maximum value for its type
 template <typename Ty>
 static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
-  std::uniform_int_distribution<uint64_t> distrib(
-      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
   for (unsigned i = 0; i < N; i++)
-    A[i] = static_cast<Ty>(distrib(rng));
+    A[i] = std::numeric_limits<Ty>::max();
 }
 
 // Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8
@@ -30,51 +27,88 @@
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
 
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
     truncOrZextVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
   }
+
+  // Create expected result of the Trunc or ZExt operation
+  Ty2 C;
+  if (sizeof(Ty1) > sizeof(Ty2)) {
+    C = std::numeric_limits<Ty2>::max();
+  } else {
+    C = std::numeric_limits<Ty1>::max();
+  }
+
+  // Check for correctness
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C << "\n";
+      exit(1);
+    }
+  }
 }
 
 // Truncate/Zero-extend each vector element in a vectorized loop
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int iterations) {
-#pragma clang loop interleave_count(4)
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
   for (unsigned i = 0; i < iterations; i++) {
     B[i] = A[i];
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoop(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecInLoopWithVW16(benchmark::State &state) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
 
   for (auto _ : state) {
    benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS);
+    truncOrZextVecInLoopWithVW16(&A[0], &B[0], ITERATIONS);
+  }
+
+  // Create expected result of the Trunc or ZExt operation
+  Ty2 C;
+  if (sizeof(Ty1) > sizeof(Ty2)) {
+    C = std::numeric_limits<Ty2>::max();
+  } else {
+    C = std::numeric_limits<Ty1>::max();
+  }
+
+  // Check for correctness
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C << "\n";
+      exit(1);
+    }
   }
 }
 // Add vectorized truncate or zero-extend operation benchmarks for different element types
-#define ADD_BENCHMARK(ty1, ty2)                                                               \
-  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state);                                     \
+#define ADD_BENCHMARK(ty1, ty2)                                                \
+  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(            \
+      benchmark::State &state) {                                               \
+    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state);                      \
  }                                                                            \
-  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_);                     \
-  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) {        \
-    benchForTruncOrZextVecInLoop<ty1, ty2>(state);                                            \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_);      \
+  void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_(           \
+      benchmark::State &state) {                                               \
+    benchForTruncOrZextVecInLoopWithVW16<ty1, ty2>(state);                     \
  }                                                                            \
-  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_);                            \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_);
 
 /* Vectorized truncate operations */
 ADD_BENCHMARK(uint64_t, uint8_t)
 ADD_BENCHMARK(uint32_t, uint8_t)
 ADD_BENCHMARK(uint16_t, uint8_t)
-
 /* Vectorized zero extend operations */
+ADD_BENCHMARK(uint8_t, uint16_t)
 ADD_BENCHMARK(uint8_t, uint32_t)
+ADD_BENCHMARK(uint8_t, uint64_t)
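
For reference, below is a self-contained sketch of the trunc-then-check pattern this patch adds, written against the stock Google Benchmark API and the clang loop pragmas. The function names, the kN constant, and the uint64_t-to-uint8_t type pair are illustrative only and are not part of the patch; they stand in for one instantiation of the ADD_BENCHMARK machinery above.

// Illustrative sketch only; assumes Google Benchmark headers are available.
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <memory>

#include "benchmark/benchmark.h"

static constexpr int kN = 10000; // illustrative; the patch uses ITERATIONS

// Same shape as truncOrZextVecInLoopWithVW16: request vectorization width 16
// and interleave count 4 for the element-wise conversion loop.
static void truncU64ToU8(const uint64_t *A, uint8_t *B, int N) {
#pragma clang loop vectorize_width(16) interleave_count(4)
  for (int i = 0; i < N; i++)
    B[i] = A[i];
}

static void benchTruncU64ToU8(benchmark::State &state) {
  std::unique_ptr<uint64_t[]> A(new uint64_t[kN]);
  std::unique_ptr<uint8_t[]> B(new uint8_t[kN]);
  for (int i = 0; i < kN; i++)
    A[i] = std::numeric_limits<uint64_t>::max();

  for (auto _ : state) {
    benchmark::DoNotOptimize(B);
    benchmark::ClobberMemory();
    truncU64ToU8(&A[0], &B[0], kN);
  }

  // Truncating an all-ones uint64_t must leave uint8_t's max value in B.
  for (int i = 0; i < kN; i++) {
    if (B[i] != std::numeric_limits<uint8_t>::max()) {
      std::cerr << "ERROR: unexpected trunc result at index " << i << "\n";
      std::exit(1);
    }
  }
}
BENCHMARK(benchTruncU64ToU8);

BENCHMARK_MAIN(); // standalone entry point for this sketch only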