diff --git a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
--- a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
+++ b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
@@ -1,3 +1,5 @@
+// This program tests vectorized truncates & zero-extends for performance and
+// correctness
 #include <iostream>
 #include <memory>
 #include <random>
@@ -11,17 +13,27 @@
 // Initialize array A with random numbers.
 template <typename Ty>
 static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
-  std::uniform_int_distribution<uint64_t> distrib(
-      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
-  for (unsigned i = 0; i < N; i++)
-    A[i] = static_cast<Ty>(distrib(rng));
+  std::uniform_int_distribution<uint64_t> distrib(std::numeric_limits<Ty>::min(),
+                                                  std::numeric_limits<Ty>::max());
+  for (unsigned I = 0; I < N; I++)
+    A[I] = distrib(rng);
+}
+
+// Truncate/Zero-extend elements to create expected results with no
+// vectorization
+template <typename Ty1, typename Ty2> static void
+truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(disable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
 }
 
 // Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int iterations) {
+template <typename Ty1, typename Ty2> static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize_width(8) interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
   }
 }
 
@@ -29,8 +41,22 @@
 benchForTruncOrZextVecInLoopWithVW8(benchmark::State &state) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
+
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
+
+  // Check for correctness
+  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
+  truncOrZextVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C[I]) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C[I]
+                << "\n";
+      exit(1);
+    }
+  }
+
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
@@ -38,20 +64,73 @@
   }
 }
 
-// Truncate/Zero-extend each vector element in a vectorized loop
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int iterations) {
-#pragma clang loop interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+// Truncate/Zero-extend each vector element in a vectorized loop with vector width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecInLoopWithVW16(benchmark::State &state) {
+  std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
+  std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
+
+  init_data(A, ITERATIONS);
+
+  // Check for correctness
+  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
+  truncOrZextVecInLoopWithVW16(&A[0], &B[0], ITERATIONS);
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C[I]) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C[I]
+                << "\n";
+      exit(1);
+    }
+  }
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+    truncOrZextVecInLoopWithVW16(&A[0], &B[0], ITERATIONS);
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
 benchForTruncOrZextVecInLoop(benchmark::State &state) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
+
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
+
+  // Check for correctness
+  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
+  truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS);
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C[I]) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C[I]
+                << "\n";
+      exit(1);
+    }
+  }
+
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
@@ -60,21 +139,35 @@
 }
 
 // Add vectorized truncate or zero-extend operation benchmarks for different element types
-#define ADD_BENCHMARK(ty1, ty2)                                                \
-  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state);                      \
+#define ADD_BENCHMARK(ty1, ty2)                                                \
+  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(            \
+      benchmark::State &state) {                                               \
+    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state);                      \
   }                                                                            \
-  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_);      \
-  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoop<ty1, ty2>(state);                             \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_);      \
+  void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_(           \
+      benchmark::State &state) {                                               \
+    benchForTruncOrZextVecInLoopWithVW16<ty1, ty2>(state);                     \
   }                                                                            \
-  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_);             \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_);     \
+  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(                   \
+      benchmark::State &state) {                                               \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state);                             \
+  }                                                                            \
+  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_);
 
 /* Vectorized truncate operations */
-ADD_BENCHMARK(uint64_t, uint8_t)
-ADD_BENCHMARK(uint32_t, uint8_t)
 ADD_BENCHMARK(uint16_t, uint8_t)
-
+ADD_BENCHMARK(uint32_t, uint8_t)
+ADD_BENCHMARK(uint64_t, uint8_t)
+ADD_BENCHMARK(uint32_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint32_t)
 
 /* Vectorized zero extend operations */
+ADD_BENCHMARK(uint8_t, uint16_t)
 ADD_BENCHMARK(uint8_t, uint32_t)
+ADD_BENCHMARK(uint8_t, uint64_t)
+ADD_BENCHMARK(uint16_t, uint32_t)
+ADD_BENCHMARK(uint16_t, uint64_t)
+ADD_BENCHMARK(uint32_t, uint64_t)