diff --git a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
--- a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
+++ b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp
@@ -1,3 +1,5 @@
+// This program tests vectorized truncates & zero-extends for performance and
+// correctness
 #include <iostream>
 #include <limits>
 #include <memory>
@@ -11,70 +13,178 @@
 // Initialize array A with random numbers.
 template <typename Ty>
 static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
-  std::uniform_int_distribution<Ty> distrib(
-      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
-  for (unsigned i = 0; i < N; i++)
-    A[i] = static_cast<Ty>(distrib(rng));
+  std::uniform_int_distribution<Ty> distrib(std::numeric_limits<Ty>::min(),
+                                            std::numeric_limits<Ty>::max());
+  for (unsigned I = 0; I < N; I++)
+    A[I] = distrib(rng);
+}
+
+// Truncate/Zero-extend elements to create expected results with no
+// vectorization
+template <typename Ty1, typename Ty2>
+static void truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(disable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
 }
 
 // Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int iterations) {
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize_width(8) interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop with
+// vectorization width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+// with vectorization width 8
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoopWithVW8(const Ty1 *A, Ty2 *B,
+                                               int Iterations) {
+#pragma clang loop vectorize_width(8) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+// vectorization width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoopWithVW16(const Ty1 *A, Ty2 *B,
+                                                int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoopWithVW8(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecInLoop(benchmark::State &state,
+                             void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
+
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
+
+  // Check for correctness
+  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
+  Fn(&A[0], &B[0], ITERATIONS);
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C[I]) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C[I]
+                << "\n";
+      exit(1);
+    }
+  }
+
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
-  }
-}
-
-// Truncate/Zero-extend each vector element in a vectorized loop
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int iterations) {
-#pragma clang loop interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+    Fn(&A[0], &B[0], ITERATIONS);
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoop(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecWithAddInLoop(benchmark::State &state,
+                                    void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
   init_data(A, ITERATIONS);
   init_data(B, ITERATIONS);
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS);
+    Fn(&A[0], &B[0], ITERATIONS);
   }
 }
 
 // Add vectorized truncate or zero-extend operation benchmarks for different element types
-#define ADD_BENCHMARK(ty1, ty2) \
-  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state); \
+#define ADD_BENCHMARK(ty1, ty2) \
+  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
+                                           &truncOrZextVecInLoopWithVW8); \
   } \
-  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
-  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoop<ty1, ty2>(state); \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
+                                           &truncOrZextVecInLoopWithVW16); \
  } \
-  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, &truncOrZextVecInLoop); \
+  } \
+  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoopWithVW8); \
+  } \
+  BENCHMARK( \
+      benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoopWithVW16); \
+  } \
+  BENCHMARK( \
+      benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoop); \
+  } \
+  BENCHMARK(benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_);
 
 /* Vectorized truncate operations */
-ADD_BENCHMARK(uint64_t, uint8_t)
-ADD_BENCHMARK(uint32_t, uint8_t)
 ADD_BENCHMARK(uint16_t, uint8_t)
-
+ADD_BENCHMARK(uint32_t, uint8_t)
+ADD_BENCHMARK(uint64_t, uint8_t)
+ADD_BENCHMARK(uint32_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint32_t)
 
 /* Vectorized zero extend operations */
+ADD_BENCHMARK(uint8_t, uint16_t)
 ADD_BENCHMARK(uint8_t, uint32_t)
+ADD_BENCHMARK(uint8_t, uint64_t)
+ADD_BENCHMARK(uint16_t, uint32_t)
+ADD_BENCHMARK(uint16_t, uint64_t)
+ADD_BENCHMARK(uint32_t, uint64_t)
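
Note: each ADD_BENCHMARK(ty1, ty2) invocation above registers six Google Benchmark entries for the type pair: the plain, vectorize_width(8), and vectorize_width(16) variants of the assignment loop, plus the three corresponding += variants. As a rough sketch (not part of the patch), the first registration produced by ADD_BENCHMARK(uint64_t, uint8_t) expands to approximately the following, assuming the explicit <ty1, ty2> instantiation written in the macro body:

void benchForTruncOrZextVecInLoopWithVW8From_uint64_t_To_uint8_t_(
    benchmark::State &state) {
  // Runs the correctness check against truncOrZextWithNoVec, then times the
  // vectorize_width(8) truncation loop through the Fn function pointer.
  benchForTruncOrZextVecInLoop<uint64_t, uint8_t>(
      state, &truncOrZextVecInLoopWithVW8);
}
BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_uint64_t_To_uint8_t_);

The token-pasted function name is what BENCHMARK reports, so results are grouped by source and destination element type in the benchmark output.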