diff --git a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp --- a/MicroBenchmarks/LoopVectorization/VectorOperations.cpp +++ b/MicroBenchmarks/LoopVectorization/VectorOperations.cpp @@ -8,13 +8,19 @@ static std::mt19937 rng; -// Initialize array A with random numbers. +// Initialize array A with random numbers or the maximum for that type template -static void init_data(const std::unique_ptr &A, unsigned N) { - std::uniform_int_distribution distrib( - std::numeric_limits::min(), std::numeric_limits::max()); - for (unsigned i = 0; i < N; i++) - A[i] = static_cast(distrib(rng)); +static void init_data(const std::unique_ptr &A, unsigned N, + bool Randomize = true) { + if (Randomize) { + std::uniform_int_distribution distrib(std::numeric_limits::min(), + std::numeric_limits::max()); + for (unsigned i = 0; i < N; i++) + A[i] = distrib(rng); + } else { + for (unsigned i = 0; i < N; i++) + A[i] = std::numeric_limits::max(); + } } // Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8 @@ -30,7 +36,6 @@ std::unique_ptr A(new Ty1[ITERATIONS]); std::unique_ptr B(new Ty2[ITERATIONS]); init_data(A, ITERATIONS); - init_data(B, ITERATIONS); for (auto _ : state) { benchmark::DoNotOptimize(B); benchmark::ClobberMemory(); @@ -51,7 +56,6 @@ std::unique_ptr A(new Ty1[ITERATIONS]); std::unique_ptr B(new Ty2[ITERATIONS]); init_data(A, ITERATIONS); - init_data(B, ITERATIONS); for (auto _ : state) { benchmark::DoNotOptimize(B); benchmark::ClobberMemory(); @@ -59,16 +63,55 @@ } } +// Check correctness of Truncate/Zero-extend operation of each vector element in +// a vectorized loop +template +static void __attribute__((always_inline)) +correctnessTestForTruncOrZextVecInLoop(benchmark::State &state) { + std::unique_ptr A(new Ty1[ITERATIONS]); + std::unique_ptr B(new Ty2[ITERATIONS]); + // Create source vector elements with maximum value for the type + init_data(A, ITERATIONS, false); + for (auto _ : state) { + benchmark::DoNotOptimize(B); + benchmark::ClobberMemory(); + truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS); + } + + // Create expected result of the Trunc or ZExt operation + Ty2 C; + if (sizeof(Ty1) > sizeof(Ty2)) { + C = std::numeric_limits::max(); + } else { + C = std::numeric_limits::max(); + } + + for (int I = 0; I < ITERATIONS; I++) { + if (B[I] != C) { + std::cerr << "ERROR: Trunc or ZExt operation on " << A[I] + << " is showing result " << B[I] << " instead of " << C << "\n"; + exit(1); + } + } +} + // Add vectorized truncate or zero-extend operation benchmarks for different element types -#define ADD_BENCHMARK(ty1, ty2) \ - void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \ - benchForTruncOrZextVecInLoopWithVW8(state); \ +#define ADD_BENCHMARK(ty1, ty2) \ + void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_( \ + benchmark::State &state) { \ + benchForTruncOrZextVecInLoopWithVW8(state); \ + } \ + BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \ + void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \ + benchmark::State &state) { \ + benchForTruncOrZextVecInLoop(state); \ } \ - BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \ - void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) { \ - benchForTruncOrZextVecInLoop(state); \ + BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \ + void correctnessTestForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \ + benchmark::State &state) { \ + correctnessTestForTruncOrZextVecInLoop(state); \ } \ - BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \ + BENCHMARK(correctnessTestForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); /* Vectorized truncate operations */ ADD_BENCHMARK(uint64_t, uint8_t) @@ -77,4 +120,6 @@ /* Vectorized zero extend operations */ +ADD_BENCHMARK(uint8_t, uint16_t) ADD_BENCHMARK(uint8_t, uint32_t) +ADD_BENCHMARK(uint8_t, uint64_t)