diff --git a/MicroBenchmarks/CMakeLists.txt b/MicroBenchmarks/CMakeLists.txt
--- a/MicroBenchmarks/CMakeLists.txt
+++ b/MicroBenchmarks/CMakeLists.txt
@@ -7,4 +7,5 @@
 add_subdirectory(harris)
 add_subdirectory(ImageProcessing)
 add_subdirectory(LoopInterchange)
+add_subdirectory(LoopVectorization)
 add_subdirectory(MemFunctions)
diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -0,0 +1,18 @@
+llvm_test_run(WORKDIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Only enable verification of results if neither 'benchmarking only' has been
+# selected nor -ffast-math is passed.
+# The expansions are quoted because CMAKE_BUILD_TYPE may be empty, in which
+# case an unquoted reference would vanish from the argument list.
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UPPER)
+set(COMBINED_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${CPPFLAGS}")
+if (NOT TEST_SUITE_BENCHMARKING_ONLY AND
+    NOT "${COMBINED_CXX_FLAGS}" MATCHES ".*-ffast-math.*")
+  list(APPEND CPPFLAGS -DBENCH_AND_VERIFY)
+endif()
+
+llvm_test_executable(LoopVectorizationBenchmarks
+  main.cpp
+  MathFunctions.cpp)
+
+target_link_libraries(LoopVectorizationBenchmarks benchmark)
diff --git a/MicroBenchmarks/LoopVectorization/MathFunctions.cpp b/MicroBenchmarks/LoopVectorization/MathFunctions.cpp
new file mode 100644
--- /dev/null
+++ b/MicroBenchmarks/LoopVectorization/MathFunctions.cpp
@@ -0,0 +1,128 @@
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <random>
+
+#include "benchmark/benchmark.h"
+
+#define N 10000 // Number of elements in every benchmark array.
+
+// C[i] = Fn(A[i]) + Fn(B[i]) for all i < N, with default loop vectorization.
+template <typename T> static void run_fn_autovec(T *A, T *B, T *C, T (*Fn)(T)) {
+  for (unsigned i = 0; i < N; i++) {
+    C[i] = Fn(A[i]) + Fn(B[i]);
+  }
+}
+
+// C[i] = Fn(A[i]) + Fn(B[i]) for all i < N, with loop vectorization disabled.
+template <typename T> static void run_fn_novec(T *A, T *B, T *C, T (*Fn)(T)) {
+#pragma clang loop vectorize(disable) interleave(disable)
+  for (unsigned i = 0; i < N; i++) {
+    C[i] = Fn(A[i]) + Fn(B[i]);
+  }
+}
+
+// Fill A, B and C with fixed-seed random values drawn from [-100, 100).
+template <typename T> static void init_data(T *A, T *B, T *C) {
+  std::uniform_real_distribution<T> dist(-100, 100);
+  std::mt19937 rng(12345);
+  for (unsigned i = 0; i < N; i++) {
+    A[i] = dist(rng);
+    B[i] = dist(rng);
+    C[i] = dist(rng);
+  }
+}
+
+// Benchmark the auto-vectorized loop applying Fn to N-element arrays of T.
+template <typename T>
+static void __attribute__((always_inline))
+benchmark_fn_autovec(benchmark::State &state, T (*Fn)(T)) {
+  std::unique_ptr<T[]> A(new T[N]);
+  std::unique_ptr<T[]> B(new T[N]);
+  std::unique_ptr<T[]> C(new T[N]);
+  init_data(&A[0], &B[0], &C[0]);
+
+#ifdef BENCH_AND_VERIFY
+  // Check the vectorized results against the un-vectorized ones; exit(1) on error.
+  {
+    std::unique_ptr<T[]> CNovec(new T[N]);
+    for (unsigned i = 0; i < N; i++)
+      CNovec[i] = C[i];
+
+    run_fn_novec(&A[0], &B[0], &CNovec[0], Fn);
+    run_fn_autovec(&A[0], &B[0], &C[0], Fn);
+    for (unsigned i = 0; i < N; i++)
+      // Unequal values are accepted if their fpclassify categories match.
+      if (C[i] != CNovec[i] && fpclassify(C[i]) != fpclassify(CNovec[i])) {
+        std::cerr << "ERROR: autovec result different to scalar result " << C[i]
+                  << " != " << CNovec[i] << " at index " << i << "\n";
+        exit(1);
+      }
+  }
+#endif
+
+  for (auto _ : state) {
+    run_fn_autovec(&A[0], &B[0], &C[0], Fn);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Benchmark version using Fn with vectorization disabled.
+template <typename T>
+static void __attribute__((always_inline))
+benchmark_fn_novec(benchmark::State &state, T (*Fn)(T)) {
+  std::unique_ptr<T[]> A(new T[N]);
+  std::unique_ptr<T[]> B(new T[N]);
+  std::unique_ptr<T[]> C(new T[N]);
+  init_data(&A[0], &B[0], &C[0]);
+
+  for (auto _ : state) {
+    run_fn_novec(&A[0], &B[0], &C[0], Fn);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+  }
+}
+
+// Register an auto-vectorized and a vectorization-disabled benchmark for math
+// function fn operating on type ty; <ty> selects the overload of fn to use.
+#define ADD_BENCHMARK(fn, ty)                                                  \
+  void BENCHMARK_##fn##_autovec_##ty##_(benchmark::State &state) {             \
+    benchmark_fn_autovec<ty>(state, fn);                                       \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_##fn##_autovec_##ty##_)->Unit(benchmark::kMicrosecond);  \
+                                                                               \
+  void BENCHMARK_##fn##_novec_##ty##_(benchmark::State &state) {               \
+    benchmark_fn_novec<ty>(state, fn);                                         \
+  }                                                                            \
+  BENCHMARK(BENCHMARK_##fn##_novec_##ty##_)->Unit(benchmark::kMicrosecond);
+
+ADD_BENCHMARK(expf, float)
+ADD_BENCHMARK(exp, double)
+
+ADD_BENCHMARK(acosf, float)
+ADD_BENCHMARK(acos, double)
+
+ADD_BENCHMARK(asinf, float)
+ADD_BENCHMARK(asin, double)
+
+ADD_BENCHMARK(atanf, float)
+ADD_BENCHMARK(atan, double)
+
+ADD_BENCHMARK(cbrtf, float)
+ADD_BENCHMARK(cbrt, double)
+
+ADD_BENCHMARK(erff, float)
+ADD_BENCHMARK(erf, double)
+
+ADD_BENCHMARK(cosf, float)
+ADD_BENCHMARK(cos, double)
+
+ADD_BENCHMARK(sinf, float)
+ADD_BENCHMARK(sin, double)
+
+ADD_BENCHMARK(sinhf, float)
+ADD_BENCHMARK(sinh, double)
diff --git a/MicroBenchmarks/LoopVectorization/main.cpp b/MicroBenchmarks/LoopVectorization/main.cpp
new file mode 100644
--- /dev/null
+++ b/MicroBenchmarks/LoopVectorization/main.cpp
@@ -0,0 +1,11 @@
+#include "benchmark/benchmark.h"
+#include <cstdlib>
+
+// Entry point: hand the command line to the benchmark library, then run the
+// benchmarks it selects.
+int main(int argc, char *argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  benchmark::RunSpecifiedBenchmarks();
+  return EXIT_SUCCESS;
+}