diff --git a/MicroBenchmarks/CMakeLists.txt b/MicroBenchmarks/CMakeLists.txt
--- a/MicroBenchmarks/CMakeLists.txt
+++ b/MicroBenchmarks/CMakeLists.txt
@@ -9,3 +9,4 @@
 add_subdirectory(LoopInterchange)
 add_subdirectory(LoopVectorization)
 add_subdirectory(MemFunctions)
+add_subdirectory(SLPVectorization)
diff --git a/MicroBenchmarks/SLPVectorization/CMakeLists.txt b/MicroBenchmarks/SLPVectorization/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/MicroBenchmarks/SLPVectorization/CMakeLists.txt
@@ -0,0 +1,14 @@
+llvm_test_run(WORKDIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Try and enable C++11. Don't use C++14 because it doesn't work in some
+# configurations.
+add_cxx_compiler_flag(-std=c++11)
+if (NOT HAVE_CXX_FLAG_STD_CXX11)
+  add_cxx_compiler_flag(-std=c++0x)
+endif ()
+
+llvm_test_executable(SLPVectorizationBenchmarks
+  main.cpp
+  Versioning.cpp)
+
+target_link_libraries(SLPVectorizationBenchmarks benchmark)
diff --git a/MicroBenchmarks/SLPVectorization/Versioning.cpp b/MicroBenchmarks/SLPVectorization/Versioning.cpp
new file mode 100644
--- /dev/null
+++ b/MicroBenchmarks/SLPVectorization/Versioning.cpp
@@ -0,0 +1,246 @@
+#include <random>
+
+#include "benchmark/benchmark.h"
+
+// Each kernel below is benchmarked in three configurations:
+//  - *_runtime_checks_pass: a noinline kernel called with separate arrays.
+//  - *_runtime_checks_fail: the same kernel called with two pointers into one
+//    array.
+//  - *_no_runtime_checks_needed: the same loop written inline on local arrays.
+
+// Fill A[0..N) with pseudo-random values drawn from [-100, 100). The generator
+// is re-seeded with the same constant on every call, so every array filled by
+// this helper receives the same sequence.
+template <typename T> static void init_data(T *A, unsigned N) {
+  std::uniform_real_distribution<double> dist(-100, 100);
+  std::mt19937 rng(12345);
+  for (unsigned i = 0; i < N; i++)
+    A[i] = static_cast<T>(dist(rng));
+}
+
+// A[i] ^= B[i] for i in [0, N); not inlined.
+template <unsigned N, typename T>
+__attribute__((noinline)) void do_xor(T *A, T *B) {
+#pragma clang loop unroll(full)
+  for (unsigned i = 0; i < N; i++) {
+    A[i] ^= B[i];
+  }
+}
+
+// Runs do_xor on two separate local arrays.
+template <unsigned N, typename T>
+void benchmark_xor_runtime_checks_pass(benchmark::State &state) {
+  T A[N];
+  T B[N];
+  init_data(&A[0], N);
+  init_data(&B[0], N);
+
+  for (auto _ : state) {
+    do_xor<N>(&A[0], &B[0]);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Runs do_xor with B = A + 3, so the kernel reads A[3..N+3) while writing
+// A[0..N).
+template <unsigned N, typename T>
+void benchmark_xor_runtime_checks_fail(benchmark::State &state) {
+  static_assert(N >= 3, "reading A[3..N+3) must stay inside A[2 * N]");
+  T A[2 * N];
+  init_data(&A[0], 2 * N);
+
+  for (auto _ : state) {
+    do_xor<N>(&A[0], &A[3]);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Same xor loop as do_xor, written directly on the local arrays.
+template <unsigned N, typename T>
+void benchmark_xor_no_runtime_checks_needed(benchmark::State &state) {
+  T A[N];
+  T B[N];
+  init_data(&A[0], N);
+  init_data(&B[0], N);
+
+  for (auto _ : state) {
+#pragma clang loop unroll(full)
+    for (unsigned i = 0; i < N; i++) {
+      A[i] ^= B[i];
+    }
+
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+  }
+}
+
+BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_pass, 4, int);
+BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_fail, 4, int);
+BENCHMARK_TEMPLATE(benchmark_xor_no_runtime_checks_needed, 4, int);
+
+BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_pass, 16, int);
+BENCHMARK_TEMPLATE(benchmark_xor_runtime_checks_fail, 16, int);
+BENCHMARK_TEMPLATE(benchmark_xor_no_runtime_checks_needed, 16, int);
+
+// A[i] ^= B[i] + C[i] for i in [0, N); not inlined.
+template <unsigned N, typename T>
+__attribute__((noinline)) void do_add_xor(T *A, T *B, T *C) {
+#pragma clang loop unroll(full)
+  for (unsigned i = 0; i < N; i++) {
+    A[i] ^= B[i] + C[i];
+  }
+}
+
+// Runs do_add_xor on three separate local arrays.
+template <unsigned N, typename T>
+void benchmark_add_xor_runtime_checks_pass(benchmark::State &state) {
+  T A[N];
+  T B[N];
+  T C[N];
+  init_data(&A[0], N);
+  init_data(&B[0], N);
+  init_data(&C[0], N);
+
+  for (auto _ : state) {
+    do_add_xor<N>(&A[0], &B[0], &C[0]);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Runs do_add_xor with B = A + 3, so the kernel reads A[3..N+3) while writing
+// A[0..N); C is a separate array.
+template <unsigned N, typename T>
+void benchmark_add_xor_runtime_checks_fail(benchmark::State &state) {
+  static_assert(N >= 3, "reading A[3..N+3) must stay inside A[2 * N]");
+  T A[2 * N];
+  T B[N];
+  init_data(&A[0], 2 * N);
+  init_data(&B[0], N);
+
+  for (auto _ : state) {
+    do_add_xor<N>(&A[0], &A[3], &B[0]);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Same add/xor loop as do_add_xor, written directly on the local arrays.
+template <unsigned N, typename T>
+void benchmark_add_xor_no_runtime_checks_needed(benchmark::State &state) {
+  T A[N];
+  T B[N];
+  T C[N];
+  init_data(&A[0], N);
+  init_data(&B[0], N);
+  init_data(&C[0], N);
+
+  for (auto _ : state) {
+#pragma clang loop unroll(full)
+    for (unsigned i = 0; i < N; i++) {
+      A[i] ^= B[i] + C[i];
+    }
+
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::DoNotOptimize(C);
+    benchmark::ClobberMemory();
+  }
+}
+
+BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_pass, 4, int);
+BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_fail, 4, int);
+BENCHMARK_TEMPLATE(benchmark_add_xor_no_runtime_checks_needed, 4, int);
+BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_pass, 16, int);
+BENCHMARK_TEMPLATE(benchmark_add_xor_runtime_checks_fail, 16, int);
+BENCHMARK_TEMPLATE(benchmark_add_xor_no_runtime_checks_needed, 16, int);
+
+// A[i] += c * B[i] for i in [0, N); not inlined.
+template <unsigned N, typename T>
+__attribute__((noinline)) void do_multiply_accumulate(T *A, T *B, T c) {
+#pragma clang loop unroll(full)
+  for (unsigned i = 0; i < N; ++i) {
+    A[i] += c * B[i];
+  }
+}
+
+// Runs do_multiply_accumulate on two separate local arrays, with c = B[0].
+template <unsigned N, typename T>
+void benchmark_multiply_accumulate_runtime_checks_pass(
+    benchmark::State &state) {
+  T A[N];
+  T B[N];
+  init_data(&A[0], N);
+  init_data(&B[0], N);
+
+  for (auto _ : state) {
+    do_multiply_accumulate<N>(&A[0], &B[0], B[0]);
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Runs do_multiply_accumulate with B = A + 1, so the kernel reads A[1..N+1)
+// while writing A[0..N). A holds N + 1 elements so that the last read, A[N],
+// stays in bounds.
+template <unsigned N, typename T>
+void benchmark_multiply_accumulate_runtime_checks_fail(
+    benchmark::State &state) {
+  T A[N + 1];
+  init_data(&A[0], N + 1);
+
+  for (auto _ : state) {
+    do_multiply_accumulate<N>(&A[0], &A[1], A[0]);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+
+// Same multiply-accumulate loop, written directly on the local arrays, with
+// c = A[0] read at the start of each benchmark iteration.
+template <unsigned N, typename T>
+void benchmark_multiply_accumulate_no_runtime_checks_needed(
+    benchmark::State &state) {
+  T A[N];
+  T B[N];
+  init_data(&A[0], N);
+  init_data(&B[0], N);
+
+  for (auto _ : state) {
+    T c = A[0];
+#pragma clang loop unroll(full)
+    for (unsigned i = 0; i < N; ++i) {
+      A[i] += c * B[i];
+    }
+    benchmark::DoNotOptimize(A);
+    benchmark::DoNotOptimize(B);
+    benchmark::ClobberMemory();
+  }
+}
+
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_pass, 2,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_fail, 2,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_no_runtime_checks_needed, 2,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_pass, 3,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_fail, 3,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_no_runtime_checks_needed, 3,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_pass, 4,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_runtime_checks_fail, 4,
+                   double);
+BENCHMARK_TEMPLATE(benchmark_multiply_accumulate_no_runtime_checks_needed, 4,
+                   double);
diff --git a/MicroBenchmarks/SLPVectorization/main.cpp b/MicroBenchmarks/SLPVectorization/main.cpp
new file mode 100644
--- /dev/null
+++ b/MicroBenchmarks/SLPVectorization/main.cpp
@@ -0,0 +1,10 @@
+#include <cstdlib>
+
+#include "benchmark/benchmark.h"
+
+int main(int argc, char *argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  benchmark::RunSpecifiedBenchmarks();
+  return EXIT_SUCCESS;
+}