Index: test-suite/trunk/MicroBenchmarks/CMakeLists.txt =================================================================== --- test-suite/trunk/MicroBenchmarks/CMakeLists.txt +++ test-suite/trunk/MicroBenchmarks/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(harris) add_subdirectory(ImageProcessing) add_subdirectory(LoopInterchange) +add_subdirectory(MemFunctions) Index: test-suite/trunk/MicroBenchmarks/MemFunctions/CMakeLists.txt =================================================================== --- test-suite/trunk/MicroBenchmarks/MemFunctions/CMakeLists.txt +++ test-suite/trunk/MicroBenchmarks/MemFunctions/CMakeLists.txt @@ -0,0 +1,5 @@ +llvm_test_run(WORKDIR ${CMAKE_CURRENT_BINARY_DIR}) + +llvm_test_executable(MemFunctions main.cpp) + +target_link_libraries(MemFunctions benchmark) Index: test-suite/trunk/MicroBenchmarks/MemFunctions/main.cpp =================================================================== --- test-suite/trunk/MicroBenchmarks/MemFunctions/main.cpp +++ test-suite/trunk/MicroBenchmarks/MemFunctions/main.cpp @@ -0,0 +1,127 @@ +//===- main.cpp - Memory Functions Benchmarks -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Memory functions (memcmp, memcpy, ...) are typically recognized by the +// compiler and expanded to specific asm patterns when the size is known at +// compile time. These microbenchmarks help catch potential CodeGen regressions. +// +// Note that these microbenchmarks do not represent a typical real-life +// situation. They are designed to test the LLVM CodeGen. In particular, +// real-life applications will typically be memory- rather than compute-bound +// when manipulating memory. 
+// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "benchmark/benchmark.h" + +// Benchmarks `memcmp(p, q, size) OP 0` where `size` is known at compile time +// and OP is defined by `Pred`. The compiler typically inlines the +// memcmp + comparison to loads and compares. +template +void BM_MemCmp(benchmark::State& state) { + // The buffer size should be large enough that there are several elements in a + // buffer, but small enough to fit in cache. Also, the smaller the buffer + // size, the more latitude the benchmark framework has to adjust the number of + // iterations to make benchmarking faster. + static constexpr size_t kMaxBufSizeBytes = 4096; + constexpr const size_t kNumElements = kMaxBufSizeBytes / kSize; + + std::vector p_storage(kNumElements * kSize); + std::vector q_storage(kNumElements * kSize); + char* p = p_storage.data(); + const char* q = q_storage.data(); + + // We're comparing an all-zeros buffer (q) vs a buffer (p) that is all zeros + // except for at most one byte per element. That byte is determined by `Mod`. + for (int i = 0; i < kNumElements; ++i) /* NOTE(review): int compared against size_t */ + Mod().template Change(p + i * kSize); + + benchmark::DoNotOptimize(p); + benchmark::DoNotOptimize(q); + + for (auto _ : state) { + benchmark::ClobberMemory(); + benchmark::ClobberMemory(); /* NOTE(review): repeats the call above; looks redundant - confirm */ + + for (int i = 0; i < kNumElements; ++i) { + int res = Pred()(memcmp(p + i * kSize, q + i * kSize, kSize)); + benchmark::DoNotOptimize(res); + } + } + state.SetBytesProcessed(p_storage.size() * state.iterations()); +} + +// Predicates applied to the memcmp result (the `OP 0` part). +struct EqZero { + bool operator()(int v) const { return v == 0; } +}; +struct LessThanZero { + bool operator()(int v) const { return v < 0; } +}; +struct GreaterThanZero { + bool operator()(int v) const { return v > 0; } +}; + +// Functors that make the first/mid/last byte of an element non-zero, or none. 
+struct None { /* Changes nothing. */ + template + void Change(char* const p) const {} +}; +struct First { /* Sets the first byte to 128. */ + template + void Change(char* const p) const { + p[0] = 128; /* NOTE(review): 128 is out of range if char is signed; presumably stored as byte 0x80 - confirm */ + } +}; +struct Mid { /* Sets byte kSize / 2 to 128. */ + template + void Change(char* const p) const { + p[kSize / 2] = 128; + } +}; +struct Last { /* Sets the last byte (kSize - 1) to 128. */ + template + void Change(char* const p) const { + p[kSize - 1] = 128; + } +}; +/* Registers one BM_MemCmp instantiation, with times reported in nanoseconds. */ +#define MEMCMP_BENCHMARK_PRED_CHANGE(size, pred, change) \ + BENCHMARK_TEMPLATE(BM_MemCmp, size, pred, change) \ + ->Unit(benchmark::kNanosecond); +/* Registers all four functors for one (size, pred). NOTE(review): the macro above already ends in ';', so each use expands to ';;'. */ +#define MEMCMP_BENCHMARK_PRED(size, pred) \ + MEMCMP_BENCHMARK_PRED_CHANGE(size, pred, None); \ + MEMCMP_BENCHMARK_PRED_CHANGE(size, pred, First); \ + MEMCMP_BENCHMARK_PRED_CHANGE(size, pred, Mid); \ + MEMCMP_BENCHMARK_PRED_CHANGE(size, pred, Last); +/* Registers all three predicates for one size, i.e. twelve benchmarks per size. */ +#define MEMCMP_BENCHMARK(size) \ + MEMCMP_BENCHMARK_PRED(size, EqZero) \ + MEMCMP_BENCHMARK_PRED(size, LessThanZero) \ + MEMCMP_BENCHMARK_PRED(size, GreaterThanZero) +/* Sizes 1 to 8, then 16, 32 and 64 together with the value just below each. */ +MEMCMP_BENCHMARK(1) +MEMCMP_BENCHMARK(2) +MEMCMP_BENCHMARK(3) +MEMCMP_BENCHMARK(4) +MEMCMP_BENCHMARK(5) +MEMCMP_BENCHMARK(6) +MEMCMP_BENCHMARK(7) +MEMCMP_BENCHMARK(8) +MEMCMP_BENCHMARK(15) +MEMCMP_BENCHMARK(16) +MEMCMP_BENCHMARK(31) +MEMCMP_BENCHMARK(32) +MEMCMP_BENCHMARK(63) +MEMCMP_BENCHMARK(64) + +BENCHMARK_MAIN();