diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -145,11 +145,27 @@ # Benchmarking tool #============================================================================== -add_executable(libc-benchmark-main - EXCLUDE_FROM_ALL - LibcMemoryBenchmarkMain.cpp -) -foreach(entrypoint_target libc.src.string.memcpy libc.src.string.memset) - get_target_property(entrypoint_object_file ${entrypoint_target} "OBJECT_FILE_RAW") - target_link_libraries(libc-benchmark-main PUBLIC json ${entrypoint_object_file}) -endforeach() +# Benchmark all implementations that can run on the target CPU. +function(add_libc_multi_impl_benchmark name) + get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) + foreach(fq_config_name IN LISTS fq_implementations) + get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) + cpu_supports(can_run "${required_cpu_features}") + if(can_run) + set(benchmark_name ${fq_config_name}_benchmark) + add_executable(${benchmark_name} + EXCLUDE_FROM_ALL + LibcMemoryBenchmarkMain.cpp + ) + get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") + target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) + string(TOUPPER ${name} name_upper) + target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=1" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") + else() + message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") + endif() + endforeach() +endfunction() + +add_libc_multi_impl_benchmark(memcpy) +add_libc_multi_impl_benchmark(memset) diff --git a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp --- a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp +++ b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp @@ -27,17 +27,9 @@ namespace llvm { namespace libc_benchmarks { -enum Function { memcpy, memset }; - static cl::opt StudyName("study-name", cl::desc("The name for this study"), cl::Required); -static cl::opt - MemoryFunction("function", cl::desc("Sets the function to benchmark:"), - cl::values(clEnumVal(memcpy, "__llvm_libc::memcpy"), - clEnumVal(memset, "__llvm_libc::memset")), - cl::Required); - static cl::opt SizeDistributionName("size-distribution-name", cl::desc("The name of the distribution to use")); @@ -75,12 +67,12 @@ unsigned SizeBytes : 16; // max : 16 KiB - 1 }; -struct MemcpyBenchmark { +#if defined(LIBC_BENCHMARK_FUNCTION_MEMCPY) +struct Benchmark { static constexpr auto GetDistributions = &getMemcpySizeDistributions; static constexpr size_t BufferCount = 2; - static void amend(Study &S) { S.Configuration.Function = "memcpy"; } - MemcpyBenchmark(const size_t BufferSize) + Benchmark(const size_t BufferSize) : SrcBuffer(BufferSize), DstBuffer(BufferSize) {} inline auto functor() { @@ -94,13 +86,12 @@ AlignedBuffer SrcBuffer; AlignedBuffer DstBuffer; }; - -struct MemsetBenchmark { +#elif defined(LIBC_BENCHMARK_FUNCTION_MEMSET) +struct Benchmark { static constexpr auto GetDistributions = &getMemsetSizeDistributions; static constexpr size_t BufferCount = 1; - static void amend(Study &S) { S.Configuration.Function = "memset"; } - MemsetBenchmark(const size_t BufferSize) : DstBuffer(BufferSize) {} + Benchmark(const size_t BufferSize) : DstBuffer(BufferSize) {} inline auto functor() { return [this](ParameterType P) { @@ -112,8 +103,11 @@ AlignedBuffer DstBuffer; }; +#else +#error "Missing LIBC_BENCHMARK_FUNCTION_XXX definition" +#endif -template struct Harness : Benchmark { +struct Harness : Benchmark { using Benchmark::functor; Harness(const size_t BufferSize, size_t BatchParameterCount, @@ -140,11 +134,6 @@ std::function OffsetSampler; }; -struct IBenchmark { - virtual ~IBenchmark() {} - virtual Study run() = 0; -}; - size_t getL1DataCacheSize() { const std::vector &CacheInfos = HostState::get().Caches; const auto IsL1DataCache = [](const CacheInfo &CI) { @@ -156,7 +145,7 @@ report_fatal_error("Unable to read L1 Cache Data Size"); } -template struct MemfunctionBenchmark : IBenchmark { +struct MemfunctionBenchmark { MemfunctionBenchmark(int64_t L1Size = getL1DataCacheSize()) : AvailableSize(L1Size - L1LeftAsideBytes - ParameterStorageBytes), BufferSize(AvailableSize / Benchmark::BufferCount), @@ -219,10 +208,10 @@ SC.AccessAlignment = MaybeAlign(AlignedAccess); // Delegate specific flags and configuration. - Benchmark::amend(Study); + Study.Configuration.Function = LIBC_BENCHMARK_FUNCTION_NAME; } - Study run() override { + Study run() { if (SweepMode) runSweepMode(); else @@ -280,8 +269,7 @@ void runTrials(const BenchmarkOptions &Options, std::function SizeSampler, std::function OffsetSampler) { - Harness B(BufferSize, BatchParameterCount, SizeSampler, - OffsetSampler); + Harness B(BufferSize, BatchParameterCount, SizeSampler, OffsetSampler); for (size_t i = 0; i < NumTrials; ++i) { const BenchmarkResult Result = benchmark(Options, B, B.functor()); Study.Measurements.push_back(Result.BestGuess); @@ -313,15 +301,6 @@ } }; -std::unique_ptr getMemfunctionBenchmark() { - switch (MemoryFunction) { - case memcpy: - return std::make_unique>(); - case memset: - return std::make_unique>(); - } -} - void writeStudy(const Study &S) { std::error_code EC; raw_fd_ostream FOS(Output, EC); @@ -337,8 +316,8 @@ void main() { checkRequirements(); - auto MB = getMemfunctionBenchmark(); - writeStudy(MB->run()); + MemfunctionBenchmark MB; + writeStudy(MB.run()); } } // namespace libc_benchmarks diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -3,10 +3,14 @@ # ------------------------------------------------------------------------------ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - set(ALL_CPU_FEATURES SSE SSE2 AVX AVX2 AVX512F) - list(SORT ALL_CPU_FEATURES) + set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX2 AVX512F) + set(LIBC_COMPILE_OPTIONS_NATIVE -march=native) +elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) + set(LIBC_COMPILE_OPTIONS_NATIVE -mtune=native) endif() +list(SORT ALL_CPU_FEATURES) + # Function to check whether the target CPU supports the provided set of features. # Usage: # cpu_supports( @@ -22,49 +26,6 @@ endif() endfunction() -# Function to compute the flags to pass down to the compiler. -# Usage: -# compute_flags( -# -# MARCH -# REQUIRE -# REJECT -# ) -function(compute_flags output_var) - cmake_parse_arguments( - "COMPUTE_FLAGS" - "" # Optional arguments - "MARCH" # Single value arguments - "REQUIRE;REJECT" # Multi value arguments - ${ARGN}) - # Check that features are not required and rejected at the same time. - if(COMPUTE_FLAGS_REQUIRE AND COMPUTE_FLAGS_REJECT) - _intersection(var ${COMPUTE_FLAGS_REQUIRE} ${COMPUTE_FLAGS_REJECT}) - if(var) - message(FATAL_ERROR "Cpu Features REQUIRE and REJECT ${var}") - endif() - endif() - # Generate the compiler flags in `current`. - if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang|GNU") - if(COMPUTE_FLAGS_MARCH) - list(APPEND current "-march=${COMPUTE_FLAGS_MARCH}") - endif() - foreach(feature IN LISTS COMPUTE_FLAGS_REQUIRE) - string(TOLOWER ${feature} lowercase_feature) - list(APPEND current "-m${lowercase_feature}") - endforeach() - foreach(feature IN LISTS COMPUTE_FLAGS_REJECT) - string(TOLOWER ${feature} lowercase_feature) - list(APPEND current "-mno-${lowercase_feature}") - endforeach() - else() - # In future, we can extend for other compilers. - message(FATAL_ERROR "Unkown compiler ${CMAKE_CXX_COMPILER_ID}.") - endif() - # Export the list of flags. - set(${output_var} "${current}" PARENT_SCOPE) -endfunction() - # ------------------------------------------------------------------------------ # Internal helpers and utilities. # ------------------------------------------------------------------------------ @@ -94,54 +55,27 @@ endfunction() _generate_check_code() -# Compiles and runs the code generated above with the specified requirements. -# This is helpful to infer which features a particular target supports or if -# a specific features implies other features (e.g. BMI2 implies SSE2 and SSE). -function(_check_defined_cpu_feature output_var) - cmake_parse_arguments( - "CHECK_DEFINED" - "" # Optional arguments - "MARCH" # Single value arguments - "REQUIRE;REJECT" # Multi value arguments - ${ARGN}) - compute_flags( - flags - MARCH ${CHECK_DEFINED_MARCH} - REQUIRE ${CHECK_DEFINED_REQUIRE} - REJECT ${CHECK_DEFINED_REJECT}) +set(LIBC_CPU_FEATURES "" CACHE PATH "Host supported CPU features") + +if(CMAKE_CROSSCOMPILING) + _intersection(cpu_features "${ALL_CPU_FEATURES}" "${LIBC_CPU_FEATURES}") + if(NOT "${cpu_features}" STREQUAL "${LIBC_CPU_FEATURES}") + message(FATAL_ERROR "Unsupported CPU features: ${cpu_features}") + endif() + set(LIBC_CPU_FEATURES "${cpu_features}") +else() + # Populates the LIBC_CPU_FEATURES list from host. try_run( run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_${feature}" "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp" - COMPILE_DEFINITIONS ${flags} + COMPILE_DEFINITIONS ${LIBC_COMPILE_OPTIONS_NATIVE} COMPILE_OUTPUT_VARIABLE compile_output RUN_OUTPUT_VARIABLE run_output) if("${run_result}" EQUAL 0) - set(${output_var} - "${run_output}" - PARENT_SCOPE) + set(LIBC_CPU_FEATURES "${run_output}") elseif(NOT ${compile_result}) message(FATAL_ERROR "Failed to compile: ${compile_output}") else() message(FATAL_ERROR "Failed to run: ${run_output}") endif() -endfunction() - -set(LIBC_CPU_FEATURES "" CACHE PATH "supported CPU features") - -if(CMAKE_CROSSCOMPILING) - _intersection(cpu_features "${ALL_CPU_FEATURES}" "${LIBC_CPU_FEATURES}") - if(NOT "${cpu_features}" STREQUAL "${LIBC_CPU_FEATURES}") - message(FATAL_ERROR "Unsupported CPU features: ${cpu_features}") - endif() - set(LIBC_CPU_FEATURES "${cpu_features}") -else() - # Populates the LIBC_CPU_FEATURES list. - # Use -march=native only when the compiler supports it. - include(CheckCXXCompilerFlag) - CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) - if(COMPILER_SUPPORTS_MARCH_NATIVE) - _check_defined_cpu_feature(LIBC_CPU_FEATURES MARCH native) - else() - _check_defined_cpu_feature(LIBC_CPU_FEATURES) - endif() endif() diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -186,20 +186,15 @@ cmake_parse_arguments( "ADD_IMPL" "" # Optional arguments - "MARCH" # Single value arguments - "REQUIRE;REJECT;SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi value arguments + "" # Single value arguments + "REQUIRE;SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi value arguments ${ARGN}) - compute_flags(flags - MARCH ${ADD_IMPL_MARCH} - REQUIRE ${ADD_IMPL_REQUIRE} - REJECT ${ADD_IMPL_REJECT} - ) add_entrypoint_object(${impl_name} NAME ${name} SRCS ${ADD_IMPL_SRCS} HDRS ${ADD_IMPL_HDRS} DEPENDS ${ADD_IMPL_DEPENDS} - COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} ${flags} -O2 + COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} ) get_fq_target_name(${impl_name} fq_target_name) set_target_properties(${fq_target_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_IMPL_REQUIRE}") @@ -210,17 +205,6 @@ # memcpy # ------------------------------------------------------------------------------ -# include the relevant architecture specific implementations -if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/${LIBC_TARGET_ARCHITECTURE}/memcpy.cpp) -elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) - set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/${LIBC_TARGET_ARCHITECTURE}/memcpy.cpp) -#Disable tail merging as it leads to lower performance - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mllvm --tail-merge-threshold=0") -else() - set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp) -endif() - function(add_memcpy memcpy_name) add_implementation(memcpy ${memcpy_name} SRCS ${MEMCPY_SRC} @@ -235,8 +219,22 @@ endfunction() if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - add_memcpy(memcpy MARCH native) + set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86_64/memcpy.cpp) + add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=x86-64 REQUIRE SSE2) + add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=x86-64-v2 REQUIRE SSE4_2) + add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=x86-64-v3 REQUIRE AVX2) + add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=x86-64-v4 REQUIRE AVX512F) + add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) + add_memcpy(memcpy) +elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) + set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp) + #Disable tail merging as it leads to lower performance + add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} + COMPILE_OPTIONS "-mllvm --tail-merge-threshold=0") + add_memcpy(memcpy COMPILE_OPTIONS "-mllvm --tail-merge-threshold=0") else() + set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp) + add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memcpy(memcpy) endif() @@ -258,8 +256,14 @@ endfunction() if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - add_memset(memset MARCH native) + add_memset(memset_x86_64_opt_sse2 COMPILE_OPTIONS -march=x86-64 REQUIRE SSE2) + add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=x86-64-v2 REQUIRE SSE4_2) + add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=x86-64-v3 REQUIRE AVX2) + add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=x86-64-v4 REQUIRE AVX512F) + add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) + add_memset(memset) else() + add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memset(memset) endif() @@ -282,15 +286,13 @@ endfunction() if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - add_bzero(bzero MARCH native) + add_bzero(bzero_x86_64_opt_sse2 COMPILE_OPTIONS -march=x86-64 REQUIRE SSE2) + add_bzero(bzero_x86_64_opt_sse4 COMPILE_OPTIONS -march=x86-64-v2 REQUIRE SSE4_2) + add_bzero(bzero_x86_64_opt_avx2 COMPILE_OPTIONS -march=x86-64-v3 REQUIRE AVX2) + add_bzero(bzero_x86_64_opt_avx512 COMPILE_OPTIONS -march=x86-64-v4 REQUIRE AVX512F) + add_bzero(bzero_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) + add_bzero(bzero) else() + add_bzero(bzero_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_bzero(bzero) endif() - -# ------------------------------------------------------------------------------ -# Add all other relevant implementations for the native target. -# ------------------------------------------------------------------------------ - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) - include(${LIBC_TARGET_ARCHITECTURE}/CMakeLists.txt) -endif() diff --git a/libc/src/string/aarch64/CMakeLists.txt b/libc/src/string/aarch64/CMakeLists.txt deleted file mode 100644 --- a/libc/src/string/aarch64/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}") diff --git a/libc/src/string/x86_64/CMakeLists.txt b/libc/src/string/x86_64/CMakeLists.txt deleted file mode 100644 --- a/libc/src/string/x86_64/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_none" REJECT "${ALL_CPU_FEATURES}") -add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_sse" REQUIRE "SSE" REJECT "SSE2") -add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_avx" REQUIRE "AVX" REJECT "AVX2") -add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_avx512f" REQUIRE "AVX512F") - -add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_none" REJECT "${ALL_CPU_FEATURES}") -add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_sse" REQUIRE "SSE" REJECT "SSE2") -add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_avx" REQUIRE "AVX" REJECT "AVX2") -add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_avx512f" REQUIRE "AVX512F") - -add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_none" REJECT "${ALL_CPU_FEATURES}") -add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_sse" REQUIRE "SSE" REJECT "SSE2") -add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_avx" REQUIRE "AVX" REJECT "AVX2") -add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_avx512f" REQUIRE "AVX512F") diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -196,6 +196,8 @@ libc_string_unittests DEPENDS ${fq_config_name} + COMPILE_OPTIONS + ${LIBC_COMPILE_OPTIONS_NATIVE} ${ARGN} ) else()