diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -6,7 +6,7 @@ set(ALL_CPU_FEATURES "") if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX2 AVX512F) + set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX2 AVX512F FMA) set(LIBC_COMPILE_OPTIONS_NATIVE -march=native) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native) @@ -66,6 +66,7 @@ if(NOT "${cpu_features}" STREQUAL "${LIBC_CPU_FEATURES}") message(FATAL_ERROR "Unsupported CPU features: ${cpu_features}") endif() + message(STATUS "Check CPU features: ${cpu_features}") set(LIBC_CPU_FEATURES "${cpu_features}") else() # Populates the LIBC_CPU_FEATURES list from host. @@ -76,6 +77,7 @@ COMPILE_OUTPUT_VARIABLE compile_output RUN_OUTPUT_VARIABLE run_output) if("${run_result}" EQUAL 0) + message(STATUS "Check CPU features: ${run_output}") set(LIBC_CPU_FEATURES "${run_output}") elseif(NOT ${compile_result}) message(FATAL_ERROR "Failed to compile: ${compile_output}") diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake --- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake +++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake @@ -134,7 +134,7 @@ function(add_header_library target_name) cmake_parse_arguments( "ADD_HEADER" - "" # No optional arguments + "FMA_OPT" # Use FMA if target supports "" # No Single value arguments "HDRS;DEPENDS" # Multi-value arguments ${ARGN} @@ -160,6 +160,14 @@ get_fq_deps_list(fq_deps_list ${ADD_HEADER_DEPENDS}) if(ADD_HEADER_DEPENDS) add_dependencies(${interface_target_name} ${fq_deps_list}) + foreach(dep IN LISTS fq_deps_list) + if(TARGET ${dep}) + get_target_property(fma_opt ${dep} "FMA_OPT") + if(fma_opt) + set(ADD_HEADER_FMA_OPT TRUE) + endif() + endif() + endforeach() endif() add_custom_target(${fq_target_name}) @@ 
-170,4 +178,15 @@ "TARGET_TYPE" "${HDR_LIBRARY_TARGET_TYPE}" "DEPS" "${fq_deps_list}" ) + if(ADD_HEADER_FMA_OPT) + message(STATUS "Header library ${fq_target_name} will enable FMA for dependency") + set_target_properties( + ${interface_target_name} + PROPERTIES INTERFACE_FMA_OPT TRUE + ) + set_target_properties( + ${fq_target_name} + PROPERTIES FMA_OPT TRUE + ) + endif() endfunction(add_header_library) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -22,6 +22,7 @@ # Usage: # add_object_library( # +# FMA_OPT [optional] enable FMA if target supports. # HDRS # SRCS # DEPENDS @@ -29,7 +30,7 @@ function(add_object_library target_name) cmake_parse_arguments( "ADD_OBJECT" - "" # No optional arguments + "FMA_OPT" # Optional arguments "CXX_STANDARD" # Single value arguments "SRCS;HDRS;COMPILE_OPTIONS;DEPENDS" # Multivalue arguments ${ARGN} @@ -54,14 +55,35 @@ ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR} ) - _get_common_compile_options(compile_options ${ADD_OBJECT_COMPILE_OPTIONS}) - target_compile_options(${fq_target_name} PRIVATE ${compile_options}) - get_fq_deps_list(fq_deps_list ${ADD_OBJECT_DEPENDS}) if(fq_deps_list) add_dependencies(${fq_target_name} ${fq_deps_list}) + + foreach(dep IN LISTS fq_deps_list) + if(TARGET ${dep}) + get_target_property(fma_opt ${dep} "FMA_OPT") + if(fma_opt) + set(ADD_OBJECT_FMA_OPT TRUE) + endif() + endif() + endforeach() + + endif() + + if(ADD_OBJECT_FMA_OPT) + set_target_properties( + ${fq_target_name} + PROPERTIES FMA_OPT TRUE + ) + if(LIBC_CPU_FEATURES MATCHES "FMA") + message(STATUS "FMA is enabled for ${fq_target_name}") + list(APPEND ADD_OBJECT_COMPILE_OPTIONS "-mfma") + endif() endif() + _get_common_compile_options(compile_options ${ADD_OBJECT_COMPILE_OPTIONS}) + target_compile_options(${fq_target_name} PRIVATE ${compile_options}) + if(ADD_OBJECT_CXX_STANDARD) 
set_target_properties( ${fq_target_name} @@ -85,7 +107,8 @@ # Usage: # add_entrypoint_object( # -# [ALIAS|REDIRECTED] # Specified if the entrypoint is redirected or an alias. +# [ALIAS|REDIRECTED|FMA_OPT] # Specified if the entrypoint is redirected or an alias, +# # or it can take advantage of FMA instructions. # [NAME] # SRCS # HDRS @@ -96,7 +119,7 @@ function(add_entrypoint_object target_name) cmake_parse_arguments( "ADD_ENTRYPOINT_OBJ" - "ALIAS;REDIRECTED" # Optional argument + "ALIAS;REDIRECTED;FMA_OPT" # Optional argument "NAME;CXX_STANDARD" # Single value arguments "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi value arguments ${ARGN} @@ -169,12 +192,27 @@ message(FATAL_ERROR "`add_entrypoint_object` rule requires HDRS to be specified.") endif() - _get_common_compile_options(common_compile_options ${ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS}) set(internal_target_name ${fq_target_name}.__internal__) set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR}) get_fq_deps_list(fq_deps_list ${ADD_ENTRYPOINT_OBJ_DEPENDS}) set(full_deps_list ${fq_deps_list} libc.src.__support.common) + foreach(dep IN LISTS fq_deps_list) + if(TARGET ${dep}) + get_target_property(fma_opt ${dep} "FMA_OPT") + if(fma_opt) + set(ADD_ENTRYPOINT_OBJ_FMA_OPT TRUE) + endif() + endif() + endforeach() + + if(ADD_ENTRYPOINT_OBJ_FMA_OPT AND (LIBC_CPU_FEATURES MATCHES "FMA")) + message(STATUS "FMA is enabled for ${entrypoint_name}") + list(APPEND ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS "-mfma") + endif() + + _get_common_compile_options(common_compile_options ${ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS}) + add_library( ${internal_target_name} # TODO: We don't need an object library for internal consumption. 
@@ -209,6 +247,13 @@ ) endif() + if(ADD_ENTRYPOINT_OBJ_FMA_OPT) + set_target_properties( + ${fq_target_name} + PROPERTIES FMA_OPT TRUE + ) + endif() + set_target_properties( ${fq_target_name} PROPERTIES diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -35,6 +35,7 @@ add_header_library( fma + FMA_OPT HDRS FMA.h DEPENDS diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -24,7 +24,7 @@ template static inline T polyeval(T x, T a0) { return a0; } template -INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) { +static inline T polyeval(T x, T a0, Ts... a) { return multiply_add(x, polyeval(x, a...), a0); } diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h --- a/libc/src/__support/FPUtil/x86_64/FMA.h +++ b/libc/src/__support/FPUtil/x86_64/FMA.h @@ -26,8 +26,8 @@ namespace fputil { template -INLINE_FMA static inline cpp::EnableIfType::Value, T> -fma(T x, T y, T z) { +static inline cpp::EnableIfType::Value, T> fma(T x, T y, + T z) { float result; __m128 xmm = _mm_load_ss(&x); // NOLINT __m128 ymm = _mm_load_ss(&y); // NOLINT @@ -38,8 +38,8 @@ } template -INLINE_FMA static inline cpp::EnableIfType::Value, T> -fma(T x, T y, T z) { +static inline cpp::EnableIfType::Value, T> fma(T x, T y, + T z) { double result; __m128d xmm = _mm_load_sd(&x); // NOLINT __m128d ymm = _mm_load_sd(&y); // NOLINT diff --git a/libc/src/__support/FPUtil/x86_64/PolyEval.h b/libc/src/__support/FPUtil/x86_64/PolyEval.h --- a/libc/src/__support/FPUtil/x86_64/PolyEval.h +++ b/libc/src/__support/FPUtil/x86_64/PolyEval.h @@ -23,8 +23,7 @@ // Cubic polynomials: // polyeval(x, a0, a1, a2, a3) = a3*x^3 + a2*x^2 + a1*x + a0 template <> -INLINE_FMA inline float polyeval(float x, float a0, float a1, 
float a2, - float a3) { +inline float polyeval(float x, float a0, float a1, float a2, float a3) { __m128 xmm = _mm_set1_ps(x); // NOLINT __m128 a13 = _mm_set_ps(0.0f, x, a3, a1); // NOLINT __m128 a02 = _mm_set_ps(0.0f, 0.0f, a2, a0); // NOLINT @@ -35,8 +34,7 @@ } template <> -INLINE_FMA inline double polyeval(double x, double a0, double a1, double a2, - double a3) { +inline double polyeval(double x, double a0, double a1, double a2, double a3) { __m256d xmm = _mm256_set1_pd(x); // NOLINT __m256d a13 = _mm256_set_pd(0.0, x, a3, a1); // NOLINT __m256d a02 = _mm256_set_pd(0.0, 0.0, a2, a0); // NOLINT @@ -50,8 +48,8 @@ // polyeval(x, a0, a1, a2, a3, a4, a5) = a5*x^5 + a4*x^4 + a3*x^3 + a2*x^2 + // + a1*x + a0 template <> -INLINE_FMA inline float polyeval(float x, float a0, float a1, float a2, - float a3, float a4, float a5) { +inline float polyeval(float x, float a0, float a1, float a2, float a3, float a4, + float a5) { __m128 xmm = _mm_set1_ps(x); // NOLINT __m128 a25 = _mm_set_ps(0.0f, x, a5, a2); // NOLINT __m128 a14 = _mm_set_ps(0.0f, 0.0f, a4, a1); // NOLINT @@ -65,8 +63,8 @@ } template <> -INLINE_FMA inline double polyeval(double x, double a0, double a1, double a2, - double a3, double a4, double a5) { +inline double polyeval(double x, double a0, double a1, double a2, double a3, + double a4, double a5) { __m256d xmm = _mm256_set1_pd(x); // NOLINT __m256d a25 = _mm256_set_pd(0.0, x, a5, a2); // NOLINT __m256d a14 = _mm256_set_pd(0.0, 0.0, a4, a1); // NOLINT diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h --- a/libc/src/__support/architectures.h +++ b/libc/src/__support/architectures.h @@ -45,10 +45,4 @@ #endif #endif -#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LIBC_TARGET_HAS_FMA)) -#define INLINE_FMA __attribute__((target("fma"))) -#else -#define INLINE_FMA -#endif // LLVM_LIBC_ARCH_X86_64 - #endif // LLVM_LIBC_SUPPORT_ARCHITECTURES_H diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt --- 
a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -51,7 +51,6 @@ libc.src.__support.FPUtil.fma COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -65,7 +64,6 @@ libc.src.__support.FPUtil.fma COMPILE_OPTIONS -O3 - -mfma ) add_math_entrypoint_object(ceil) diff --git a/libc/src/math/fma.cpp b/libc/src/math/fma.cpp --- a/libc/src/math/fma.cpp +++ b/libc/src/math/fma.cpp @@ -13,7 +13,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) { return fputil::fma(x, y, z); } diff --git a/libc/src/math/fmaf.cpp b/libc/src/math/fmaf.cpp --- a/libc/src/math/fmaf.cpp +++ b/libc/src/math/fmaf.cpp @@ -13,7 +13,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, fmaf, (float x, float y, float z)) { return fputil::fma(x, y, z); } diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -482,7 +482,6 @@ libc.include.math COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -497,7 +496,6 @@ libc.include.math COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -514,7 +512,6 @@ libc.include.math COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -682,7 +679,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -698,7 +694,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -713,7 +708,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -729,7 +723,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( diff --git a/libc/src/math/generic/exp2f.cpp b/libc/src/math/generic/exp2f.cpp --- a/libc/src/math/generic/exp2f.cpp +++ b/libc/src/math/generic/exp2f.cpp @@ -47,7 +47,6 @@ 0x1.fa7c1819e90d8p0, }; -INLINE_FMA LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { using FPBits = typename fputil::FPBits; FPBits 
xbits(x); diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp --- a/libc/src/math/generic/expf.cpp +++ b/libc/src/math/generic/expf.cpp @@ -19,7 +19,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, expf, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -19,7 +19,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -101,7 +101,6 @@ 0x1.2b7b9e258e422p-2, 0x1.2d404b073e27ep-2, 0x1.2f032cf56a5bep-2, 0x1.30c4478f0835fp-2, 0x1.32839e681fc62p-2}; -INLINE_FMA LLVM_LIBC_FUNCTION(float, log10f, (float x)) { constexpr double LOG10_2 = 0x1.34413509f79ffp-2; diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -32,7 +32,7 @@ namespace internal { // We don't need to treat denormal -INLINE_FMA static inline float log(double x) { +static inline float log(double x) { constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits; @@ -77,7 +77,6 @@ } // namespace internal -INLINE_FMA LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -98,7 +98,6 @@ 0x1.f16e281db7630p-1, 0x1.f45e08bcf0655p-1, 0x1.f74aef0efafaep-1, 0x1.fa34e1177c233p-1, 0x1.fd1be4c7f2af9p-1}; -INLINE_FMA LLVM_LIBC_FUNCTION(float, log2f, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff 
--git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -49,7 +49,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, logf, (float x)) { constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits;