diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake --- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake +++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake @@ -6,7 +6,7 @@ set(ALL_CPU_FEATURES "") if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) - set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX2 AVX512F) + set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX2 AVX512F FMA) set(LIBC_COMPILE_OPTIONS_NATIVE -march=native) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native) @@ -66,6 +66,7 @@ if(NOT "${cpu_features}" STREQUAL "${LIBC_CPU_FEATURES}") message(FATAL_ERROR "Unsupported CPU features: ${cpu_features}") endif() + message(STATUS "Set CPU features: ${cpu_features}") set(LIBC_CPU_FEATURES "${cpu_features}") else() # Populates the LIBC_CPU_FEATURES list from host. @@ -76,6 +77,7 @@ COMPILE_OUTPUT_VARIABLE compile_output RUN_OUTPUT_VARIABLE run_output) if("${run_result}" EQUAL 0) + message(STATUS "Set CPU features: ${run_output}") set(LIBC_CPU_FEATURES "${run_output}") elseif(NOT ${compile_result}) message(FATAL_ERROR "Failed to compile: ${compile_output}") diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake --- a/libc/cmake/modules/LLVMLibCFlagRules.cmake +++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake @@ -131,3 +131,8 @@ # Special flags set(FMA_OPT_FLAG "FMA_OPT") + +# Skip FMA_OPT flag for targets that don't support fma. +if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA"))) + set(SKIP_FLAG_EXPANSION_FMA_OPT TRUE) +endif() diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -1,6 +1,14 @@ set(OBJECT_LIBRARY_TARGET_TYPE "OBJECT_LIBRARY") -function(_get_common_compile_options output_var) +function(_get_common_compile_options output_var flags) + list(FIND flags ${FMA_OPT_FLAG} fma) + if(${fma} LESS 0) + list(FIND flags "${FMA_OPT_FLAG}__ONLY" fma) + endif() + if((${fma} GREATER -1) AND (LIBC_CPU_FEATURES MATCHES "FMA")) + set(ADD_FMA_FLAG TRUE) + endif() + set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${ARGN}) if(NOT ${LIBC_TARGET_OS} STREQUAL "windows") set(compile_options ${compile_options} -fpie -ffreestanding -fno-builtin) @@ -10,9 +18,15 @@ list(APPEND compile_options "-fno-unwind-tables") list(APPEND compile_options "-fno-asynchronous-unwind-tables") list(APPEND compile_options "-fno-rtti") + if(ADD_FMA_FLAG) + list(APPEND compile_options "-mfma") + endif() elseif(MSVC) list(APPEND compile_options "/EHs-c-") list(APPEND compile_options "/GR-") + if(ADD_FMA_FLAG) + list(APPEND compile_options "/arch:AVX2") + endif() endif() set(${output_var} ${compile_options} PARENT_SCOPE) endfunction() @@ -54,7 +68,11 @@ ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR} ) - _get_common_compile_options(compile_options ${ADD_OBJECT_COMPILE_OPTIONS}) + _get_common_compile_options( + compile_options + "${ADD_OBJECT_FLAGS}" + ${ADD_OBJECT_COMPILE_OPTIONS} + ) target_compile_options(${fq_target_name} PRIVATE ${compile_options}) get_fq_deps_list(fq_deps_list ${ADD_OBJECT_DEPENDS}) @@ -276,7 +294,11 @@ set(ADD_ENTRYPOINT_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD}) endif() - _get_common_compile_options(common_compile_options ${ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS}) + _get_common_compile_options( + common_compile_options + "${ADD_ENTRYPOINT_OBJ_FLAGS}" + ${ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS} + ) set(internal_target_name ${fq_target_name}.__internal__) set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR}) get_fq_deps_list(fq_deps_list ${ADD_ENTRYPOINT_OBJ_DEPENDS}) diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -48,6 +48,8 @@ DEPENDS .fputil libc.src.__support.FPUtil.generic.fma + FLAGS + FMA_OPT ) add_header_library( diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -24,7 +24,7 @@ template static inline T polyeval(T x, T a0) { return a0; } template -INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) { +static inline T polyeval(T x, T a0, Ts... a) { return multiply_add(x, polyeval(x, a...), a0); } diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h --- a/libc/src/__support/FPUtil/x86_64/FMA.h +++ b/libc/src/__support/FPUtil/x86_64/FMA.h @@ -26,8 +26,8 @@ namespace fputil { template -INLINE_FMA static inline cpp::EnableIfType::Value, T> -fma(T x, T y, T z) { +static inline cpp::EnableIfType::Value, T> fma(T x, T y, + T z) { float result; __m128 xmm = _mm_load_ss(&x); // NOLINT __m128 ymm = _mm_load_ss(&y); // NOLINT @@ -38,8 +38,8 @@ } template -INLINE_FMA static inline cpp::EnableIfType::Value, T> -fma(T x, T y, T z) { +static inline cpp::EnableIfType::Value, T> fma(T x, T y, + T z) { double result; __m128d xmm = _mm_load_sd(&x); // NOLINT __m128d ymm = _mm_load_sd(&y); // NOLINT diff --git a/libc/src/__support/FPUtil/x86_64/PolyEval.h b/libc/src/__support/FPUtil/x86_64/PolyEval.h --- a/libc/src/__support/FPUtil/x86_64/PolyEval.h +++ b/libc/src/__support/FPUtil/x86_64/PolyEval.h @@ -23,8 +23,7 @@ // Cubic polynomials: // polyeval(x, a0, a1, a2, a3) = a3*x^3 + a2*x^2 + a1*x + a0 template <> -INLINE_FMA inline float polyeval(float x, float a0, float a1, float a2, - float a3) { +inline float polyeval(float x, float a0, float a1, float a2, float a3) { __m128 xmm = _mm_set1_ps(x); // NOLINT __m128 a13 = _mm_set_ps(0.0f, x, a3, a1); // NOLINT __m128 a02 = _mm_set_ps(0.0f, 0.0f, a2, a0); // NOLINT @@ -35,8 +34,7 @@ } template <> -INLINE_FMA inline double polyeval(double x, double a0, double a1, double a2, - double a3) { +inline double polyeval(double x, double a0, double a1, double a2, double a3) { __m256d xmm = _mm256_set1_pd(x); // NOLINT __m256d a13 = _mm256_set_pd(0.0, x, a3, a1); // NOLINT __m256d a02 = _mm256_set_pd(0.0, 0.0, a2, a0); // NOLINT @@ -50,8 +48,8 @@ // polyeval(x, a0, a1, a2, a3, a4, a5) = a5*x^5 + a4*x^4 + a3*x^3 + a2*x^2 + // + a1*x + a0 template <> -INLINE_FMA inline float polyeval(float x, float a0, float a1, float a2, - float a3, float a4, float a5) { +inline float polyeval(float x, float a0, float a1, float a2, float a3, float a4, + float a5) { __m128 xmm = _mm_set1_ps(x); // NOLINT __m128 a25 = _mm_set_ps(0.0f, x, a5, a2); // NOLINT __m128 a14 = _mm_set_ps(0.0f, 0.0f, a4, a1); // NOLINT @@ -65,8 +63,8 @@ } template <> -INLINE_FMA inline double polyeval(double x, double a0, double a1, double a2, - double a3, double a4, double a5) { +inline double polyeval(double x, double a0, double a1, double a2, double a3, + double a4, double a5) { __m256d xmm = _mm256_set1_pd(x); // NOLINT __m256d a25 = _mm256_set_pd(0.0, x, a5, a2); // NOLINT __m256d a14 = _mm256_set_pd(0.0, 0.0, a4, a1); // NOLINT diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h --- a/libc/src/__support/architectures.h +++ b/libc/src/__support/architectures.h @@ -45,10 +45,4 @@ #endif #endif -#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LIBC_TARGET_HAS_FMA)) -#define INLINE_FMA __attribute__((target("fma"))) -#else -#define INLINE_FMA -#endif // LLVM_LIBC_ARCH_X86_64 - #endif // LLVM_LIBC_SUPPORT_ARCHITECTURES_H diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -51,7 +51,6 @@ libc.src.__support.FPUtil.fma COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -65,7 +64,8 @@ libc.src.__support.FPUtil.fma COMPILE_OPTIONS -O3 - -mfma + FLAGS + FMA_OPT__ONLY ) add_math_entrypoint_object(ceil) diff --git a/libc/src/math/fma.cpp b/libc/src/math/fma.cpp --- a/libc/src/math/fma.cpp +++ b/libc/src/math/fma.cpp @@ -13,7 +13,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) { return fputil::fma(x, y, z); } diff --git a/libc/src/math/fmaf.cpp b/libc/src/math/fmaf.cpp --- a/libc/src/math/fmaf.cpp +++ b/libc/src/math/fmaf.cpp @@ -13,7 +13,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, fmaf, (float x, float y, float z)) { return fputil::fma(x, y, z); } diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -482,7 +482,6 @@ libc.include.math COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -497,7 +496,6 @@ libc.include.math COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -514,7 +512,6 @@ libc.include.math COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -682,7 +679,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -698,7 +694,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -713,7 +708,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( @@ -729,7 +723,6 @@ libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 - -mfma ) add_entrypoint_object( diff --git a/libc/src/math/generic/exp2f.cpp b/libc/src/math/generic/exp2f.cpp --- a/libc/src/math/generic/exp2f.cpp +++ b/libc/src/math/generic/exp2f.cpp @@ -47,7 +47,6 @@ 0x1.fa7c1819e90d8p0, }; -INLINE_FMA LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp --- a/libc/src/math/generic/expf.cpp +++ b/libc/src/math/generic/expf.cpp @@ -19,7 +19,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, expf, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -19,7 +19,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -101,7 +101,6 @@ 0x1.2b7b9e258e422p-2, 0x1.2d404b073e27ep-2, 0x1.2f032cf56a5bep-2, 0x1.30c4478f0835fp-2, 0x1.32839e681fc62p-2}; -INLINE_FMA LLVM_LIBC_FUNCTION(float, log10f, (float x)) { constexpr double LOG10_2 = 0x1.34413509f79ffp-2; diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -32,7 +32,7 @@ namespace internal { // We don't need to treat denormal -INLINE_FMA static inline float log(double x) { +static inline float log(double x) { constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits; @@ -77,7 +77,6 @@ } // namespace internal -INLINE_FMA LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -98,7 +98,6 @@ 0x1.f16e281db7630p-1, 0x1.f45e08bcf0655p-1, 0x1.f74aef0efafaep-1, 0x1.fa34e1177c233p-1, 0x1.fd1be4c7f2af9p-1}; -INLINE_FMA LLVM_LIBC_FUNCTION(float, log2f, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -49,7 +49,6 @@ namespace __llvm_libc { -INLINE_FMA LLVM_LIBC_FUNCTION(float, logf, (float x)) { constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits; diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1189,6 +1189,9 @@ libc.src.__support.FPUtil.fputil ) +# Without FMA instructions, the current expm1f implementation is not correctly +# rounded for all float inputs (1 extra exceptional value). This will be fixed +# in the followup patch: https://reviews.llvm.org/D123440 add_fp_unittest( expm1f_test NEED_MPFR @@ -1201,6 +1204,8 @@ libc.include.math libc.src.math.expm1f libc.src.__support.FPUtil.fputil + FLAGS + FMA_OPT__ONLY ) add_fp_unittest(