diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -28,7 +28,7 @@ template static inline T polyeval(T x, T a0) { return a0; } template -static inline T polyeval(T x, T a0, Ts... a) { +INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) { return fma(x, polyeval(x, a...), a0); } diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h --- a/libc/src/__support/FPUtil/x86_64/FMA.h +++ b/libc/src/__support/FPUtil/x86_64/FMA.h @@ -11,7 +11,7 @@ #include "src/__support/architectures.h" -#if !defined(LLVM_LIBC_ARCH_X86) +#if !defined(LLVM_LIBC_ARCH_X86_64) #error "Invalid include" #endif @@ -22,8 +22,7 @@ namespace fputil { template -__attribute__((target( - "fma"))) static inline cpp::EnableIfType::Value, T> +INLINE_FMA static inline cpp::EnableIfType::Value, T> fma(T x, T y, T z) { float result; __m128 xmm = _mm_load_ss(&x); // NOLINT @@ -35,8 +34,7 @@ } template -__attribute__((target( - "fma"))) static inline cpp::EnableIfType::Value, T> +INLINE_FMA static inline cpp::EnableIfType::Value, T> fma(T x, T y, T z) { double result; __m128d xmm = _mm_load_sd(&x); // NOLINT diff --git a/libc/src/__support/FPUtil/x86_64/PolyEval.h b/libc/src/__support/FPUtil/x86_64/PolyEval.h --- a/libc/src/__support/FPUtil/x86_64/PolyEval.h +++ b/libc/src/__support/FPUtil/x86_64/PolyEval.h @@ -23,11 +23,11 @@ // Cubic polynomials: // polyeval(x, a0, a1, a2, a3) = a3*x^3 + a2*x^2 + a1*x + a0 template <> -__attribute__((target("fma"))) inline float -polyeval(float x, float a0, float a1, float a2, float a3) { - __m128 xmm = _mm_set1_ps(x); // NOLINT - __m128 a13 = _mm_set_ps(0.0f, x, a3, a1); // NOLINT - __m128 a02 = _mm_set_ps(0.0f, 0.0f, a2, a0); // NOLINT +INLINE_FMA inline float polyeval(float x, float a0, float a1, float a2, + float a3) { + __m128 xmm = _mm_set1_ps(x); + __m128 a13 = _mm_set_ps(0.0f, x, a3, a1); + __m128 a02 = _mm_set_ps(0.0f, 0.0f, a2, a0); // r = (0, x^2, a3*x + a2, a1*x + a0) __m128 r = _mm_fmadd_ps(a13, xmm, a02); // NOLINT // result = (a3*x + a2) * x^2 + (a1*x + a0) @@ -35,11 +35,11 @@ } template <> -__attribute__((target("fma"))) inline double -polyeval(double x, double a0, double a1, double a2, double a3) { - __m256d xmm = _mm256_set1_pd(x); // NOLINT - __m256d a13 = _mm256_set_pd(0.0, x, a3, a1); // NOLINT - __m256d a02 = _mm256_set_pd(0.0, 0.0, a2, a0); // NOLINT +INLINE_FMA inline double polyeval(double x, double a0, double a1, double a2, + double a3) { + __m256d xmm = _mm256_set1_pd(x); + __m256d a13 = _mm256_set_pd(0.0, x, a3, a1); + __m256d a02 = _mm256_set_pd(0.0, 0.0, a2, a0); // r = (0, x^2, a3*x + a2, a1*x + a0) __m256d r = _mm256_fmadd_pd(a13, xmm, a02); // NOLINT // result = (a3*x + a2) * x^2 + (a1*x + a0) @@ -50,12 +50,12 @@ // polyeval(x, a0, a1, a2, a3, a4, a5) = a5*x^5 + a4*x^4 + a3*x^3 + a2*x^2 + // + a1*x + a0 template <> -__attribute__((target("fma"))) inline float -polyeval(float x, float a0, float a1, float a2, float a3, float a4, float a5) { - __m128 xmm = _mm_set1_ps(x); // NOLINT - __m128 a25 = _mm_set_ps(0.0f, x, a5, a2); // NOLINT - __m128 a14 = _mm_set_ps(0.0f, 0.0f, a4, a1); // NOLINT - __m128 a03 = _mm_set_ps(0.0f, 0.0f, a3, a0); // NOLINT +INLINE_FMA inline float polyeval(float x, float a0, float a1, float a2, + float a3, float a4, float a5) { + __m128 xmm = _mm_set1_ps(x); + __m128 a25 = _mm_set_ps(0.0f, x, a5, a2); + __m128 a14 = _mm_set_ps(0.0f, 0.0f, a4, a1); + __m128 a03 = _mm_set_ps(0.0f, 0.0f, a3, a0); // r1 = (0, x^2, a5*x + a4, a2*x + a1) __m128 r1 = _mm_fmadd_ps(a25, xmm, a14); // NOLINT // r2 = (0, x^3, (a5*x + a4)*x + a3, (a2*x + a1)*x + a0 @@ -65,13 +65,12 @@ } template <> -__attribute__((target("fma"))) inline double -polyeval(double x, double a0, double a1, double a2, double a3, double a4, - double a5) { - __m256d xmm = _mm256_set1_pd(x); // NOLINT - __m256d a25 = _mm256_set_pd(0.0, x, a5, a2); // NOLINT - __m256d a14 = _mm256_set_pd(0.0, 0.0, a4, a1); // NOLINT - __m256d a03 = _mm256_set_pd(0.0, 0.0, a3, a0); // NOLINT +INLINE_FMA inline double polyeval(double x, double a0, double a1, double a2, + double a3, double a4, double a5) { + __m256d xmm = _mm256_set1_pd(x); + __m256d a25 = _mm256_set_pd(0.0, x, a5, a2); + __m256d a14 = _mm256_set_pd(0.0, 0.0, a4, a1); + __m256d a03 = _mm256_set_pd(0.0, 0.0, a3, a0); // r1 = (0, x^2, a5*x + a4, a2*x + a1) __m256d r1 = _mm256_fmadd_pd(a25, xmm, a14); // NOLINT // r2 = (0, x^3, (a5*x + a4)*x + a3, (a2*x + a1)*x + a0 diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h --- a/libc/src/__support/architectures.h +++ b/libc/src/__support/architectures.h @@ -37,4 +37,10 @@ #define LLVM_LIBC_ARCH_ANY_ARM #endif +#if defined(LLVM_LIBC_ARCH_X86_64) +#define INLINE_FMA __attribute__((target("fma"))) +#else +#define INLINE_FMA +#endif // LLVM_LIBC_ARCH_X86_64 + #endif // LLVM_LIBC_SUPPORT_ARCHITECTURES_H diff --git a/libc/src/math/fma.cpp b/libc/src/math/fma.cpp --- a/libc/src/math/fma.cpp +++ b/libc/src/math/fma.cpp @@ -13,6 +13,7 @@ namespace __llvm_libc { +INLINE_FMA LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) { return fputil::fma(x, y, z); } diff --git a/libc/src/math/fmaf.cpp b/libc/src/math/fmaf.cpp --- a/libc/src/math/fmaf.cpp +++ b/libc/src/math/fmaf.cpp @@ -13,6 +13,7 @@ namespace __llvm_libc { +INLINE_FMA LLVM_LIBC_FUNCTION(float, fmaf, (float x, float y, float z)) { return fputil::fma(x, y, z); } diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -22,6 +22,7 @@ // each interval. The coefficients were generated by Sollya's fpminmax. // // See libc/utils/mathtools/expm1f.sollya for more detail. +INLINE_FMA LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { const float ln2 = 0.69314718055994530941723212145817656807550013436025f; // For C++17: