diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -12,7 +12,6 @@ NearestIntegerOperations.h NormalFloat.h PlatformDefs.h - PolyEval.h UInt.h XFloat.h DEPENDS @@ -34,4 +33,29 @@ libc.src.__support.FPUtil.generic.sqrt ) +add_header_library( + fma + HDRS + FMA.h + DEPENDS + .fputil + libc.src.__support.FPUtil.generic.fma +) + +add_header_library( + multiply_add + HDRS + multiply_add.h + DEPENDS + .fma +) + +add_header_library( + polyeval + HDRS + PolyEval.h + DEPENDS + .multiply_add +) + add_subdirectory(generic) diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h --- a/libc/src/__support/FPUtil/FMA.h +++ b/libc/src/__support/FPUtil/FMA.h @@ -11,11 +11,16 @@ #include "src/__support/architectures.h" +#if defined(LLVM_LIBC_FMA) + #if defined(LLVM_LIBC_ARCH_X86_64) #include "x86_64/FMA.h" #elif defined(LLVM_LIBC_ARCH_AARCH64) #include "aarch64/FMA.h" +#endif + #else +// FMA instructions are not available #include "generic/FMA.h" #include "src/__support/CPP/TypeTraits.h" diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -9,19 +9,15 @@ #ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H -#include "src/__support/CPP/TypeTraits.h" -#include "src/__support/architectures.h" +#include "multiply_add.h" // Evaluate polynomial using Horner's Scheme: // With polyeval(x, a_0, a_1, ..., a_n) = a_n * x^n + ... + a_1 * x + a_0, we // evaluated it as: a_0 + x * (a_1 + x * ( ... (a_(n-1) + x * a_n) ... ) ) ). -// We will use fma instructions if available. +// We will use FMA instructions if available. 
// Example: to evaluate x^3 + 2*x^2 + 3*x + 4, call // polyeval( x, 4.0, 3.0, 2.0, 1.0 ) -#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) -#include "FMA.h" - namespace __llvm_libc { namespace fputil { @@ -29,35 +25,10 @@ template INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) { - return fma(x, polyeval(x, a...), a0); + return multiply_add(x, polyeval(x, a...), a0); } } // namespace fputil } // namespace __llvm_libc -#ifdef LLVM_LIBC_ARCH_X86_64 - -// [DISABLED] There is a regression with using vectorized version for polyeval -// compared to the naive Horner's scheme with fma. Need further investigation -// #include "x86_64/PolyEval.h" - -#endif // LLVM_LIBC_ARCH_X86_64 - -#else - -namespace __llvm_libc { -namespace fputil { - -template static inline T polyeval(T x, T a0) { return a0; } - -template -static inline T polyeval(T x, T a0, Ts... a) { - return x * polyeval(x, a...) + a0; -} - -} // namespace fputil -} // namespace __llvm_libc - -#endif - -#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_FMA_H +#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H diff --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h --- a/libc/src/__support/FPUtil/aarch64/FMA.h +++ b/libc/src/__support/FPUtil/aarch64/FMA.h @@ -15,6 +15,10 @@ #error "Invalid include" #endif +#if !defined(LLVM_LIBC_FMA) +#error "FMA instructions are not supported" +#endif + #include "src/__support/CPP/TypeTraits.h" namespace __llvm_libc { diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt --- a/libc/src/__support/FPUtil/generic/CMakeLists.txt +++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt @@ -4,3 +4,9 @@ sqrt.h sqrt_80_bit_long_double.h ) + +add_header_library( + fma + HDRS + FMA.h +) diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -10,6 
+10,7 @@ #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H #include "src/__support/CPP/TypeTraits.h" +#include "src/__support/FPUtil/FPBits.h" namespace __llvm_libc { namespace fputil {
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -0,0 +1,50 @@
+//===-- Common header for multiply-add implementations ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+
+#include "src/__support/architectures.h"
+
+namespace __llvm_libc {
+namespace fputil {
+
+// Implement a simple wrapper for multiply-add operation:
+//   multiply_add(x, y, z) = x*y + z
+// which uses FMA instructions to speed up if available.
+
+template <typename T> static inline T multiply_add(T x, T y, T z) {
+  return x * y + z;
+}
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#if defined(LLVM_LIBC_FMA)
+// FMA instructions are available.
+// The headers pulled in by FMA.h open namespace __llvm_libc themselves, so the
+// include has to sit at file scope rather than inside the namespace block.
+#include "FMA.h"
+
+namespace __llvm_libc {
+namespace fputil {
+
+template <> inline float multiply_add(float x, float y, float z) {
+  return fma(x, y, z);
+}
+
+template <> inline double multiply_add(double x, double y, double z) {
+  return fma(x, y, z);
+}
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_FMA
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h --- a/libc/src/__support/FPUtil/x86_64/FMA.h +++ b/libc/src/__support/FPUtil/x86_64/FMA.h @@ -15,6 +15,10 @@ #error "Invalid include" #endif +#if !defined(LLVM_LIBC_FMA) +#error "FMA instructions are not supported" +#endif + #include "src/__support/CPP/TypeTraits.h" #include diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h --- a/libc/src/__support/architectures.h +++ b/libc/src/__support/architectures.h @@ -37,7 +37,15 @@ #define LLVM_LIBC_ARCH_ANY_ARM #endif -#if defined(LLVM_LIBC_ARCH_X86_64) +#if defined(LLVM_LIBC_ARCH_AARCH64) +#define LLVM_LIBC_FMA +#elif defined(LLVM_LIBC_ARCH_X86_64) +#if (defined(__AVX2__) || defined(__FMA__)) +#define LLVM_LIBC_FMA +#endif +#endif + +#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LLVM_LIBC_FMA)) #define INLINE_FMA __attribute__((target("fma"))) #else #define INLINE_FMA diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -48,8 +48,9 @@ fmaf.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma COMPILE_OPTIONS - -O2 + -O3 -mfma ) @@ -61,8 +62,9 @@ fma.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma COMPILE_OPTIONS - -O2 + -O3 -mfma ) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -478,6 +478,7 @@ DEPENDS .common_constants libc.src.__support.FPUtil.fputil +
libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -492,6 +493,7 @@ ../exp2f.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -507,6 +509,8 @@ DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -674,6 +678,8 @@ DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -688,6 +694,8 @@ DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -702,6 +710,7 @@ DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -716,6 +725,8 @@ DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -83,7 +83,7 @@ // = x otherwise. // To simplify the rounding decision and make it more efficient, we use // fma(x, x, x) ~ x + x^2 instead. 
- return fputil::fma(x, x, x); + return fputil::multiply_add(x, x, x); } // 2^-25 <= |x| < 2^-4 @@ -96,7 +96,7 @@ fputil::polyeval(xd, 0x1p-1, 0x1.55555555557ddp-3, 0x1.55555555552fap-5, 0x1.111110fcd58b7p-7, 0x1.6c16c1717660bp-10, 0x1.a0241f0006d62p-13, 0x1.a01e3f8d3c06p-16); - return static_cast(fputil::fma(r, xsq, xd)); + return static_cast(fputil::multiply_add(r, xsq, xd)); } // For -18 < x < 89, to compute expm1(x), we perform the following range @@ -132,7 +132,7 @@ double exp_lo = fputil::polyeval(xd, 0x1.0p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1, 0x1.555566668e5e7p-3, 0x1.55555555ef243p-5); - return static_cast(fputil::fma(exp_hi_mid, exp_lo, -1.0)); + return static_cast(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -170,7 +170,7 @@ double d = static_cast(xbits) - static_cast(f); d *= ONE_OVER_F[f_index]; - double extra_factor = fputil::fma(m, LOG10_2, LOG10_F[f_index]); + double extra_factor = fputil::multiply_add(m, LOG10_2, LOG10_F[f_index]); double r = fputil::polyeval(d, extra_factor, 0x1.bcb7b1526e4c5p-2, -0x1.bcb7b1518a5e9p-3, 0x1.287a72a6f716p-3, diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -66,7 +66,7 @@ double d = static_cast(xbits) - static_cast(f); d *= ONE_OVER_F[f_index]; - double extra_factor = fputil::fma(m, LOG_2, LOG_F[f_index]); + double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]); double r = fputil::polyeval(d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2, 0x1.5555513bc679ap-2, @@ -161,7 +161,7 @@ // > fpminimax(log(1 + x)/x, 5, [|D...|], [-2^-8; 2^-8]); r = fputil::polyeval(xd, -0x1p-1, 0x1.5555555515551p-2, -0x1.ffffffff82bdap-3, 0x1.999b33348d3aep-3, -0x1.5556cae3adcc3p-3); - return 
static_cast(fputil::fma(r, xd * xd, xd)); + return static_cast(fputil::multiply_add(r, xd * xd, xd)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -120,7 +120,7 @@ d *= ONE_OVER_F[f_index]; double extra_factor = - fputil::fma(static_cast(m), LOG_2, LOG_F[f_index]); + fputil::multiply_add(static_cast(m), LOG_2, LOG_F[f_index]); double r = __llvm_libc::fputil::polyeval( d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2, diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -196,6 +196,54 @@ ], ) +fma_common_hdrs = [ + "src/__support/FPUtil/FMA.h", + "src/__support/FPUtil/generic/FMA.h", +] + +fma_hdrs = selects.with_or({ + "//conditions:default": fma_common_hdrs, + PLATFORM_CPU_X86_64: fma_common_hdrs + [ + "src/__support/FPUtil/x86_64/FMA.h", + ], + PLATFORM_CPU_ARM64: fma_common_hdrs + [ + "src/__support/FPUtil/aarch64/FMA.h", + ], +}) + +cc_library( + name = "__support_fputil_fma", + hdrs = fma_hdrs, + deps = [ + ":__support_common", + ":__support_cpp_bit", + ":__support_cpp_type_traits", + ":__support_fputil", + ":libc_root", + ], +) + +cc_library( + name = "__support_fputil_multiply_add", + hdrs = [ + "src/__support/FPUtil/multiply_add.h", + ], + deps = [ + ":__support_common", + ":__support_fputil_fma", + ], +) + +cc_library( + name = "__support_fputil_polyeval", + hdrs = [ + "src/__support/FPUtil/PolyEval.h", + ], + deps = [ + ":__support_fputil_multiply_add", + ], +) + ################################ fenv targets ################################ libc_function(