diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -12,7 +12,6 @@
     NearestIntegerOperations.h
     NormalFloat.h
     PlatformDefs.h
-    PolyEval.h
     UInt.h
     XFloat.h
   DEPENDS
@@ -34,4 +33,29 @@
     libc.src.__support.FPUtil.generic.sqrt
 )
 
+add_header_library(
+  fma
+  HDRS
+    FMA.h
+  DEPENDS
+    .fputil
+    libc.src.__support.FPUtil.generic.fma
+)
+
+add_header_library(
+  multiply_add
+  HDRS
+    multiply_add.h
+  DEPENDS
+    .fma
+)
+
+add_header_library(
+  polyeval
+  HDRS
+    PolyEval.h
+  DEPENDS
+    .multiply_add
+)
+
 add_subdirectory(generic)
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -11,11 +11,16 @@
 
 #include "src/__support/architectures.h"
 
+#if defined(LLVM_LIBC_FMA)
+
 #if defined(LLVM_LIBC_ARCH_X86_64)
 #include "x86_64/FMA.h"
 #elif defined(LLVM_LIBC_ARCH_AARCH64)
 #include "aarch64/FMA.h"
+#endif
+
 #else
+// FMA instructions are not available
 #include "generic/FMA.h"
 #include "src/__support/CPP/TypeTraits.h"
 
diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h
--- a/libc/src/__support/FPUtil/PolyEval.h
+++ b/libc/src/__support/FPUtil/PolyEval.h
@@ -9,19 +9,15 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
 #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
 
-#include "src/__support/CPP/TypeTraits.h"
-#include "src/__support/architectures.h"
+#include "multiply_add.h"
 
 // Evaluate polynomial using Horner's Scheme:
 // With polyeval(x, a_0, a_1, ..., a_n) = a_n * x^n + ... + a_1 * x + a_0, we
 // evaluated it as:  a_0 + x * (a_1 + x * ( ... (a_(n-1) + x * a_n) ... ) ) ).
-// We will use fma instructions if available.
+// We will use FMA instructions if available.
 // Example: to evaluate x^3 + 2*x^2 + 3*x + 4, call
 //   polyeval( x, 4.0, 3.0, 2.0, 1.0 )
 
-#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
-#include "FMA.h"
-
 namespace __llvm_libc {
 namespace fputil {
 
@@ -29,35 +25,10 @@
 
 template <typename T, typename... Ts>
 INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) {
-  return fma(x, polyeval(x, a...), a0);
+  return multiply_add(x, polyeval(x, a...), a0);
 }
 
 } // namespace fputil
 } // namespace __llvm_libc
 
-#ifdef LLVM_LIBC_ARCH_X86_64
-
-// [DISABLED] There is a regression with using vectorized version for polyeval
-// compared to the naive Horner's scheme with fma.  Need further investigation
-// #include "x86_64/PolyEval.h"
-
-#endif // LLVM_LIBC_ARCH_X86_64
-
-#else
-
-namespace __llvm_libc {
-namespace fputil {
-
-template <typename T> static inline T polyeval(T x, T a0) { return a0; }
-
-template <typename T, typename... Ts>
-static inline T polyeval(T x, T a0, Ts... a) {
-  return x * polyeval(x, a...) + a0;
-}
-
-} // namespace fputil
-} // namespace __llvm_libc
-
-#endif
-
-#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_FMA_H
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
diff --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h
--- a/libc/src/__support/FPUtil/aarch64/FMA.h
+++ b/libc/src/__support/FPUtil/aarch64/FMA.h
@@ -15,6 +15,10 @@
 #error "Invalid include"
 #endif
 
+#if !defined(LLVM_LIBC_FMA)
+#error "FMA instructions are not supported"
+#endif
+
 #include "src/__support/CPP/TypeTraits.h"
 
 namespace __llvm_libc {
diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -4,3 +4,9 @@
     sqrt.h
     sqrt_80_bit_long_double.h
 )
+
+add_header_library(
+  fma
+  HDRS
+    FMA.h
+)
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H
 
 #include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/FPBits.h"
 
 namespace __llvm_libc {
 namespace fputil {
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -0,0 +1,53 @@
+//===-- Common header for multiply-add implementations ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+
+#include "src/__support/architectures.h"
+
+namespace __llvm_libc {
+namespace fputil {
+
+// Simple wrapper for the multiply-add operation:
+//   multiply_add(x, y, z) = x * y + z
+// The generic template below evaluates the expression as written.  When
+// LLVM_LIBC_FMA is defined, float and double are specialized further down to
+// call fma(x, y, z) instead.
+
+template <typename T> static inline T multiply_add(T x, T y, T z) {
+  return x * y + z;
+}
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#if defined(LLVM_LIBC_FMA)
+
+// FMA instructions are available.
+// FMA.h must be included at global scope: the headers it pulls in open
+// namespace __llvm_libc themselves, and the x86_64 one includes <immintrin.h>.
+#include "FMA.h"
+
+namespace __llvm_libc {
+namespace fputil {
+
+template <> inline float multiply_add<float>(float x, float y, float z) {
+  return fma(x, y, z);
+}
+
+template <> inline double multiply_add<double>(double x, double y, double z) {
+  return fma(x, y, z);
+}
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_FMA
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h
--- a/libc/src/__support/FPUtil/x86_64/FMA.h
+++ b/libc/src/__support/FPUtil/x86_64/FMA.h
@@ -15,6 +15,10 @@
 #error "Invalid include"
 #endif
 
+#if !defined(LLVM_LIBC_FMA)
+#error "FMA instructions are not supported"
+#endif
+
 #include "src/__support/CPP/TypeTraits.h"
 #include <immintrin.h>
 
diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h
--- a/libc/src/__support/architectures.h
+++ b/libc/src/__support/architectures.h
@@ -37,7 +37,15 @@
 #define LLVM_LIBC_ARCH_ANY_ARM
 #endif
 
-#if defined(LLVM_LIBC_ARCH_X86_64)
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+#define LLVM_LIBC_FMA
+#elif defined(LLVM_LIBC_ARCH_X86_64)
+#if (defined(__AVX2__) || defined(__FMA__))
+#define LLVM_LIBC_FMA
+#endif
+#endif
+
+#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LLVM_LIBC_FMA))
 #define INLINE_FMA __attribute__((target("fma")))
 #else
 #define INLINE_FMA
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -48,8 +48,9 @@
     fmaf.h
   DEPENDS
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
-    -O2
+    -O3
     -mfma
 )
 
@@ -61,8 +62,9 @@
     fma.h
   DEPENDS
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
-    -O2
+    -O3
     -mfma
 )
 
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -478,6 +478,7 @@
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
     libc.include.math
   COMPILE_OPTIONS
     -O3
@@ -492,6 +493,7 @@
     ../exp2f.h
   DEPENDS
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
     libc.include.math
   COMPILE_OPTIONS
     -O3
@@ -507,6 +509,8 @@
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
     libc.include.math
   COMPILE_OPTIONS
     -O3
@@ -674,6 +678,8 @@
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
     -mfma
@@ -688,6 +694,8 @@
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
     -mfma
@@ -702,6 +710,7 @@
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.polyeval
     COMPILE_OPTIONS
     -O3
     -mfma
@@ -716,6 +725,8 @@
   DEPENDS
     .common_constants
     libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
   COMPILE_OPTIONS
     -O3
     -mfma
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -83,7 +83,7 @@
       //   = x otherwise.
       // To simplify the rounding decision and make it more efficient, we use
       //   fma(x, x, x) ~ x + x^2 instead.
-      return fputil::fma(x, x, x);
+      return fputil::multiply_add(x, x, x);
     }
 
     // 2^-25 <= |x| < 2^-4
@@ -96,7 +96,7 @@
         fputil::polyeval(xd, 0x1p-1, 0x1.55555555557ddp-3, 0x1.55555555552fap-5,
                          0x1.111110fcd58b7p-7, 0x1.6c16c1717660bp-10,
                          0x1.a0241f0006d62p-13, 0x1.a01e3f8d3c06p-16);
-    return static_cast<float>(fputil::fma(r, xsq, xd));
+    return static_cast<float>(fputil::multiply_add(r, xsq, xd));
   }
 
   // For -18 < x < 89, to compute expm1(x), we perform the following range
@@ -132,7 +132,7 @@
   double exp_lo =
       fputil::polyeval(xd, 0x1.0p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
                        0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
-  return static_cast<float>(fputil::fma(exp_hi_mid, exp_lo, -1.0));
+  return static_cast<float>(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0));
 }
 
 } // namespace __llvm_libc
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -170,7 +170,7 @@
   double d = static_cast<float>(xbits) - static_cast<float>(f);
   d *= ONE_OVER_F[f_index];
 
-  double extra_factor = fputil::fma(m, LOG10_2, LOG10_F[f_index]);
+  double extra_factor = fputil::multiply_add(m, LOG10_2, LOG10_F[f_index]);
 
   double r = fputil::polyeval(d, extra_factor, 0x1.bcb7b1526e4c5p-2,
                               -0x1.bcb7b1518a5e9p-3, 0x1.287a72a6f716p-3,
diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -66,7 +66,7 @@
   double d = static_cast<double>(xbits) - static_cast<double>(f);
   d *= ONE_OVER_F[f_index];
 
-  double extra_factor = fputil::fma(m, LOG_2, LOG_F[f_index]);
+  double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]);
 
   double r = fputil::polyeval(d, extra_factor, 0x1.fffffffffffacp-1,
                               -0x1.fffffffef9cb2p-2, 0x1.5555513bc679ap-2,
@@ -161,7 +161,7 @@
   // > fpminimax(log(1 + x)/x, 5, [|D...|], [-2^-8; 2^-8]);
   r = fputil::polyeval(xd, -0x1p-1, 0x1.5555555515551p-2, -0x1.ffffffff82bdap-3,
                        0x1.999b33348d3aep-3, -0x1.5556cae3adcc3p-3);
-  return static_cast<float>(fputil::fma(r, xd * xd, xd));
+  return static_cast<float>(fputil::multiply_add(r, xd * xd, xd));
 }
 
 } // namespace __llvm_libc
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -120,7 +120,7 @@
   d *= ONE_OVER_F[f_index];
 
   double extra_factor =
-      fputil::fma(static_cast<double>(m), LOG_2, LOG_F[f_index]);
+      fputil::multiply_add(static_cast<double>(m), LOG_2, LOG_F[f_index]);
 
   double r = __llvm_libc::fputil::polyeval(
       d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2,
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -196,6 +196,54 @@
     ],
 )
 
+fma_common_hdrs = [
+    "src/__support/FPUtil/FMA.h",
+    "src/__support/FPUtil/generic/FMA.h",
+]
+
+fma_hdrs = selects.with_or({
+    "//conditions:default": fma_common_hdrs,
+    PLATFORM_CPU_X86_64: fma_common_hdrs + [
+        "src/__support/FPUtil/x86_64/FMA.h",
+    ],
+    PLATFORM_CPU_ARM64: fma_common_hdrs + [
+        "src/__support/FPUtil/aarch64/FMA.h",
+    ],
+})
+
+cc_library(
+    name = "__support_fputil_fma",
+    hdrs = fma_hdrs,
+    deps = [
+        ":__support_common",
+        ":__support_cpp_bit",
+        ":__support_cpp_type_traits",
+        ":__support_fputil",
+        ":libc_root",
+    ],
+)
+
+cc_library(
+    name = "__support_fputil_multiply_add",
+    hdrs = [
+        "src/__support/FPUtil/multiply_add.h",
+    ],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_fma",
+    ],
+)
+
+cc_library(
+    name = "__support_fputil_polyeval",
+    hdrs = [
+        "src/__support/FPUtil/PolyEval.h",
+    ],
+    deps = [
+        ":__support_fputil_multiply_add",
+    ],
+)
+
 ################################ fenv targets ################################
 
 libc_function(