diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -14,6 +14,7 @@
     PlatformDefs.h
     UInt.h
     XFloat.h
+    clz.h
   DEPENDS
     libc.include.math
     libc.include.errno
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -27,11 +27,7 @@
 namespace __llvm_libc {
 namespace fputil {
 
-// We have a generic implementation available only for single precision fma as
-// we restrict it to float values for now.
-template <typename T>
-static inline cpp::EnableIfType<cpp::IsSame<T, float>::Value, T> fma(T x, T y,
-                                                                     T z) {
+template <typename T> static inline T fma(T x, T y, T z) {
   return generic::fma(x, y, z);
 }
 
diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h
--- a/libc/src/__support/FPUtil/Hypot.h
+++ b/libc/src/__support/FPUtil/Hypot.h
@@ -12,6 +12,7 @@
 #include "BasicOperations.h"
 #include "FEnvImpl.h"
 #include "FPBits.h"
+#include "clz.h"
 #include "src/__support/CPP/Bit.h"
 #include "src/__support/CPP/TypeTraits.h"
 
@@ -23,18 +24,12 @@
 template <typename T>
 static inline T find_leading_one(T mant, int &shift_length);
 
-// The following overloads are matched based on what is accepted by
-// __builtin_clz* rather than using the exactly-sized aliases from stdint.h
-// (such as uint32_t). There are 3 overloads even though 2 will only ever be
-// used by a specific platform, since unsigned long varies in size depending on
-// the word size of the architecture.
-
 template <>
 inline unsigned int find_leading_one<unsigned int>(unsigned int mant,
                                                    int &shift_length) {
   shift_length = 0;
   if (mant > 0) {
-    shift_length = (sizeof(mant) * 8) - 1 - __builtin_clz(mant);
+    shift_length = (sizeof(mant) * 8) - 1 - clz(mant);
   }
   return 1U << shift_length;
 }
@@ -44,7 +39,7 @@
                                                      int &shift_length) {
   shift_length = 0;
   if (mant > 0) {
-    shift_length = (sizeof(mant) * 8) - 1 - __builtin_clzl(mant);
+    shift_length = (sizeof(mant) * 8) - 1 - clz(mant);
   }
   return 1UL << shift_length;
 }
@@ -55,7 +50,7 @@
                                      int &shift_length) {
   shift_length = 0;
   if (mant > 0) {
-    shift_length = (sizeof(mant) * 8) - 1 - __builtin_clzll(mant);
+    shift_length = (sizeof(mant) * 8) - 1 - clz(mant);
   }
   return 1ULL << shift_length;
 }
diff --git a/libc/src/__support/FPUtil/clz.h b/libc/src/__support/FPUtil/clz.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/FPUtil/clz.h
@@ -0,0 +1,33 @@
+//===--Convenient template for CLZ (Count Lead Zeroes) builtins--*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_CLZ_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_CLZ_H
+
+namespace __llvm_libc {
+namespace fputil {
+
+// Count the leading zero bits of `val`. The explicit specializations below are
+// keyed on the types accepted by __builtin_clz* rather than the exactly-sized
+// aliases from stdint.h, so no assumption about integer sizes is needed.
+// NOTE: like the builtins, the result is undefined when val == 0.
+template <typename T> static inline int clz(T val);
+template <> inline int clz<unsigned int>(unsigned int val) {
+  return __builtin_clz(val);
+}
+template <> inline int clz<unsigned long int>(unsigned long int val) {
+  return __builtin_clzl(val);
+}
+template <> inline int clz<unsigned long long int>(unsigned long long int val) {
+  return __builtin_clzll(val);
+}
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_CLZ_H
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -9,16 +9,21 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H
 #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H
 
+#include "src/__support/CPP/Bit.h"
 #include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/FloatProperties.h"
+#include "src/__support/FPUtil/clz.h"
+#include "src/__support/common.h"
 
 namespace __llvm_libc {
 namespace fputil {
 namespace generic {
 
-template <typename T>
-static inline cpp::EnableIfType<cpp::IsSame<T, float>::Value, T> fma(T x, T y,
-                                                                     T z) {
+template <typename T> static inline T fma(T x, T y, T z);
+
+template <> inline float fma<float>(float x, float y, float z) {
   // Product is exact.
   double prod = static_cast<double>(x) * static_cast<double>(y);
   double z_d = static_cast<double>(z);
@@ -66,6 +71,211 @@
   return static_cast<float>(static_cast<double>(bit_sum));
 }
 
+namespace internal {
+
+// Shift `mant` right by `shift_length` bits and report whether any discarded
+// bit was set (the sticky bits). All callers pass a non-zero `mant`.
+static inline bool shift_mantissa(int shift_length, __uint128_t &mant) {
+  if (shift_length >= 128) {
+    mant = 0;
+    return true; // Everything is shifted out and `mant` was non-zero.
+  }
+  __uint128_t mask = (__uint128_t(1) << shift_length) - 1;
+  bool sticky_bits = (mant & mask) != 0;
+  mant >>= shift_length;
+  return sticky_bits;
+}
+
+} // namespace internal
+
+// Generic double-precision fma(x, y, z) = x * y + z, rounded once according
+// to the current rounding mode, using 128-bit integer mantissa arithmetic.
+template <> inline double fma<double>(double x, double y, double z) {
+  using FPBits = fputil::FPBits<double>;
+  using FloatProp = fputil::FloatProperties<double>;
+
+  if (unlikely(x == 0 || y == 0 || z == 0))
+    return x * y + z;
+
+  int x_exp = 0;
+  int y_exp = 0;
+  int z_exp = 0;
+
+  // Normalize denormal inputs.
+  if (unlikely(FPBits(x).get_unbiased_exponent() == 0)) {
+    x_exp -= 52;
+    x *= 0x1.0p+52;
+  }
+  if (unlikely(FPBits(y).get_unbiased_exponent() == 0)) {
+    y_exp -= 52;
+    y *= 0x1.0p+52;
+  }
+  if (unlikely(FPBits(z).get_unbiased_exponent() == 0)) {
+    z_exp -= 52;
+    z *= 0x1.0p+52;
+  }
+
+  FPBits x_bits(x), y_bits(y), z_bits(z);
+  bool x_sign = x_bits.get_sign();
+  bool y_sign = y_bits.get_sign();
+  bool z_sign = z_bits.get_sign();
+  bool prod_sign = x_sign != y_sign;
+  x_exp += x_bits.get_unbiased_exponent();
+  y_exp += y_bits.get_unbiased_exponent();
+  z_exp += z_bits.get_unbiased_exponent();
+
+  if (unlikely(x_exp == FPBits::MAX_EXPONENT || y_exp == FPBits::MAX_EXPONENT ||
+               z_exp == FPBits::MAX_EXPONENT))
+    return x * y + z;
+
+  // Extract mantissa and append hidden leading bits.
+  __uint128_t x_mant = x_bits.get_mantissa() | FPBits::MIN_NORMAL;
+  __uint128_t y_mant = y_bits.get_mantissa() | FPBits::MIN_NORMAL;
+  __uint128_t z_mant = z_bits.get_mantissa() | FPBits::MIN_NORMAL;
+
+  // If the exponent of the product x*y > the exponent of z, then no extra
+  // precision beside the entire product x*y is needed.  On the other hand, when
+  // the exponent of z >= the exponent of the product x*y, the worst-case that
+  // we need extra precision is when there is cancellation and the most
+  // significant bit of the product is aligned exactly with the second most
+  // significant bit of z:
+  //      z :    10aa...a
+  // - prod :     1bb...bb....b
+  // In that case, in order to store the exact result, we need at least
+  //   (Length of prod) - (MantissaLength of z) = 2*(52 + 1) - 52 = 54.
+  // Overall, before aligning the mantissas and exponents, we can simply left-
+  // shift the mantissa of z by at least 54, and left-shift the product of x*y
+  // by (that amount - 52).  After that, it is enough to align the least
+  // significant bit, given that we keep track of the round and sticky bits
+  // after the least significant bit.
+  // We pick shifting z_mant by 64 bits so that technically we can simply use
+  // the original mantissa as high part when constructing 128-bit z_mant.
+
+  __uint128_t prod_mant = x_mant * y_mant << 10;
+  int prod_lsb_exp =
+      x_exp + y_exp -
+      (FPBits::EXPONENT_BIAS + 2 * MantissaWidth<double>::VALUE + 10);
+
+  z_mant <<= 64;
+  int z_lsb_exp = z_exp - (MantissaWidth<double>::VALUE + 64);
+  bool round_bit = false;
+  bool sticky_bits = false;
+  bool z_shifted = false;
+
+  // Align exponents.
+  if (prod_lsb_exp < z_lsb_exp) {
+    sticky_bits = internal::shift_mantissa(z_lsb_exp - prod_lsb_exp, prod_mant);
+    prod_lsb_exp = z_lsb_exp;
+  } else if (z_lsb_exp < prod_lsb_exp) {
+    z_shifted = true;
+    sticky_bits = internal::shift_mantissa(prod_lsb_exp - z_lsb_exp, z_mant);
+  }
+
+  // Perform the addition:
+  //   (-1)^prod_sign * prod_mant + (-1)^z_sign * z_mant.
+  // The final result will be stored in prod_sign and prod_mant.
+  if (prod_sign == z_sign) {
+    // Effectively an addition.
+    prod_mant += z_mant;
+  } else {
+    // Subtraction cases.
+    if (prod_mant >= z_mant) {
+      // If z lost sticky bits in the shift, subtract 1 more so that the sticky
+      // bits remain a positive contribution; this simplifies the rounding.
+      if (z_shifted && sticky_bits)
+        ++z_mant;
+      prod_mant -= z_mant;
+    } else {
+      // Same adjustment when the product is the one that lost sticky bits.
+      if (!z_shifted && sticky_bits)
+        ++prod_mant;
+      prod_mant = z_mant - prod_mant;
+      prod_sign = z_sign;
+    }
+  }
+
+  uint64_t result = 0;
+  int r_exp = 0; // Unbiased exponent of the result
+  int round_mode = fputil::get_round();
+
+  // Normalize the result.
+  if (prod_mant != 0) {
+    uint64_t prod_hi = static_cast<uint64_t>(prod_mant >> 64);
+    int lead_zeros =
+        prod_hi ? clz(prod_hi) : 64 + clz(static_cast<uint64_t>(prod_mant));
+    // Move the leading 1 to the most significant bit.
+    prod_mant <<= lead_zeros;
+    // The lower 64 bits are always sticky bits after moving the leading 1 to
+    // the most significant bit.
+    sticky_bits |= (static_cast<uint64_t>(prod_mant) != 0);
+    result = static_cast<uint64_t>(prod_mant >> 64);
+    // Make prod_lsb_exp the exponent of the least significant bit of result.
+    prod_lsb_exp += 64 - lead_zeros;
+    r_exp = prod_lsb_exp + 63;
+
+    if (r_exp > 0) {
+      // The result is normal.  We will shift the mantissa to the right by
+      // 63 - 52 = 11 bits (from the locations of the most significant bit).
+      // Then the rounding bit will correspond to the 11th bit, and the lowest
+      // 10 bits are merged into sticky bits.
+      round_bit = (result & 0x0400ULL) != 0;
+      sticky_bits |= (result & 0x03ffULL) != 0;
+      result >>= 11;
+    } else {
+      if (r_exp < -52) {
+        // The result is smaller than 1/2 of the smallest denormal number.
+        sticky_bits = true; // since the result is non-zero.
+        result = 0;
+      } else {
+        // The result is denormal.
+        uint64_t mask = 1ULL << (11 - r_exp);
+        round_bit = (result & mask) != 0;
+        sticky_bits |= (result & (mask - 1)) != 0;
+        if (r_exp > -52)
+          result >>= 12 - r_exp;
+        else
+          result = 0;
+      }
+
+      r_exp = 0;
+    }
+  } else {
+    // Exact cancellation (x*y == -z): IEEE 754 requires -0.0 when rounding
+    // downward and +0.0 in every other rounding mode.
+    prod_sign = (round_mode == FE_DOWNWARD);
+  }
+
+  // Finalize the result.
+  if (unlikely(r_exp >= FPBits::MAX_EXPONENT)) {
+    if ((round_mode == FE_TOWARDZERO) ||
+        (round_mode == FE_UPWARD && prod_sign) ||
+        (round_mode == FE_DOWNWARD && !prod_sign)) {
+      result = FPBits::MAX_NORMAL;
+      return prod_sign ? -bit_cast<double>(result) : bit_cast<double>(result);
+    }
+    return prod_sign ? static_cast<double>(FPBits::neg_inf())
+                     : static_cast<double>(FPBits::inf());
+  }
+
+  // Remove hidden bit and append the exponent field and sign bit.
+  result = (result & FloatProp::MANTISSA_MASK) |
+           (static_cast<uint64_t>(r_exp) << FloatProp::MANTISSA_WIDTH);
+  if (prod_sign)
+    result |= FloatProp::SIGN_MASK;
+
+  // Rounding.
+  if (round_mode == FE_TONEAREST) {
+    if (round_bit && (sticky_bits || ((result & 1) != 0)))
+      ++result;
+  } else if ((round_mode == FE_UPWARD && !prod_sign) ||
+             (round_mode == FE_DOWNWARD && prod_sign)) {
+    if (round_bit || sticky_bits)
+      ++result;
+  }
+
+  return bit_cast<double>(result);
+}
+
 } // namespace generic
 } // namespace fputil
 } // namespace __llvm_libc
diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h
--- a/libc/src/__support/FPUtil/generic/sqrt.h
+++ b/libc/src/__support/FPUtil/generic/sqrt.h
@@ -15,6 +15,7 @@
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PlatformDefs.h"
+#include "src/__support/FPUtil/clz.h"
 
 namespace __llvm_libc {
 namespace fputil {
@@ -31,21 +32,6 @@
 };
 #endif // SPECIAL_X86_LONG_DOUBLE
 
-// The following overloads are matched based on what is accepted by
-// __builtin_clz* rather than using the exactly-sized aliases from stdint.h.
-// This way, we can avoid making any assumptions about integer sizes and let the
-// compiler match for us.
-template <typename T> static inline int clz(T val);
-template <> inline int clz<unsigned int>(unsigned int val) {
-  return __builtin_clz(val);
-}
-template <> inline int clz<unsigned long int>(unsigned long int val) {
-  return __builtin_clzl(val);
-}
-template <> inline int clz<unsigned long long int>(unsigned long long int val) {
-  return __builtin_clzll(val);
-}
-
 template <typename T>
 static inline void normalize(int &exponent,
                              typename FPBits<T>::UIntType &mantissa) {
diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
--- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
+++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
@@ -12,6 +12,7 @@
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PlatformDefs.h"
+#include "src/__support/FPUtil/clz.h"
 
 namespace __llvm_libc {
 namespace fputil {
@@ -19,7 +20,7 @@
 
 inline void normalize(int &exponent, __uint128_t &mantissa) {
   const int shift =
-      __builtin_clzll(static_cast<uint64_t>(mantissa)) -
+      clz(static_cast<uint64_t>(mantissa)) -
       (8 * sizeof(uint64_t) - 1 - MantissaWidth<long double>::VALUE);
   exponent -= shift;
   mantissa <<= shift;
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1177,6 +1177,18 @@
     libc.src.__support.FPUtil.fputil
 )
 
+add_fp_unittest(
+  fma_no_fma_test
+  NEED_MPFR
+  SUITE
+    libc_math_unittests
+  SRCS
+    fma_no_fma_test.cpp
+  DEPENDS
+    libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.fma
+)
+
 add_fp_unittest(
   tan_test
   NEED_MPFR
diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h
--- a/libc/test/src/math/FmaTest.h
+++ b/libc/test/src/math/FmaTest.h
@@ -72,9 +72,9 @@
          v += STEP, w -= STEP) {
       T x = T(FPBits(get_random_bit_pattern())), y = T(FPBits(v)),
         z = T(FPBits(w));
-      T result = func(x, y, z);
       mpfr::TernaryInput<T> input{x, y, z};
-      ASSERT_MPFR_MATCH(mpfr::Operation::Fma, input, result, 0.5);
+      ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fma, input, func(x, y, z),
+                                     0.5);
     }
   }
 
@@ -86,9 +86,9 @@
          v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w)),
         z = T(FPBits(get_random_bit_pattern()));
-      T result = func(x, y, z);
       mpfr::TernaryInput<T> input{x, y, z};
-      ASSERT_MPFR_MATCH(mpfr::Operation::Fma, input, result, 0.5);
+      ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fma, input, func(x, y, z),
+                                     0.5);
     }
   }
 };
diff --git a/libc/test/src/math/fma_no_fma_test.cpp b/libc/test/src/math/fma_no_fma_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/math/fma_no_fma_test.cpp
@@ -0,0 +1,28 @@
+//===-- Unittests for fma without FMA instructions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/architectures.h"
+#ifdef LIBC_TARGET_HAS_FMA
+#undef LIBC_TARGET_HAS_FMA
+#endif // LIBC_TARGET_HAS_FMA
+
+#include "FmaTest.h"
+
+#include "src/__support/FPUtil/FMA.h"
+
+using LlvmLibcFmaNoFMATest = FmaTestTemplate<double>;
+
+static auto func = [](double x, double y, double z) -> double {
+  return __llvm_libc::fputil::fma(x, y, z);
+};
+
+TEST_F(LlvmLibcFmaNoFMATest, SpecialNumbers) { test_special_numbers(func); }
+
+TEST_F(LlvmLibcFmaNoFMATest, SubnormalRange) { test_subnormal_range(func); }
+
+TEST_F(LlvmLibcFmaNoFMATest, NormalRange) { test_normal_range(func); }
diff --git a/libc/test/src/math/fmaf_test.cpp b/libc/test/src/math/fmaf_test.cpp
--- a/libc/test/src/math/fmaf_test.cpp
+++ b/libc/test/src/math/fmaf_test.cpp
@@ -10,14 +10,14 @@
 
 #include "src/math/fmaf.h"
 
-using LlvmLibcFmaTest = FmaTestTemplate<float>;
+using LlvmLibcFmafTest = FmaTestTemplate<float>;
 
-TEST_F(LlvmLibcFmaTest, SpecialNumbers) {
+TEST_F(LlvmLibcFmafTest, SpecialNumbers) {
   test_special_numbers(&__llvm_libc::fmaf);
 }
 
-TEST_F(LlvmLibcFmaTest, SubnormalRange) {
+TEST_F(LlvmLibcFmafTest, SubnormalRange) {
   test_subnormal_range(&__llvm_libc::fmaf);
 }
 
-TEST_F(LlvmLibcFmaTest, NormalRange) { test_normal_range(&__llvm_libc::fmaf); }
+TEST_F(LlvmLibcFmafTest, NormalRange) { test_normal_range(&__llvm_libc::fmaf); }