diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -131,8 +131,14 @@
 
 # Special flags
 set(FMA_OPT_FLAG "FMA_OPT")
+set(ROUND_OPT_FLAG "ROUND_OPT")
 
 # Skip FMA_OPT flag for targets that don't support fma.
 if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")))
   set(SKIP_FLAG_EXPANSION_FMA_OPT TRUE)
 endif()
+
+# Skip ROUND_OPT unless the target is x86 and its CPU features list SSE4_2.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_X86 OR NOT (LIBC_CPU_FEATURES MATCHES "SSE4_2"))
+  set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
+endif()
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -9,6 +9,14 @@
     set(ADD_FMA_FLAG TRUE)
   endif()
 
+  list(FIND flags ${ROUND_OPT_FLAG} round_opt_idx)  # -1 when not listed
+  if(round_opt_idx EQUAL -1)
+    list(FIND flags "${ROUND_OPT_FLAG}__ONLY" round_opt_idx)
+  endif()
+  if((NOT round_opt_idx EQUAL -1) AND (LIBC_CPU_FEATURES MATCHES "SSE4_2"))
+    set(ADD_SSE4_2_FLAG TRUE)
+  endif()
+
   set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${ARGN})
   if(NOT ${LIBC_TARGET_OS} STREQUAL "windows")
     set(compile_options ${compile_options} -fpie -ffreestanding -fno-builtin)
@@ -21,6 +29,9 @@
     if(ADD_FMA_FLAG)
       list(APPEND compile_options "-mfma")
     endif()
+    if(ADD_SSE4_2_FLAG)  # set when ROUND_OPT is in flags and SSE4_2 is listed
+      list(APPEND compile_options "-msse4.2")
+    endif()
   elseif(MSVC)
     list(APPEND compile_options "/EHs-c-")
     list(APPEND compile_options "/GR-")
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -75,6 +75,8 @@
     nearest_integer.h
   DEPENDS
     libc.src.__support.common
+  FLAGS
+    ROUND_OPT
 )
 
 add_subdirectory(generic)
diff --git a/libc/src/__support/FPUtil/aarch64/nearest_integer.h b/libc/src/__support/FPUtil/aarch64/nearest_integer.h
--- a/libc/src/__support/FPUtil/aarch64/nearest_integer.h
+++ b/libc/src/__support/FPUtil/aarch64/nearest_integer.h
@@ -18,6 +18,12 @@
 namespace __llvm_libc {
 namespace fputil {
 
+static inline float nearest_integer(float x) {
+  float rounded; // Receives FRINTN's single-precision result for x.
+  __asm__ __volatile__("frintn %s0, %s1\n\t" : "=w"(rounded) : "w"(x));
+  return rounded;
+}
+
 static inline double nearest_integer(double x) {
   double result;
   __asm__ __volatile__("frintn %d0, %d1\n\t" : "=w"(result) : "w"(x));
diff --git a/libc/src/__support/FPUtil/nearest_integer.h b/libc/src/__support/FPUtil/nearest_integer.h
--- a/libc/src/__support/FPUtil/nearest_integer.h
+++ b/libc/src/__support/FPUtil/nearest_integer.h
@@ -28,6 +28,22 @@
 // Notice that for AARCH64 and x86-64 with SSE4.2 support, we will use their
 // corresponding rounding instruction instead.  And in those cases, the results
 // are rounded to the nearest integer, tie-to-even.
+static inline float nearest_integer(float x) {
+  if (x < 0x1p24f && x > -0x1p24f) {
+    float r = x < 0 ? (x - 0x1.0p23f) + 0x1.0p23f : (x + 0x1.0p23f) - 0x1.0p23f;
+    float diff = x - r;
+    // The f-suffixed constants keep the sums in float, so adding 2^23 drops
+    // the fraction bits (exact for round-to-nearest, tie-to-even).  In other
+    // rounding modes r might be off by 1, which is corrected below.
+    if (unlikely(diff > 0.5f))
+      return r + 1.0f;
+    if (unlikely(diff < -0.5f))
+      return r - 1.0f;
+    return r;
+  }
+  return x;
+}
+
 static inline double nearest_integer(double x) {
   if (x < 0x1p53 && x > -0x1p53) {
     double r = x < 0 ? (x - 0x1.0p52) + 0x1.0p52 : (x + 0x1.0p52) - 0x1.0p52;
diff --git a/libc/src/__support/FPUtil/x86_64/nearest_integer.h b/libc/src/__support/FPUtil/x86_64/nearest_integer.h
--- a/libc/src/__support/FPUtil/x86_64/nearest_integer.h
+++ b/libc/src/__support/FPUtil/x86_64/nearest_integer.h
@@ -24,6 +24,13 @@
 namespace __llvm_libc {
 namespace fputil {
 
+static inline float nearest_integer(float x) {
+  // The rounding immediate takes _MM_FROUND_* macros, not MXCSR _MM_ROUND_*.
+  __m128 xmm = _mm_set_ss(x); // NOLINT
+  return _mm_cvtss_f32(_mm_round_ss( // NOLINT
+      xmm, xmm, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); // NOLINT
+}
+
 static inline double nearest_integer(double x) {
   __m128d xmm = _mm_set_sd(x); // NOLINT
   __m128d ymm =