diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -131,8 +131,14 @@
 
 # Special flags
 set(FMA_OPT_FLAG "FMA_OPT")
+set(ROUND_OPT_FLAG "ROUND_OPT")
 
 # Skip FMA_OPT flag for targets that don't support fma.
 if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")))
   set(SKIP_FLAG_EXPANSION_FMA_OPT TRUE)
 endif()
+
+# Skip ROUND_OPT flag for targets that don't support SSE 4.2.
+if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")))
+  set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
+endif()
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -9,6 +9,16 @@
     set(ADD_FMA_FLAG TRUE)
   endif()
 
+  # Record whether -msse4.2 should be added: ROUND_OPT (or ROUND_OPT__ONLY)
+  # must be present in `flags` and LIBC_CPU_FEATURES must list SSE4_2.
+  list(FIND flags ${ROUND_OPT_FLAG} round)
+  if(${round} LESS 0)
+    list(FIND flags "${ROUND_OPT_FLAG}__ONLY" round)
+  endif()
+  if((${round} GREATER -1) AND (LIBC_CPU_FEATURES MATCHES "SSE4_2"))
+    set(ADD_SSE4_2_FLAG TRUE)
+  endif()
+
   set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${ARGN})
   if(NOT ${LIBC_TARGET_OS} STREQUAL "windows")
     set(compile_options ${compile_options} -fpie -ffreestanding -fno-builtin)
@@ -21,6 +31,9 @@
     if(ADD_FMA_FLAG)
       list(APPEND compile_options "-mfma")
     endif()
+    if(ADD_SSE4_2_FLAG)
+      list(APPEND compile_options "-msse4.2")
+    endif()
   elseif(MSVC)
     list(APPEND compile_options "/EHs-c-")
     list(APPEND compile_options "/GR-")
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -75,6 +75,8 @@
     nearest_integer.h
   DEPENDS
     libc.src.__support.common
+  FLAGS
+    ROUND_OPT
 )
 
 add_subdirectory(generic)
diff --git a/libc/src/__support/FPUtil/aarch64/nearest_integer.h b/libc/src/__support/FPUtil/aarch64/nearest_integer.h
--- a/libc/src/__support/FPUtil/aarch64/nearest_integer.h
+++ b/libc/src/__support/FPUtil/aarch64/nearest_integer.h
@@ -18,6 +18,14 @@
 namespace __llvm_libc {
 namespace fputil {
 
+// Single-precision overload: FRINTN rounds x to an integral value, to nearest
+// with ties to even.
+static inline float nearest_integer(float x) {
+  float result;
+  __asm__ __volatile__("frintn %s0, %s1\n\t" : "=w"(result) : "w"(x));
+  return result;
+}
+
 static inline double nearest_integer(double x) {
   double result;
   __asm__ __volatile__("frintn %d0, %d1\n\t" : "=w"(result) : "w"(x));
diff --git a/libc/src/__support/FPUtil/nearest_integer.h b/libc/src/__support/FPUtil/nearest_integer.h
--- a/libc/src/__support/FPUtil/nearest_integer.h
+++ b/libc/src/__support/FPUtil/nearest_integer.h
@@ -28,6 +28,26 @@
 // Notice that for AARCH64 and x86-64 with SSE4.2 support, we will use their
 // corresponding rounding instruction instead. And in those cases, the results
 // are rounded to the nearest integer, tie-to-even.
+static inline float nearest_integer(float x) {
+  if (x < 0x1p24f && x > -0x1p24f) {
+    // The constants must be float literals.  With double literals the sums are
+    // evaluated in double precision, whose 53-bit significand holds values
+    // such as 2.5 + 2^23 exactly, so the fractional part would never be
+    // rounded away and r would simply equal x.
+    float r = x < 0 ? (x - 0x1.0p23f) + 0x1.0p23f : (x + 0x1.0p23f) - 0x1.0p23f;
+    float diff = x - r;
+    // The expression above is correct for the default rounding mode, round-to-
+    // nearest, tie-to-even. For other rounding modes, it might be off by 1,
+    // which is corrected below.
+    if (unlikely(diff > 0.5f))
+      return r + 1.0f;
+    if (unlikely(diff < -0.5f))
+      return r - 1.0f;
+    return r;
+  }
+  return x;
+}
+
 static inline double nearest_integer(double x) {
   if (x < 0x1p53 && x > -0x1p53) {
     double r = x < 0 ? (x - 0x1.0p52) + 0x1.0p52 : (x + 0x1.0p52) - 0x1.0p52;
diff --git a/libc/src/__support/FPUtil/x86_64/nearest_integer.h b/libc/src/__support/FPUtil/x86_64/nearest_integer.h
--- a/libc/src/__support/FPUtil/x86_64/nearest_integer.h
+++ b/libc/src/__support/FPUtil/x86_64/nearest_integer.h
@@ -24,6 +24,17 @@
 namespace __llvm_libc {
 namespace fputil {
 
+// Single-precision overload: rounds x to the nearest integral value, ties to
+// even, via _mm_round_ss.  The rounding-control operand is built from the
+// _MM_FROUND_* constants; _MM_FROUND_NO_EXC suppresses exceptions.
+static inline float nearest_integer(float x) {
+  __m128 xmm = _mm_set_ss(x); // NOLINT
+  __m128 ymm = _mm_round_ss(
+      xmm, xmm, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // NOLINT
+  return ymm[0];
+}
+
 static inline double nearest_integer(double x) {
   __m128d xmm = _mm_set_sd(x); // NOLINT
   __m128d ymm =