diff --git a/libc/docs/math.rst b/libc/docs/math.rst --- a/libc/docs/math.rst +++ b/libc/docs/math.rst @@ -154,7 +154,7 @@ ============== ================ =============== ====================== (float) (double) (long double) ============== ================ =============== ====================== -cos 0.776 ULPs large +cos |check| large exp |check| exp2 |check| expm1 |check| @@ -197,7 +197,7 @@ | +-----------+-------------------+-----------+-------------------+ +------------+-------------------------+--------------+---------------+ | | LLVM libc | Reference (glibc) | LLVM libc | Reference (glibc) | | CPU | OS | Compiler | Special flags | +==============+===========+===================+===========+===================+=====================================+============+=========================+==============+===============+ -| cosf | 37 | 32 | 73 | 72 | :math:`[0, 2\pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | | +| cosf | 14 | 32 | 56 | 59 | :math:`[0, 2\pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ | expf | 9 | 7 | 44 | 38 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -62,10 +62,17 @@ cosf.cpp HDRS ../cosf.h + range_reduction.h + range_reduction_fma.h DEPENDS - .sincosf_utils + .common_constants libc.include.math libc.src.errno.errno + libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 ) diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -7,59 +7,184 @@ //===----------------------------------------------------------------------===// #include "src/math/cosf.h" -#include "math_utils.h" -#include "sincosf_utils.h" - +#include "common_constants.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" #include "src/__support/common.h" -#include -#include +#include + +#if defined(LIBC_TARGET_HAS_FMA) +#include "range_reduction_fma.h" +// using namespace __llvm_libc::fma; +using __llvm_libc::fma::FAST_PASS_BOUND; +using __llvm_libc::fma::large_range_reduction; +using __llvm_libc::fma::small_range_reduction; +#else +#include "range_reduction.h" +// using namespace __llvm_libc::generic; +using __llvm_libc::generic::FAST_PASS_BOUND; +using __llvm_libc::generic::large_range_reduction; +using __llvm_libc::generic::small_range_reduction; +#endif namespace __llvm_libc { -// Fast cosf implementation. Worst-case ULP is 0.5607, maximum relative -// error is 0.5303 * 2^-23. A single-step range reduction is used for -// small values. Large inputs have their range reduced using fast integer -// arithmetic. -LLVM_LIBC_FUNCTION(float, cosf, (float y)) { - double x = y; - double s; - int n; - const sincos_t *p = &SINCOSF_TABLE[0]; - - if (abstop12(y) < abstop12(PIO4)) { - double x2 = x * x; - - if (unlikely(abstop12(y) < abstop12(as_float(0x39800000)))) - return 1.0f; - - return sinf_poly(x, x2, p, 1); - } else if (likely(abstop12(y) < abstop12(120.0f))) { - x = reduce_fast(x, p, &n); - - // Setup the signs for sin and cos. - s = p->sign[n & 3]; - - if (n & 2) - p = &SINCOSF_TABLE[1]; - - return sinf_poly(x * s, x * x, p, n ^ 1); - } else if (abstop12(y) < abstop12(INFINITY)) { - uint32_t xi = as_uint32_bits(y); - int sign = xi >> 31; - - x = reduce_large(xi, &n); - - // Setup signs for sin and cos - include original sign. - s = p->sign[(n + sign) & 3]; +// Exceptional cases for cosf. +static constexpr int COSF_EXCEPTS = 6; + +static constexpr fputil::ExceptionalValues CosfExcepts{ + /* inputs */ { + 0x55325019, // x = 0x1.64a032p43 + 0x5922aa80, // x = 0x1.4555p51 + 0x5aa4542c, // x = 0x1.48a858p54 + 0x5f18b878, // x = 0x1.3170fp63 + 0x6115cb11, // x = 0x1.2b9622p67 + 0x7beef5ef, // x = 0x1.ddebdep120 + }, + /* outputs (RZ, RU offset, RD offset, RN offset) */ + { + {0x3f4ea5d2, 1, 0, 0}, // x = 0x1.64a032p43, cos(x) = 0x1.9d4ba4p-1 (RZ) + {0x3f08aebe, 1, 0, 1}, // x = 0x1.4555p51, cos(x) = 0x1.115d7cp-1 (RZ) + {0x3efa40a4, 1, 0, 0}, // x = 0x1.48a858p54, cos(x) = 0x1.f48148p-2 (RZ) + {0x3f7f14bb, 1, 0, 0}, // x = 0x1.3170fp63, cos(x) = 0x1.fe2976p-1 (RZ) + {0x3f78142e, 1, 0, 1}, // x = 0x1.2b9622p67, cos(x) = 0x1.f0285cp-1 (RZ) + {0x3f08a21c, 1, 0, + 0}, // x = 0x1.ddebdep120, cos(x) = 0x1.114438p-1 (RZ) + }}; + +LLVM_LIBC_FUNCTION(float, cosf, (float x)) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + xbits.set_sign(false); + + uint32_t x_abs = xbits.uintval(); + double xd = static_cast(xbits.get_val()); + + // Range reduction: + // For |x| > pi/16, we perform range reduction as follows: + // Find k and y such that: + // x = (k + y) * pi/16 + // k is an integer + // |y| < 0.5 + // For small range (|x| < 2^46 when FMA instructions are available, 2^22 + // otherwise), this is done by performing: + // k = round(x * 16/pi) + // y = x * 16/pi - k + // For large range, we will omit all the higher parts of 16/pi such that the + // least significant bits of their full products with x are larger than 31, + // since cos((k + y + 32*i) * pi/16) = cos(x + i * 2pi) = cos(x). + // + // When FMA instructions are not available, we store the digits of 16/pi in + // chunks of 28-bit precision. This will make sure that the products: + // x * SIXTEEN_OVER_PI_28[i] are all exact. + // When FMA instructions are available, we simply store the digits of 16/pi in + // chunks of doubles (53-bit of precision). + // So when multiplying by the largest values of single precision, the + // resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the + // worst-case analysis of range reduction, |y| >= 2^-38, so this should give + // us more than 40 bits of accuracy. For the worst-case estimation of range + // reduction, see for instances: + // Elementary Functions by J-M. Muller, Chapter 11, + // Handbook of Floating-Point Arithmetic by J-M. Muller et. al., + // Chapter 10.2. + // + // Once k and y are computed, we then deduce the answer by the cosine of sum + // formula: + // cos(x) = cos((k + y)*pi/16) + // = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16) + // The values of sin(k*pi/16) and cos(k*pi/16) for k = 0..31 are precomputed + // and stored using a vector of 32 doubles. Sin(y*pi/16) and cos(y*pi/16) are + // computed using degree-7 and degree-8 minimax polynomials generated by + // Sollya respectively. + + // |x| < 0x1.0p-12f + if (unlikely(x_abs < 0x3980'0000U)) { + // When |x| < 2^-12, the relative error of the approximation cos(x) ~ 1 + // is: + // |cos(x) - 1| < |x^2 / 2| = 2^-25 < epsilon(1)/2. + // So the correctly rounded values of cos(x) are: + // = 1 - eps(x) if rounding mode = FE_TOWARDZERO or FE_DOWWARD, + // = 1 otherwise. + // To simplify the rounding decision and make it more efficient and to + // prevent compiler to perform constant folding, we use + // fma(x, -2^-25, 1) instead. + // Note: to use the formula 1 - 2^-25*x to decide the correct rounding, we + // do need fma(x, -2^-25, 1) to prevent underflow caused by -2^-25*x when + // |x| < 2^-125. For targets without FMA instructions, we simply use + // double for intermediate results as it is more efficient than using an + // emulated version of FMA. +#if defined(LIBC_TARGET_HAS_FMA) + return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f); +#else + return static_cast(fputil::multiply_add(xd, -0x1.0p-25, 1.0)); +#endif // LIBC_TARGET_HAS_FMA + } - if ((n + sign) & 2) - p = &SINCOSF_TABLE[1]; + using ExceptChecker = typename fputil::ExceptionChecker; + { + float result; + if (ExceptChecker::check_odd_func(CosfExcepts, x_abs, false, result)) + return result; + } - return sinf_poly(x * s, x * x, p, n ^ 1); + // TODO(lntue): refactor range reduction and most of polynomial approximation + // to share between sinf, cosf, and sincosf. + int k; + double y; + + if (likely(x_abs < FAST_PASS_BOUND)) { + k = small_range_reduction(xd, y); + } else { + // x is inf or nan. + if (unlikely(x_abs >= 0x7f80'0000U)) { + if (x_abs == 0x7f80'0000U) { + errno = EDOM; + fputil::set_except(FE_INVALID); + } + return x + + FPBits::build_nan(1 << (fputil::MantissaWidth::VALUE - 1)); + } + + k = large_range_reduction(xd, xbits.get_exponent(), y); } - return invalid(y); + // After range reduction, k = round(x * 16 / pi) and y = (x * 16 / pi) - k. + // So k is an integer and -0.5 <= y <= 0.5. + // Then cos(x) = cos((k + y)*pi/16) + // = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16) + + double ysq = y * y; + + // Degree-6 minimax even polynomial for sin(y*pi/16)/y generated by Sollya + // with: + // > Q = fpminimax(sin(y*pi/16)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]); + double sin_y = + y * fputil::polyeval(ysq, 0x1.921fb54442d17p-3, -0x1.4abbce6256adp-10, + 0x1.466bc5a5ac6b3p-19, -0x1.32bdcb4207562p-29); + // Degree-8 minimax even polynomial for cos(y*pi/16) generated by Sollya with: + // > P = fpminimax(cos(x*pi/16), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 0.5]); + // Note that cosm1_y = cos(y*pi/16) - 1. + double cosm1_y = + ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be45dcp-6, 0x1.03c1f081b08ap-14, + -0x1.55d3c6fb0fb6ep-24, 0x1.e1d3d60f58873p-35); + + double sin_k = -SIN_K_PI_OVER_16[k & 31]; + // cos(k * pi/16) = sin(k * pi/16 + pi/2) = sin((k + 8) * pi/16). + // cos_k = y * cos(k * pi/16) + double cos_k = SIN_K_PI_OVER_16[(k + 8) & 31]; + + // Combine the results with the sine of sum formula: + // cos(x) = cos((k + y)*pi/16) + // = cos(y*pi/16) * cos(k*pi/16) - sin(y*pi/16) * sin(k*pi/16) + // = cosm1_y * cos_k + sin_y * sin_k + // = (cosm1_y * cos_k + cos_k) + sin_y * sin_k + return fputil::multiply_add(sin_y, sin_k, + fputil::multiply_add(cosm1_y, cos_k, cos_k)); } } // namespace __llvm_libc diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -51,21 +51,65 @@ float x = float(FPBits(v)); if (isnan(x) || isinf(x)) continue; - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), 1.0); + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, + __llvm_libc::cosf(x), 0.5); } } -// For small values, cos(x) is 1. -TEST(LlvmLibcCosfTest, SmallValues) { - float x = float(FPBits(0x17800000U)); - float result = __llvm_libc::cosf(x); - EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result, 1.0); - EXPECT_FP_EQ(1.0f, result); +TEST(LlvmLibcCosfTest, SpecificBitPatterns) { + constexpr int N = 42; + constexpr uint32_t INPUTS[N] = { + 0x3f06'0a92U, // x = pi/6 + 0x3f3a'dc51U, // x = 0x1.75b8a2p-1f + 0x3f49'0fdbU, // x = pi/4 + 0x3f86'0a92U, // x = pi/3 + 0x3fa7'832aU, // x = 0x1.4f0654p+0f + 0x3fc9'0fdbU, // x = pi/2 + 0x4017'1973U, // x = 0x1.2e32e6p+1f + 0x4049'0fdbU, // x = pi + 0x4096'cbe4U, // x = 0x1.2d97c8p+2f + 0x40c9'0fdbU, // x = 2*pi + 0x433b'7490U, // x = 0x1.76e92p+7f + 0x437c'e5f1U, // x = 0x1.f9cbe2p+7f + 0x4619'9998U, // x = 0x1.33333p+13f + 0x474d'246fU, // x = 0x1.9a48dep+15f + 0x4afd'ece4U, // x = 0x1.fbd9c8p+22f + 0x4c23'32e9U, // x = 0x1.4665d2p+25f + 0x50a3'e87fU, // x = 0x1.47d0fep+34f + 0x5239'47f6U, // x = 0x1.728fecp+37f + 0x53b1'46a6U, // x = 0x1.628d4cp+40f + 0x5532'5019U, // x = 0x1.64a032p+43f + 0x55ca'fb2aU, // x = 0x1.95f654p+44f + 0x588e'f060U, // x = 0x1.1de0cp+50f + 0x5922'aa80U, // x = 0x1.4555p+51f + 0x5aa4'542cU, // x = 0x1.48a858p+54f + 0x5c07'bcd0U, // x = 0x1.0f79ap+57f + 0x5ebc'fddeU, // x = 0x1.79fbbcp+62f + 0x5f18'b878U, // x = 0x1.3170fp+63f + 0x5fa6'eba7U, // x = 0x1.4dd74ep+64f + 0x6115'cb11U, // x = 0x1.2b9622p+67f + 0x61a4'0b40U, // x = 0x1.48168p+68f + 0x6386'134eU, // x = 0x1.0c269cp+72f + 0x6589'8498U, // x = 0x1.13093p+76f + 0x6600'0001U, // x = 0x1.000002p+77f + 0x664e'46e4U, // x = 0x1.9c8dc8p+77f + 0x66b0'14aaU, // x = 0x1.602954p+78f + 0x67a9'242bU, // x = 0x1.524856p+80f + 0x6a19'76f1U, // x = 0x1.32ede2p+85f + 0x6c55'da58U, // x = 0x1.abb4bp+89f + 0x6f79'be45U, // x = 0x1.f37c8ap+95f + 0x7276'69d4U, // x = 0x1.ecd3a8p+101f + 0x7758'4625U, // x = 0x1.b08c4ap+111f + 0x7bee'f5efU, // x = 0x1.ddebdep+120f + }; - x = float(FPBits(0x0040000U)); - result = __llvm_libc::cosf(x); - EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, result, 1.0); - EXPECT_FP_EQ(1.0f, result); + for (int i = 0; i < N; ++i) { + float x = float(FPBits(INPUTS[i])); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, + __llvm_libc::cosf(x), 0.5); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, -x, + __llvm_libc::cosf(-x), 0.5); + } } // SDCOMP-26094: check cosf in the cases for which the range reducer @@ -73,6 +117,6 @@ TEST(LlvmLibcCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { float x = float(FPBits(v)); - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), 1.0); + ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), 0.5); } } diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt --- a/libc/test/src/math/exhaustive/CMakeLists.txt +++ b/libc/test/src/math/exhaustive/CMakeLists.txt @@ -40,15 +40,19 @@ add_fp_unittest( cosf_test + NO_RUN_POSTBUILD NEED_MPFR SUITE libc_math_exhaustive_tests SRCS cosf_test.cpp DEPENDS + .exhaustive_test libc.include.math libc.src.math.cosf libc.src.__support.FPUtil.fputil + LINK_LIBRARIES + -lpthread ) add_fp_unittest( diff --git a/libc/test/src/math/exhaustive/cosf_test.cpp b/libc/test/src/math/exhaustive/cosf_test.cpp --- a/libc/test/src/math/exhaustive/cosf_test.cpp +++ b/libc/test/src/math/exhaustive/cosf_test.cpp @@ -6,20 +6,71 @@ // //===----------------------------------------------------------------------===// +#include "exhaustive_test.h" #include "src/__support/FPUtil/FPBits.h" #include "src/math/cosf.h" #include "utils/MPFRWrapper/MPFRUtils.h" -#include +#include "utils/UnitTest/FPMatcher.h" + +#include using FPBits = __llvm_libc::fputil::FPBits; namespace mpfr = __llvm_libc::testing::mpfr; -TEST(LlvmLibccosffExhaustiveTest, AllValues) { - uint32_t bits = 0; - do { - FPBits xbits(bits); - float x = float(xbits); - ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), 1.0); - } while (bits++ < 0xffff'ffffU); +struct LlvmLibcCosfExhaustiveTest : public LlvmLibcExhaustiveTest { + bool check(uint32_t start, uint32_t stop, + mpfr::RoundingMode rounding) override { + mpfr::ForceRoundingMode r(rounding); + uint32_t bits = start; + bool result = true; + do { + FPBits xbits(bits); + float x = float(xbits); + bool r = EXPECT_MPFR_MATCH(mpfr::Operation::Cos, x, __llvm_libc::cosf(x), + 0.5, rounding); + result &= r; + } while (++bits < stop); + return result; + } +}; + +// Range: [0, +Inf); +static constexpr uint32_t POS_START = 0x0000'0000U; +static constexpr uint32_t POS_STOP = 0x7f80'0000U; + +TEST_F(LlvmLibcCosfExhaustiveTest, PostiveRangeRoundNearestTieToEven) { + test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Nearest); +} + +TEST_F(LlvmLibcCosfExhaustiveTest, PostiveRangeRoundUp) { + test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Upward); +} + +TEST_F(LlvmLibcCosfExhaustiveTest, PostiveRangeRoundDown) { + test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::Downward); } + +TEST_F(LlvmLibcCosfExhaustiveTest, PostiveRangeRoundTowardZero) { + test_full_range(POS_START, POS_STOP, mpfr::RoundingMode::TowardZero); +} + +// // Range: (-Inf, 0]; +// static constexpr uint32_t NEG_START = 0x8000'0000U; +// static constexpr uint32_t NEG_STOP = 0xff80'0000U; + +// TEST_F(LlvmLibcSinfExhaustiveTest, NegativeRangeRoundNearestTieToEven) { +// test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Nearest); +// } + +// TEST_F(LlvmLibcSinfExhaustiveTest, NegativeRangeRoundUp) { +// test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Upward); +// } + +// TEST_F(LlvmLibcSinfExhaustiveTest, NegativeRangeRoundDown) { +// test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::Downward); +// } + +// TEST_F(LlvmLibcSinfExhaustiveTest, NegativeRangeRoundTowardZero) { +// test_full_range(NEG_START, NEG_STOP, mpfr::RoundingMode::TowardZero); +// } diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -636,8 +636,11 @@ libc_math_function( name = "cosf", additional_deps = [ - ":math_utils", - ":sincosf_utils", + ":__support_fputil_fma", + ":__support_fputil_multiply_add", + ":__support_fputil_polyeval", + ":common_constants", + ":range_reduction", ], )