diff --git a/libc/docs/math.rst b/libc/docs/math.rst --- a/libc/docs/math.rst +++ b/libc/docs/math.rst @@ -223,7 +223,7 @@ +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ | log2f | 13 | 10 | 57 | 46 | :math:`[e^{-1}, e]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ -| sinf | 14 | 26 | 65 | 59 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +| sinf | 13 | 25 | 54 | 57 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ References diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -79,6 +79,7 @@ range_reduction.h range_reduction_fma.h DEPENDS + .common_constants libc.include.math libc.src.errno.errno libc.src.__support.FPUtil.fputil diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h --- a/libc/src/math/generic/common_constants.h +++ b/libc/src/math/generic/common_constants.h @@ -31,6 +31,12 @@ // > for i from 0 to 127 do { D(exp(i / 128)); }; extern const double EXP_M2[128]; +// Lookup table for sin(k * pi / 16) with k = 0, ..., 31. +// Table is generated with Sollya as follow: +// > display = hexadecimal; +// > for k from 0 to 31 do { D(sin(k * pi/16)); }; +extern const double SIN_K_PI_OVER_16[32]; + } // namespace __llvm_libc #endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/math/generic/common_constants.cpp --- a/libc/src/math/generic/common_constants.cpp +++ b/libc/src/math/generic/common_constants.cpp @@ -226,4 +226,21 @@ 0x1.568bb722dd593p1, 0x1.593b7d72305bbp1, }; +// Lookup table for sin(k * pi / 16) with k = 0, ..., 31. +// Table is generated with Sollya as follow: +// > display = hexadecimal; +// > for k from 0 to 31 do { D(sin(k * pi/16)); }; +const double SIN_K_PI_OVER_16[32] = { + 0x0.0000000000000p+0, 0x1.8f8b83c69a60bp-3, 0x1.87de2a6aea963p-2, + 0x1.1c73b39ae68c8p-1, 0x1.6a09e667f3bcdp-1, 0x1.a9b66290ea1a3p-1, + 0x1.d906bcf328d46p-1, 0x1.f6297cff75cb0p-1, 0x1.0000000000000p+0, + 0x1.f6297cff75cb0p-1, 0x1.d906bcf328d46p-1, 0x1.a9b66290ea1a3p-1, + 0x1.6a09e667f3bcdp-1, 0x1.1c73b39ae68c8p-1, 0x1.87de2a6aea963p-2, + 0x1.8f8b83c69a60bp-3, 0x0.0000000000000p+0, -0x1.8f8b83c69a60bp-3, + -0x1.87de2a6aea963p-2, -0x1.1c73b39ae68c8p-1, -0x1.6a09e667f3bcdp-1, + -0x1.a9b66290ea1a3p-1, -0x1.d906bcf328d46p-1, -0x1.f6297cff75cb0p-1, + -0x1.0000000000000p+0, -0x1.f6297cff75cb0p-1, -0x1.d906bcf328d46p-1, + -0x1.a9b66290ea1a3p-1, -0x1.6a09e667f3bcdp-1, -0x1.1c73b39ae68c8p-1, + -0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3}; + } // namespace __llvm_libc diff --git a/libc/src/math/generic/range_reduction.h b/libc/src/math/generic/range_reduction.h --- a/libc/src/math/generic/range_reduction.h +++ b/libc/src/math/generic/range_reduction.h @@ -18,110 +18,85 @@ namespace generic { -static constexpr uint32_t FAST_PASS_BOUND = 0x4c80'0000U; // 2^26 +static constexpr uint32_t FAST_PASS_BOUND = 0x4a80'0000U; // 2^22 static constexpr int N_ENTRIES = 8; -// We choose to split bits of 1/pi into 28-bit precision pieces, so that the -// product of x * ONE_OVER_PI_28[i] is exact. +// We choose to split bits of 16/pi into 28-bit precision pieces, so that the +// product of x * SIXTEEN_OVER_PI_28[i] is exact. // These are generated by Sollya with: -// > a1 = D(round(1/pi, 28, RN)); a1; -// > a2 = D(round(1/pi - a1, 28, RN)); a2; -// > a3 = D(round(1/pi - a1 - a2, 28, RN)); a3; -// > a4 = D(round(1/pi - a1 - a2 - a3, 28, RN)); a4; +// > a1 = D(round(16/pi, 28, RN)); a1; +// > a2 = D(round(16/pi - a1, 28, RN)); a2; +// > a3 = D(round(16/pi - a1 - a2, 28, RN)); a3; +// > a4 = D(round(16/pi - a1 - a2 - a3, 28, RN)); a4; // ... -static constexpr double ONE_OVER_PI_28[N_ENTRIES] = { - 0x1.45f306ep-2, -0x1.b1bbeaep-33, 0x1.3f84ebp-62, -0x1.7056592p-92, - 0x1.c0db62ap-121, -0x1.4cd8778p-150, -0x1.bef806cp-179, 0x1.63abdecp-209}; +static constexpr double SIXTEEN_OVER_PI_28[N_ENTRIES] = { + 0x1.45f306ep+2, -0x1.b1bbeaep-29, 0x1.3f84ebp-58, -0x1.7056592p-88, + 0x1.c0db62ap-117, -0x1.4cd8778p-146, -0x1.bef806cp-175, 0x1.63abdecp-205}; // Exponents of the least significant bits of the corresponding entries in -// ONE_OVER_PI_28. -static constexpr int ONE_OVER_PI_28_LSB_EXP[N_ENTRIES] = { - -29, -60, -86, -119, -148, -175, -205, -235}; +// SIXTEEN_OVER_PI_28. +static constexpr int SIXTEEN_OVER_PI_28_LSB_EXP[N_ENTRIES] = { + -25, -56, -82, -115, -144, -171, -201, -231}; -// Return (k mod 2) and y, where -// k = round(x / pi) and y = (x / pi) - k. +// Return k and y, where +// k = round(x * 16 / pi) and y = (x * 16 / pi) - k. static inline int64_t small_range_reduction(double x, double &y) { - double prod = x * ONE_OVER_PI_28[0]; + double prod = x * SIXTEEN_OVER_PI_28[0]; double kd = fputil::nearest_integer(prod); y = prod - kd; - y = fputil::multiply_add(x, ONE_OVER_PI_28[1], y); - y = fputil::multiply_add(x, ONE_OVER_PI_28[2], y); + y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[1], y); + y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[2], y); return static_cast(kd); } // Return k and y, where -// k = round(x / pi) and y = (x / pi) - k. -// For large range, there are at most 2 parts of ONE_OVER_PI_28 contributing to -// the unit binary digit (k & 1). If the least significant bit of x * the least -// significant bit of ONE_OVER_PI_28[i] > 1, we can completely ignore -// ONE_OVER_PI_28[i]. +// k = round(x * 16 / pi) and y = (x * 16 / pi) - k. +// For large range, there are at most 2 parts of SIXTEEN_OVER_PI_28 contributing +// to the lowest 5 binary digits (k & 31). If the least significant bit of +// x * the least significant bit of SIXTEEN_OVER_PI_28[i] >= 32, we can +// completely ignore SIXTEEN_OVER_PI_28[i]. static inline int64_t large_range_reduction(double x, int x_exp, double &y) { int idx = 0; y = 0; - int x_lsb_exp = x_exp - fputil::FloatProperties::MANTISSA_WIDTH; + int x_lsb_exp_m4 = x_exp - fputil::FloatProperties::MANTISSA_WIDTH; - // Skipping the first parts of 1/pi such that: - // LSB of x * LSB of ONE_OVER_PI_28[i] > 1. - while (x_lsb_exp + ONE_OVER_PI_28_LSB_EXP[idx] > 0) + // Skipping the first parts of 16/pi such that: + // LSB of x * LSB of SIXTEEN_OVER_PI_28[i] >= 32. + while (x_lsb_exp_m4 + SIXTEEN_OVER_PI_28_LSB_EXP[idx] > 4) ++idx; - double prod_hi = x * ONE_OVER_PI_28[idx]; - // Get the integral part of x * ONE_OVER_PI_28[idx] + double prod_hi = x * SIXTEEN_OVER_PI_28[idx]; + // Get the integral part of x * SIXTEEN_OVER_PI_28[idx] double k_hi = fputil::nearest_integer(prod_hi); - // Get the fractional part of x * ONE_OVER_PI_28[idx] + // Get the fractional part of x * SIXTEEN_OVER_PI_28[idx] double frac = prod_hi - k_hi; - double prod_lo = fputil::multiply_add(x, ONE_OVER_PI_28[idx + 1], frac); + double prod_lo = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[idx + 1], frac); double k_lo = fputil::nearest_integer(prod_lo); // Now y is the fractional parts. y = prod_lo - k_lo; - y = fputil::multiply_add(x, ONE_OVER_PI_28[idx + 2], y); - y = fputil::multiply_add(x, ONE_OVER_PI_28[idx + 3], y); + y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[idx + 2], y); + y = fputil::multiply_add(x, SIXTEEN_OVER_PI_28[idx + 3], y); - return static_cast(k_hi + k_lo); + return static_cast(k_hi) + static_cast(k_lo); } // Exceptional cases. -static constexpr int N_EXCEPT_SMALL = 4; +static constexpr int N_EXCEPTS = 3; -static constexpr fputil::ExceptionalValues SmallExcepts{ +static constexpr fputil::ExceptionalValues SinfExcepts{ /* inputs */ { 0x3fa7832a, // x = 0x1.4f0654p0 0x46199998, // x = 0x1.33333p13 - 0x4afdece4, // x = 0x1.fbd9c8p22 - 0x4c2332e9, // x = 0x1.4665d2p25 + 0x55cafb2a, // x = 0x1.95f654p44 }, /* outputs (RZ, RU offset, RD offset, RN offset) */ { {0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ) {0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ) - {0xbf7fb6e0, 0, 1, 1}, // x = 0x1.fbd9c8p22, sin(x) = -0x1.ff6dcp-1 (RZ) - {0xbf7fffff, 0, 1, - 1}, // x = 0x1.4665d2p25, sin(x) = -0x1.fffffep-1 (RZ) - }}; - -static constexpr int N_EXCEPT_LARGE = 5; - -static constexpr fputil::ExceptionalValues LargeExcepts{ - /* inputs */ { - 0x523947f6, // x = 0x1.728fecp37 - 0x53b146a6, // x = 0x1.628d4cp40 - 0x55cafb2a, // x = 0x1.95f654p44 - 0x6a1976f1, // x = 0x1.32ede2p85 - 0x77584625, // x = 0x1.b08c4ap111 - }, - /* outputs (RZ, RU offset, RD offset, RN offset) */ - { - {0xbf12791d, 0, 1, - 1}, // x = 0x1.728fecp37, sin(x) = -0x1.24f23ap-1 (RZ) - {0xbf7fffff, 0, 1, - 1}, // x = 0x1.628d4cp40, sin(x) = -0x1.fffffep-1 (RZ) {0xbf7e7a16, 0, 1, 1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ) - {0x3f7fffff, 1, 0, 1}, // x = 0x1.32ede2p85, sin(x) = 0x1.fffffep-1 (RZ) - {0xbf7fffff, 0, 1, - 1}, // x = 0x1.b08c4ap111, sin(x) = -0x1.fffffep-1 (RZ) }}; } // namespace generic diff --git a/libc/src/math/generic/range_reduction_fma.h b/libc/src/math/generic/range_reduction_fma.h --- a/libc/src/math/generic/range_reduction_fma.h +++ b/libc/src/math/generic/range_reduction_fma.h @@ -18,118 +18,85 @@ namespace fma { -static constexpr uint32_t FAST_PASS_BOUND = 0x5880'0000U; // 2^50 +static constexpr uint32_t FAST_PASS_BOUND = 0x5680'0000U; // 2^46 // Digits of 1/pi, generated by Sollya with: -// > a0 = D(1/pi); -// > a1 = D(1/pi - a0); -// > a2 = D(1/pi - a0 - a1); -// > a3 = D(1/pi - a0 - a1 - a2); -static constexpr double ONE_OVER_PI[5] = { - 0x1.45f306dc9c883p-2, -0x1.6b01ec5417056p-56, -0x1.6447e493ad4cep-110, - 0x1.e21c820ff28b2p-164, -0x1.508510ea79237p-219}; +// > a0 = D(16/pi); +// > a1 = D(16/pi - a0); +// > a2 = D(16/pi - a0 - a1); +// > a3 = D(16/pi - a0 - a1 - a2); +static constexpr double SIXTEEN_OVER_PI[5] = { + 0x1.45f306dc9c883p+2, -0x1.6b01ec5417056p-52, -0x1.6447e493ad4cep-106, + 0x1.e21c820ff28b2p-160, -0x1.508510ea79237p-215}; // Return k and y, where -// k = round(x / pi) and y = (x / pi) - k. +// k = round(x * 16 / pi) and y = (x * 16 / pi) - k. // Assume x is non-negative. static inline int64_t small_range_reduction(double x, double &y) { - double kd = fputil::nearest_integer(x * ONE_OVER_PI[0]); - y = fputil::fma(x, ONE_OVER_PI[0], -kd); - y = fputil::fma(x, ONE_OVER_PI[1], y); + double kd = fputil::nearest_integer(x * SIXTEEN_OVER_PI[0]); + y = fputil::fma(x, SIXTEEN_OVER_PI[0], -kd); + y = fputil::fma(x, SIXTEEN_OVER_PI[1], y); return static_cast(kd); } // Return k and y, where -// k = round(x / pi) and y = (x / pi) - k. +// k = round(x * 16 / pi) and y = (x * 16 / pi) - k. static inline int64_t large_range_reduction(double x, int x_exp, double &y) { - // 2^50 <= |x| < 2^104 - if (x_exp < 103) { - // - When x < 2^104, the unit bit is contained in the full exact product of - // x * ONE_OVER_PI[0]. - // - When 2^50 <= |x| < 2^55, the unit bit is contained - // in the last 8 bits of double(x * ONE_OVER_PI[0]). - // - When |x| >= 2^55, the LSB of double(x * ONE_OVER_PI[0]) is at least 2. - fputil::FPBits prod_hi(x * ONE_OVER_PI[0]); - prod_hi.bits &= (x_exp < 55) ? (~0xffULL) : (~0ULL); // |x| < 2^55 + // 2^46 <= |x| < 2^99 + if (x_exp < 99) { + // - When x < 2^99, the full exact product of x * SIXTEEN_OVER_PI[0] + // contains at least one integral bit <= 2^4. + // - When 2^46 <= |x| < 2^56, the lowest 5 unit bits are contained + // in the last 10 bits of double(x * SIXTEEN_OVER_PI[0]). + // - When |x| >= 2^56, the LSB of double(x * SIXTEEN_OVER_PI[0]) is at least + // 32. + fputil::FPBits prod_hi(x * SIXTEEN_OVER_PI[0]); + prod_hi.bits &= (x_exp < 56) ? (~0xfffULL) : (~0ULL); // |x| < 2^56 double k_hi = fputil::nearest_integer(static_cast(prod_hi)); - double truncated_prod = fputil::fma(x, ONE_OVER_PI[0], -k_hi); - double prod_lo = fputil::fma(x, ONE_OVER_PI[1], truncated_prod); + double truncated_prod = fputil::fma(x, SIXTEEN_OVER_PI[0], -k_hi); + double prod_lo = fputil::fma(x, SIXTEEN_OVER_PI[1], truncated_prod); double k_lo = fputil::nearest_integer(prod_lo); - y = fputil::fma(x, ONE_OVER_PI[1], truncated_prod - k_lo); - y = fputil::fma(x, ONE_OVER_PI[2], y); - y = fputil::fma(x, ONE_OVER_PI[3], y); + y = fputil::fma(x, SIXTEEN_OVER_PI[1], truncated_prod - k_lo); + y = fputil::fma(x, SIXTEEN_OVER_PI[2], y); + y = fputil::fma(x, SIXTEEN_OVER_PI[3], y); return static_cast(k_lo); } - // - When x >= 2^104, the full exact product of x * ONE_OVER_PI[0] does not - // contain the unit bit, so we can ignore it completely. - // - When 2^104 <= |x| < 2^109, the unit bit is contained - // in the last 8 bits of double(x * ONE_OVER_PI[1]). - // - When |x| >= 2^109, the LSB of double(x * ONE_OVER_PI[1]) is at least 2. - fputil::FPBits prod_hi(x * ONE_OVER_PI[1]); - prod_hi.bits &= (x_exp < 109) ? (~0xffULL) : (~0ULL); // |x| < 2^55 + // - When x >= 2^110, the full exact product of x * SIXTEEN_OVER_PI[0] does + // not contain any of the lowest 5 unit bits, so we can ignore it completely. + // - When 2^99 <= |x| < 2^110, the lowest 5 unit bits are contained + // in the last 12 bits of double(x * SIXTEEN_OVER_PI[1]). + // - When |x| >= 2^110, the LSB of double(x * SIXTEEN_OVER_PI[1]) is at + // least 32. + fputil::FPBits prod_hi(x * SIXTEEN_OVER_PI[1]); + prod_hi.bits &= (x_exp < 110) ? (~0xfffULL) : (~0ULL); // |x| < 2^110 double k_hi = fputil::nearest_integer(static_cast(prod_hi)); - double truncated_prod = fputil::fma(x, ONE_OVER_PI[1], -k_hi); - double prod_lo = fputil::fma(x, ONE_OVER_PI[2], truncated_prod); + double truncated_prod = fputil::fma(x, SIXTEEN_OVER_PI[1], -k_hi); + double prod_lo = fputil::fma(x, SIXTEEN_OVER_PI[2], truncated_prod); double k_lo = fputil::nearest_integer(prod_lo); - y = fputil::fma(x, ONE_OVER_PI[2], truncated_prod - k_lo); - y = fputil::fma(x, ONE_OVER_PI[3], y); - y = fputil::fma(x, ONE_OVER_PI[4], y); + y = fputil::fma(x, SIXTEEN_OVER_PI[2], truncated_prod - k_lo); + y = fputil::fma(x, SIXTEEN_OVER_PI[3], y); + y = fputil::fma(x, SIXTEEN_OVER_PI[4], y); return static_cast(k_lo); } // Exceptional cases. -static constexpr int N_EXCEPT_SMALL = 9; +static constexpr int N_EXCEPTS = 2; -static constexpr fputil::ExceptionalValues SmallExcepts{ +static constexpr fputil::ExceptionalValues SinfExcepts{ /* inputs */ { 0x3fa7832a, // x = 0x1.4f0654p0 - 0x40171973, // x = 0x1.2e32e6p1 - 0x4096cbe4, // x = 0x1.2d97c8p2 - 0x433b7490, // x = 0x1.76e92p7 - 0x437ce5f1, // x = 0x1.f9cbe2p7 - 0x46199998, // x = 0x1.33333p13 - 0x474d246f, // x = 0x1.9a48dep15 - 0x4afdece4, // x = 0x1.fbd9c8p22 0x55cafb2a, // x = 0x1.95f654p44 }, /* outputs (RZ, RU offset, RD offset, RN offset) */ { {0x3f7741b5, 1, 0, 1}, // x = 0x1.4f0654p0, sin(x) = 0x1.ee836ap-1 (RZ) - {0x3f34290f, 1, 0, 1}, // x = 0x1.2e32e6p1, sin(x) = 0x1.68521ep-1 (RZ) - {0xbf7fffff, 0, 1, 1}, // x = 0x1.2d97c8p2, sin(x) = -0x1.fffffep-1 (RZ) - {0xbf5cce62, 0, 1, 0}, // x = 0x1.76e92p7, sin(x) = -0x1.b99cc4p-1 (RZ) - {0x3f7fffff, 1, 0, 1}, // x = 0x1.f9cbe2p7, sin(x) = 0x1.fffffep-1 (RZ) - {0xbeb1fa5d, 0, 1, 0}, // x = 0x1.33333p13, sin(x) = -0x1.63f4bap-2 (RZ) - {0x3f7fffff, 1, 0, 1}, // x = 0x1.9a48dep15, sin(x) = 0x1.fffffep-1 (RZ) - {0xbf7fb6e0, 0, 1, 1}, // x = 0x1.fbd9c8p22, sin(x) = -0x1.ff6dcp-1 (RZ) {0xbf7e7a16, 0, 1, 1}, // x = 0x1.95f654p44, sin(x) = -0x1.fcf42cp-1 (RZ) }}; -static constexpr int N_EXCEPT_LARGE = 5; - -static constexpr fputil::ExceptionalValues LargeExcepts{ - /* inputs */ { - 0x5ebcfdde, // x = 0x1.79fbbcp62 - 0x5fa6eba7, // x = 0x1.4dd74ep64 - 0x6386134e, // x = 0x1.0c269cp72 - 0x6a1976f1, // x = 0x1.32ede2p85 - 0x727669d4, // x = 0x1.ecd3a8p101 - }, - /* outputs (RZ, RU offset, RD offset, RN offset) */ - { - {0x3f50622d, 1, 0, 0}, // x = 0x1.79fbbcp62, sin(x) = 0x1.a0c45ap-1 (RZ) - {0xbe52464a, 0, 1, - 0}, // x = 0x1.4dd74ep64, sin(x) = -0x1.a48c94p-3 (RZ) - {0x3f7cb2e7, 1, 0, 0}, // x = 0x1.0c269cp72, sin(x) = 0x1.f965cep-1 (RZ) - {0x3f7fffff, 1, 0, 1}, // x = 0x1.32ede2p85, sin(x) = 0x1.fffffep-1 (RZ) - {0xbf7a781d, 0, 1, - 0}, // x = 0x1.ecd3a8p101, sin(x) = -0x1.f4f038p-1 (RZ) - }}; - } // namespace fma } // namespace __llvm_libc diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp --- a/libc/src/math/generic/sinf.cpp +++ b/libc/src/math/generic/sinf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/sinf.h" +#include "common_constants.h" #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" @@ -22,21 +23,17 @@ // using namespace __llvm_libc::fma; using __llvm_libc::fma::FAST_PASS_BOUND; using __llvm_libc::fma::large_range_reduction; -using __llvm_libc::fma::LargeExcepts; -using __llvm_libc::fma::N_EXCEPT_LARGE; -using __llvm_libc::fma::N_EXCEPT_SMALL; +using __llvm_libc::fma::N_EXCEPTS; +using __llvm_libc::fma::SinfExcepts; using __llvm_libc::fma::small_range_reduction; -using __llvm_libc::fma::SmallExcepts; #else #include "range_reduction.h" // using namespace __llvm_libc::generic; using __llvm_libc::generic::FAST_PASS_BOUND; using __llvm_libc::generic::large_range_reduction; -using __llvm_libc::generic::LargeExcepts; -using __llvm_libc::generic::N_EXCEPT_LARGE; -using __llvm_libc::generic::N_EXCEPT_SMALL; +using __llvm_libc::generic::N_EXCEPTS; +using __llvm_libc::generic::SinfExcepts; using __llvm_libc::generic::small_range_reduction; -using __llvm_libc::generic::SmallExcepts; #endif namespace __llvm_libc { @@ -47,26 +44,26 @@ uint32_t x_u = xbits.uintval(); uint32_t x_abs = x_u & 0x7fff'ffffU; - double xd, y; + double xd = static_cast(x); // Range reduction: // For |x| > pi/16, we perform range reduction as follows: // Find k and y such that: - // x = (k + y) * pi + // x = (k + y) * pi/16 // k is an integer // |y| < 0.5 - // For small range (|x| < 2^50 when FMA instructions are available, 2^26 + // For small range (|x| < 2^46 when FMA instructions are available, 2^22 // otherwise), this is done by performing: - // k = round(x * 1/pi) - // y = x * 1/pi - k - // For large range, we will omit all the higher parts of 1/pi such that the - // least significant bits of their full products with x are larger than 1, - // since sin(x + i * 2pi) = sin(x). + // k = round(x * 16/pi) + // y = x * 16/pi - k + // For large range, we will omit all the higher parts of 16/pi such that the + // least significant bits of their full products with x are larger than 31, + // since sin((k + y + 32*i) * pi/16) = sin(x + i * 2pi) = sin(x). // - // When FMA instructions are not available, we store the digits of 1/pi in + // When FMA instructions are not available, we store the digits of 16/pi in // chunks of 28-bit precision. This will make sure that the products: - // x * ONE_OVER_PI_28[i] are all exact. - // When FMA instructions are available, we simply store the digits of 1/pi in + // x * SIXTEEN_OVER_PI_28[i] are all exact. + // When FMA instructions are available, we simply store the digits of 16/pi in // chunks of doubles (53-bit of precision). // So when multiplying by the largest values of single precision, the // resulting output should be correct up to 2^(-208 + 128) ~ 2^-80. By the @@ -79,20 +76,18 @@ // // Once k and y are computed, we then deduce the answer by the sine of sum // formula: - // sin(x) = sin((k + y)*pi) - // = sin(y*pi) * cos(k*pi) + cos(y*pi) * sin(k*pi) - // = (-1)^(k & 1) * sin(y*pi) - // ~ (-1)^(k & 1) * y * P(y^2) - // where y*P(y^2) is a degree-15 minimax polynomial generated by Sollya - // with: > Q = fpminimax(sin(x*pi)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], - // [|D...|], [0, 0.5]); + // sin(x) = sin((k + y)*pi/16) + // = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16) + // The values of sin(k*pi/16) and cos(k*pi/16) for k = 0..31 are precomputed + // and stored using a vector of 32 doubles. Sin(y*pi/16) and cos(y*pi/16) are + // computed using degree-7 and degree-8 minimax polynomials generated by + // Sollya respectively. // |x| <= pi/16 - if (x_abs <= 0x3e49'0fdbU) { - xd = static_cast(x); + if (unlikely(x_abs <= 0x3e49'0fdbU)) { // |x| < 0x1.d12ed2p-12f - if (x_abs < 0x39e8'9769U) { + if (unlikely(x_abs < 0x39e8'9769U)) { if (unlikely(x_abs == 0U)) { // For signed zeros. return x; @@ -140,21 +135,18 @@ return xd * result; } - bool x_sign = xbits.get_sign(); - - int64_t k; - xd = static_cast(x); + using ExceptChecker = typename fputil::ExceptionChecker; + { + float result; + if (ExceptChecker::check_odd_func(SinfExcepts, x_abs, xbits.get_sign(), + result)) + return result; + } - if (x_abs < FAST_PASS_BOUND) { - using ExceptChecker = - typename fputil::ExceptionChecker; - { - float result; - if (ExceptChecker::check_odd_func(SmallExcepts, x_abs, x_sign, result)) { - return result; - } - } + int k; + double y; + if (likely(x_abs < FAST_PASS_BOUND)) { k = small_range_reduction(xd, y); } else { // x is inf or nan. @@ -167,37 +159,41 @@ FPBits::build_nan(1 << (fputil::MantissaWidth::VALUE - 1)); } - using ExceptChecker = - typename fputil::ExceptionChecker; - { - float result; - if (ExceptChecker::check_odd_func(LargeExcepts, x_abs, x_sign, result)) - return result; - } - k = large_range_reduction(xd, xbits.get_exponent(), y); } - // After range reduction, k = round(x / pi) and y = (x/pi) - k. + // After range reduction, k = round(x * 16 / pi) and y = (x * 16 / pi) - k. // So k is an integer and -0.5 <= y <= 0.5. - // Then sin(x) = sin(y*pi + k*pi) - // = (-1)^(k & 1) * sin(y*pi) - // ~ (-1)^(k & 1) * y * P(y^2) - // where y*P(y^2) is a degree-15 minimax polynomial generated by Sollya - // with: > P = fpminimax(sin(x*pi)/x, [|0, 2, 4, 6, 8, 10, 12, 14|], - // [|D...|], [0, 0.5]); - - constexpr double SIGN[2] = {1.0, -1.0}; + // Then sin(x) = sin((k + y)*pi/16) + // = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16) double ysq = y * y; - double result = - y * fputil::polyeval(ysq, 0x1.921fb54442d17p1, -0x1.4abbce625bd4bp2, - 0x1.466bc67750a3fp1, -0x1.32d2cce1612b5p-1, - 0x1.507832417bce6p-4, -0x1.e3062119b6071p-8, - 0x1.e89c7aa14122dp-12, -0x1.625b1709dece6p-16); - - return SIGN[k & 1] * result; - // } + + // Degree-6 minimax even polynomial for sin(y*pi/16)/y generated by Sollya + // with: + // > Q = fpminimax(sin(y*pi/16)/y, [|0, 2, 4, 6|], [|D...|], [0, 0.5]); + double sin_y = + fputil::polyeval(ysq, 0x1.921fb54442d17p-3, -0x1.4abbce6256adp-10, + 0x1.466bc5a5ac6b3p-19, -0x1.32bdcb4207562p-29); + // Degree-8 minimax even polynomial for cos(y*pi/16) generated by Sollya with: + // > P = fpminimax(cos(x*pi/16), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 0.5]); + // Note that cosm1_y = cos(y*pi/16) - 1. + double cosm1_y = + ysq * fputil::polyeval(ysq, -0x1.3bd3cc9be45dcp-6, 0x1.03c1f081b08ap-14, + -0x1.55d3c6fb0fb6ep-24, 0x1.e1d3d60f58873p-35); + + double sin_k = SIN_K_PI_OVER_16[k & 31]; + // cos(k * pi/16) = sin(k * pi/16 + pi/2) = sin((k + 8) * pi/16). + // cos_k = y * cos(k * pi/16) + double cos_k = y * SIN_K_PI_OVER_16[(k + 8) & 31]; + + // Combine the results with the sine of sum formula: + // sin(x) = sin((k + y)*pi/16) + // = sin(y*pi/16) * cos(k*pi/16) + cos(y*pi/16) * sin(k*pi/16) + // = sin_y * cos_k + (1 + cosm1_y) * sin_k + // = sin_y * cos_k + (cosm1_y * sin_k + sin_k) + return fputil::multiply_add(sin_y, cos_k, + fputil::multiply_add(cosm1_y, sin_k, sin_k)); } } // namespace __llvm_libc diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -655,6 +655,7 @@ ":__support_fputil_fma", ":__support_fputil_multiply_add", ":__support_fputil_polyeval", + ":common_constants", ":range_reduction", ], )