diff --git a/libc/docs/math.rst b/libc/docs/math.rst --- a/libc/docs/math.rst +++ b/libc/docs/math.rst @@ -215,7 +215,7 @@ +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ | cosf | 13 | 32 | 53 | 59 | :math:`[0, 2\pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ -| coshf | 23 | 20 | 73 | 49 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +| coshf | 15 | 20 | 51 | 48 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ | expf | 9 | 7 | 44 | 38 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ @@ -245,7 +245,7 @@ +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ | sincosf | 19 | 30 | 57 | 68 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ -| sinhf | 23 | 64 | 73 | 141 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +| sinhf | 14 | 63 | 49 | 137 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ | tanf | 19 | 50 | 82 | 107 | :math:`[-\pi, \pi]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA | +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+ diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -549,6 +549,7 @@ HDRS ../exp2f.h DEPENDS + .explogxf libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -39,16 +39,13 @@ return x + FPBits::inf().get_val(); } - auto ep_p = exp_eval<-1>(x); - auto ep_m = exp_eval<-1>(-x); - // 0.5 * exp(x) = ep_p.mult_exp * (ep_p.r + 1) - // = ep_p.mult_exp * ep_p.r + ep_p.mult_exp - // 0.5 * exp(-x) = ep_m.mult_exp * (ep_m.r + 1) - // = ep_m.mult_exp * ep_m.r + ep_m.mult_exp - // cos(x) = 0.5 * exp(x) + 0.5 * expm1(-x) - double ep = fputil::multiply_add(ep_p.mult_exp, ep_p.r, ep_p.mult_exp) + - fputil::multiply_add(ep_m.mult_exp, ep_m.r, ep_m.mult_exp); - return ep; + + // TODO: We should be able to reduce the latency and reciprocal throughput + // further by using a low degree (maybe 3-7 ?) minimax polynomial for small + // but not too small inputs, such as |x| < 2^-2, or |x| < 2^-3. + + // cosh(x) = (e^x + e^(-x)) / 2. + return exp_pm_eval(x); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/exp2f.cpp b/libc/src/math/generic/exp2f.cpp --- a/libc/src/math/generic/exp2f.cpp +++ b/libc/src/math/generic/exp2f.cpp @@ -16,23 +16,14 @@ #include +#include "explogxf.h" + namespace __llvm_libc { constexpr uint32_t exval1 = 0x3b42'9d37U; constexpr uint32_t exval2 = 0xbcf3'a937U; constexpr uint32_t exval_mask = exval1 & exval2; -// Look up table for bit fields of 2^(i/16) for i = 0..15, generated by Sollya -// with: -// > for i from 0 to 15 do printdouble(round(2^(i/16), D, RN)); -constexpr int64_t EXP_2_M[16] = { - 0x3ff0000000000000, 0x3ff0b5586cf9890f, 0x3ff172b83c7d517b, - 0x3ff2387a6e756238, 0x3ff306fe0a31b715, 0x3ff3dea64c123422, - 0x3ff4bfdad5362a27, 0x3ff5ab07dd485429, 0x3ff6a09e667f3bcd, - 0x3ff7a11473eb0187, 0x3ff8ace5422aa0db, 0x3ff9c49182a3f090, - 0x3ffae89f995ad3ad, 0x3ffc199bdd85529c, 0x3ffd5818dcfba487, - 0x3ffea4afa2a490da}; - LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { using FPBits = typename fputil::FPBits; FPBits xbits(x); @@ -101,7 +92,7 @@ // 2^mid is stored in the lookup table EXP_2_M of 16 elements. // 2^lo is computed using a degree-6 minimax polynomial // generated by Sollya. - // We perform 2^hi * 2^lo by simply add hi to the exponent field + // We perform 2^hi * 2^mid by simply add hi to the exponent field // of 2^mid. // kf = (hi + mid) * 2^4 = round(x * 2^4) diff --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h --- a/libc/src/math/generic/explogxf.h +++ b/libc/src/math/generic/explogxf.h @@ -30,6 +30,17 @@ // printf("%.13a,\n", d[i]); extern const double EXP_2_POW[EXP_num_p]; +// Look up table for bit fields of 2^(i/16) for i = 0..15, generated by Sollya +// with: +// > for i from 0 to 15 do printdouble(round(2^(i/16), D, RN)); +inline constexpr int64_t EXP_2_M[16] = { + 0x3ff0000000000000, 0x3ff0b5586cf9890f, 0x3ff172b83c7d517b, + 0x3ff2387a6e756238, 0x3ff306fe0a31b715, 0x3ff3dea64c123422, + 0x3ff4bfdad5362a27, 0x3ff5ab07dd485429, 0x3ff6a09e667f3bcd, + 0x3ff7a11473eb0187, 0x3ff8ace5422aa0db, 0x3ff9c49182a3f090, + 0x3ffae89f995ad3ad, 0x3ffc199bdd85529c, 0x3ffd5818dcfba487, + 0x3ffea4afa2a490da}; + constexpr int LOG_P1_BITS = 6; constexpr int LOG_P1_SIZE = 1 << LOG_P1_BITS; @@ -54,11 +65,18 @@ // EXP_num_p Precise value of the constant is not needed. static constexpr double LN2_INV = 0x1.71547652b82fep+0 * EXP_num_p; +// log2(e) * 2^4 +static constexpr double LOG2_E_4 = 0x1.71547652b82fep+4; + // LN2_HIGH + LN2_LOW = ln(2) with precision higher than double(ln(2)) // Minus sign is to use FMA directly. static constexpr double LN2_HIGH = -0x1.62e42fefa0000p-1 / EXP_num_p; static constexpr double LN2_LOW = -0x1.cf79abc9e3b3ap-40 / EXP_num_p; +// -log(2) * 2^(-4) +static constexpr double M_LN2_4_HI = -0x1.62e42fefa0000p-5; +static constexpr double M_LN2_4_LO = -0x1.cf79abc9e3b3ap-44; + struct exe_eval_result_t { // exp(x) = 2^MULT_POWER2 * mult_exp * (r + 1.0) // where @@ -98,6 +116,106 @@ return {mult_e1, r}; } +// The function correctly calculates sinh(x) and cosh(x) by calculating exp(x) +// and exp(-x) simultaneously. +// To compute e^x, we perform the following range +// reduction: find hi, mid, lo such that: +// x = (hi + mid) * log(2) + lo, in which +// hi is an integer, +// 0 <= mid * 2^4 < 16 is an integer +// -2^(-5) <= lo * log2(e) <= 2^-5. +// In particular, +// hi + mid = round(x * log2(e) * 2^4) * 2^(-4). +// Then, +// e^x = 2^(hi + mid) * e^lo = 2^hi * 2^mid * e^lo. +// 2^mid is stored in the lookup table EXP_2_M of 16 elements. +// e^lo is computed using a degree-6 minimax polynomial +// generated by Sollya: +// e^lo ~ P(lo) = 1 + lo + c2 * lo^2 + ... + c6 * lo^6 +// = (1 + c2*lo^2 + c4*lo^4 + c6*lo^6) + lo * (1 + c3*lo^2 + c5*lo^4) +// = P_even + lo * P_odd +// We perform 2^hi * 2^mid by simply add hi to the exponent field +// of 2^mid. +// To compute e^(-x), notice that: +// e^(-x) = 2^(-(hi + mid)) * e^(-lo) +// ~ 2^(-(hi + mid)) * P(-lo) +// = 2^(-(hi + mid)) * (P_even - lo * P_odd) +// So: +// sinh(x) = (e^x - e^(-x)) / 2 +// ~ 0.5 * (2^(hi + mid) * (P_even + lo * P_odd) - +// 2^(-(hi + mid)) * (P_even - lo * P_odd)) +// = 0.5 * (P_even * (2^(hi + mid) - 2^(-(hi + mid))) + +// lo * P_odd * (2^(hi + mid) + 2^(-(hi + mid)))) +// And similarly: +// cosh(x) = (e^x + e^(-x)) / 2 +// ~ 0.5 * (P_even * (2^(hi + mid) + 2^(-(hi + mid))) + +// lo * P_odd * (2^(hi + mid) - 2^(-(hi + mid)))) +// The main point of these formulas is that the expensive part of calculating +// the polynomials approximating lower parts of e^(x) and e^(-x) are shared +// and only done once. +template static inline double exp_pm_eval(float x) { + double xd = static_cast(x); + + // round(x * log2(e) * 2^4) + double kd = fputil::nearest_integer(LOG2_E_4 * xd); + + // k_p = round(x * log2(e) * 2^4) + int k_p = static_cast(kd); + // k_m = round(-x * log2(e) * 2^4) + int k_m = -k_p; + + // hi = floor(kf * 2^(-4)) + // exp_hi = shift hi to the exponent field of double precision. + int64_t exp_hi_p = static_cast((k_p >> 4)) + << fputil::FloatProperties::MANTISSA_WIDTH; + int64_t exp_hi_m = static_cast((k_m >> 4)) + << fputil::FloatProperties::MANTISSA_WIDTH; + // mh = 2^hi * 2^mid + // mh_bits = bit field of mh + int64_t mh_bits_p = EXP_2_M[k_p & 15] + exp_hi_p; + int64_t mh_bits_m = EXP_2_M[k_m & 15] + exp_hi_m; + double mh_p = fputil::FPBits(uint64_t(mh_bits_p)).get_val(); + double mh_m = fputil::FPBits(uint64_t(mh_bits_m)).get_val(); + // mh_sum = 2^(hi + mid) + 2^(-(hi + mid)) + double mh_sum = mh_p + mh_m; + // mh_diff = 2^(hi + mid) - 2^(-(hi + mid)) + double mh_diff = mh_p - mh_m; + + // dx = lo = x - (hi + mid) * log(2) + double dx = fputil::multiply_add(kd, M_LN2_4_LO, + fputil::multiply_add(kd, M_LN2_4_HI, xd)); + double dx2 = dx * dx; + + // Polynomials generated by Sollya with: + // Q = fpminimax(expm1(x)/x, 5, [|1, D...|], [-1/32*log(2), 1/32*log(2)]); + // Then: + // e^lo ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[4] * dx^6. + constexpr double COEFFS[5] = {0x1.fffffffffffep-2, 0x1.55555554ad3f3p-3, + 0x1.55555557179cap-5, 0x1.111228f3478c9p-7, + 0x1.6c161beccc69dp-10}; + // c0 = 1 + COEFFS[0] * lo^2 + double c0 = fputil::multiply_add(dx2, COEFFS[0], 1.0); + // c1 = 1 + COEFFS[0] * lo^2 + double c1 = fputil::multiply_add(dx2, COEFFS[1], 1.0); + // c2 = COEFFS[2] + COEFFS[4] * lo^2 + double c2 = fputil::multiply_add(dx2, COEFFS[4], COEFFS[2]); + double dx4 = dx2 * dx2; + // P_even = c0 + c2 * lo^4 + // = (1 + COEFFS[0] * lo^2) + lo^4 * (COEFFS[2] + COEFFS[4] * lo^2) + // = 1 + COEFFS[0] * lo^2 + COEFFS[2] * lo^4 + COEFFS[4] * lo^6 + double p_even = fputil::multiply_add(dx4, c2, c0); + // P_odd = c1 + COEFFS[3] * lo^4 + // = 1 + COEFFS[1] * lo^2 + COEFFS[3] * lo^4 + double p_odd = fputil::multiply_add(dx4, COEFFS[3], c1); + + double r; + if constexpr (is_sinh) + r = fputil::multiply_add(dx * mh_sum, p_odd, p_even * mh_diff); + else + r = fputil::multiply_add(dx * mh_diff, p_odd, p_even * mh_sum); + return 0.5 * r; +} + // x should be positive, normal finite value inline static double log2_eval(double x) { using FPB = fputil::FPBits; diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ -66,19 +66,8 @@ return fputil::multiply_add(xdbl, pe, xdbl); } - // MULT_POWER2 = -1 - auto ep_p = exp_eval<-1>(x); // 0.5 * exp(x) - auto ep_m = exp_eval<-1>(-x); // 0.5 * exp(-x) - - // 0.5 * expm1(x) = ep_p.mult_exp * (ep_p.r + 1) - 0.5 - // = ep_p.mult_exp * ep_p.r + ep_p.mult_exp - 0.5 - // 0.5 * expm1(-x) = ep_m.mult_exp * (ep_m.r + 1) - 0.5 - // = ep_m.mult_exp * ep_m.r + ep_m.mult_exp - 0.5 - // sinh(x) = 0.5 * expm1(x) - 0.5 * expm1(-x) - // Using expm1 instead of exp improved precision around zero. - double ep = fputil::multiply_add(ep_p.mult_exp, ep_p.r, ep_p.mult_exp - 0.5) - - fputil::multiply_add(ep_m.mult_exp, ep_m.r, ep_m.mult_exp - 0.5); - return ep; + // sinh(x) = (e^x - e^(-x)) / 2. + return exp_pm_eval(x); } } // namespace __llvm_libc