diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -31,47 +31,28 @@ }; #endif // SPECIAL_X86_LONG_DOUBLE -template -static inline void normalize(int &exponent, - typename FPBits::UIntType &mantissa); - -template <> inline void normalize(int &exponent, uint32_t &mantissa) { - // Use binary search to shift the leading 1 bit. - // With MantissaWidth = 23, it will take - // ceil(log2(23)) = 5 steps checking the mantissa bits as followed: - // Step 1: 0000 0000 0000 XXXX XXXX XXXX - // Step 2: 0000 00XX XXXX XXXX XXXX XXXX - // Step 3: 000X XXXX XXXX XXXX XXXX XXXX - // Step 4: 00XX XXXX XXXX XXXX XXXX XXXX - // Step 5: 0XXX XXXX XXXX XXXX XXXX XXXX - constexpr int NSTEPS = 5; // = ceil(log2(MantissaWidth)) - constexpr uint32_t BOUNDS[NSTEPS] = {1 << 12, 1 << 18, 1 << 21, 1 << 22, - 1 << 23}; - constexpr int SHIFTS[NSTEPS] = {12, 6, 3, 2, 1}; - - for (int i = 0; i < NSTEPS; ++i) { - if (mantissa < BOUNDS[i]) { - exponent -= SHIFTS[i]; - mantissa <<= SHIFTS[i]; - } - } +// The following overloads are matched based on what is accepted by +// __builtin_clz* rather than using the exactly-sized aliases from stdint.h. +// This way, we can avoid making any assumptions about integer sizes and let the +// compiler match for us. +template static inline int clz(T val); +template <> inline int clz(unsigned int val) { + return __builtin_clz(val); +} +template <> inline int clz(unsigned long int val) { + return __builtin_clzl(val); +} +template <> inline int clz(unsigned long long int val) { + return __builtin_clzll(val); } -template <> inline void normalize(int &exponent, uint64_t &mantissa) { - // Use binary search to shift the leading 1 bit similar to float. - // With MantissaWidth = 52, it will take - // ceil(log2(52)) = 6 steps checking the mantissa bits. - constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth)) - constexpr uint64_t BOUNDS[NSTEPS] = {1ULL << 26, 1ULL << 39, 1ULL << 46, - 1ULL << 49, 1ULL << 51, 1ULL << 52}; - constexpr int SHIFTS[NSTEPS] = {27, 14, 7, 4, 2, 1}; - - for (int i = 0; i < NSTEPS; ++i) { - if (mantissa < BOUNDS[i]) { - exponent -= SHIFTS[i]; - mantissa <<= SHIFTS[i]; - } - } +template +static inline void normalize(int &exponent, + typename FPBits::UIntType &mantissa) { + const int shift = + clz(mantissa) - (8 * sizeof(mantissa) - 1 - MantissaWidth::VALUE); + exponent -= shift; + mantissa <<= shift; } #ifdef LONG_DOUBLE_IS_DOUBLE @@ -82,22 +63,11 @@ #elif !defined(SPECIAL_X86_LONG_DOUBLE) template <> inline void normalize(int &exponent, __uint128_t &mantissa) { - // Use binary search to shift the leading 1 bit similar to float. - // With MantissaWidth = 112, it will take - // ceil(log2(112)) = 7 steps checking the mantissa bits. - constexpr int NSTEPS = 7; // = ceil(log2(MantissaWidth)) - constexpr __uint128_t BOUNDS[NSTEPS] = { - __uint128_t(1) << 56, __uint128_t(1) << 84, __uint128_t(1) << 98, - __uint128_t(1) << 105, __uint128_t(1) << 109, __uint128_t(1) << 111, - __uint128_t(1) << 112}; - constexpr int SHIFTS[NSTEPS] = {57, 29, 15, 8, 4, 2, 1}; - - for (int i = 0; i < NSTEPS; ++i) { - if (mantissa < BOUNDS[i]) { - exponent -= SHIFTS[i]; - mantissa <<= SHIFTS[i]; - } - } + const uint64_t hi_bits = static_cast(mantissa >> 64); + const int shift = hi_bits ? (clz(hi_bits) - 15) + : (clz(static_cast(mantissa)) + 49); + exponent -= shift; + mantissa <<= shift; } #endif diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h --- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h +++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h @@ -18,21 +18,11 @@ namespace x86 { inline void normalize(int &exponent, __uint128_t &mantissa) { - // Use binary search to shift the leading 1 bit similar to float. - // With MantissaWidth = 63, it will take - // ceil(log2(63)) = 6 steps checking the mantissa bits. - constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth)) - constexpr __uint128_t BOUNDS[NSTEPS] = { - __uint128_t(1) << 32, __uint128_t(1) << 48, __uint128_t(1) << 56, - __uint128_t(1) << 60, __uint128_t(1) << 62, __uint128_t(1) << 63}; - constexpr int SHIFTS[NSTEPS] = {32, 16, 8, 4, 2, 1}; - - for (int i = 0; i < NSTEPS; ++i) { - if (mantissa < BOUNDS[i]) { - exponent -= SHIFTS[i]; - mantissa <<= SHIFTS[i]; - } - } + const int shift = + __builtin_clzll(static_cast(mantissa)) - + (8 * sizeof(uint64_t) - 1 - MantissaWidth::VALUE); + exponent -= shift; + mantissa <<= shift; } // if constexpr statement in sqrt.h still requires x86::sqrt to be declared