Diff 411902

libc/src/__support/FPUtil/generic/sqrt.h

	Show All 25 Lines
	};			};

	#if defined(SPECIAL_X86_LONG_DOUBLE)			#if defined(SPECIAL_X86_LONG_DOUBLE)
	template <> struct SpecialLongDouble<long double> {			template <> struct SpecialLongDouble<long double> {
	static constexpr bool VALUE = true;			static constexpr bool VALUE = true;
	};			};
	#endif // SPECIAL_X86_LONG_DOUBLE			#endif // SPECIAL_X86_LONG_DOUBLE

	template <typename T>			// The following overloads are matched based on what is accepted by
	static inline void normalize(int &exponent,			// __builtin_clz* rather than using the exactly-sized aliases from stdint.h.
	typename FPBits<T>::UIntType &mantissa);			// This way, we can avoid making any assumptions about integer sizes and let the
				// compiler match for us.
	template <> inline void normalize<float>(int &exponent, uint32_t &mantissa) {			template <typename T> static inline int clz(T val);
	// Use binary search to shift the leading 1 bit.			template <> inline int clz<unsigned int>(unsigned int val) {
	// With MantissaWidth<float> = 23, it will take			return __builtin_clz(val);
	// ceil(log2(23)) = 5 steps checking the mantissa bits as followed:
	// Step 1: 0000 0000 0000 XXXX XXXX XXXX
	// Step 2: 0000 00XX XXXX XXXX XXXX XXXX
	// Step 3: 000X XXXX XXXX XXXX XXXX XXXX
	// Step 4: 00XX XXXX XXXX XXXX XXXX XXXX
	// Step 5: 0XXX XXXX XXXX XXXX XXXX XXXX
	constexpr int NSTEPS = 5; // = ceil(log2(MantissaWidth))
	constexpr uint32_t BOUNDS[NSTEPS] = {1 << 12, 1 << 18, 1 << 21, 1 << 22,
	1 << 23};
	constexpr int SHIFTS[NSTEPS] = {12, 6, 3, 2, 1};

	for (int i = 0; i < NSTEPS; ++i) {
	if (mantissa < BOUNDS[i]) {
	exponent -= SHIFTS[i];
	mantissa <<= SHIFTS[i];
	}			}
				template <> inline int clz<unsigned long int>(unsigned long int val) {
				return __builtin_clzl(val);
	}			}
				template <> inline int clz<unsigned long long int>(unsigned long long int val) {
				return __builtin_clzll(val);
	}			}

	template <> inline void normalize<double>(int &exponent, uint64_t &mantissa) {			template <typename T>
	// Use binary search to shift the leading 1 bit similar to float.			static inline void normalize(int &exponent,
	// With MantissaWidth<double> = 52, it will take			typename FPBits<T>::UIntType &mantissa) {
	// ceil(log2(52)) = 6 steps checking the mantissa bits.			const int shift =
	constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth))			clz(mantissa) - (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
	constexpr uint64_t BOUNDS[NSTEPS] = {1ULL << 26, 1ULL << 39, 1ULL << 46,			exponent -= shift;
	1ULL << 49, 1ULL << 51, 1ULL << 52};			mantissa <<= shift;
	constexpr int SHIFTS[NSTEPS] = {27, 14, 7, 4, 2, 1};

	for (int i = 0; i < NSTEPS; ++i) {
	if (mantissa < BOUNDS[i]) {
	exponent -= SHIFTS[i];
	mantissa <<= SHIFTS[i];
	}
	}
	}			}

	#ifdef LONG_DOUBLE_IS_DOUBLE			#ifdef LONG_DOUBLE_IS_DOUBLE
	template <>			template <>
	inline void normalize<long double>(int &exponent, uint64_t &mantissa) {			inline void normalize<long double>(int &exponent, uint64_t &mantissa) {
	normalize<double>(exponent, mantissa);			normalize<double>(exponent, mantissa);
	}			}
	#elif !defined(SPECIAL_X86_LONG_DOUBLE)			#elif !defined(SPECIAL_X86_LONG_DOUBLE)
	template <>			template <>
	inline void normalize<long double>(int &exponent, __uint128_t &mantissa) {			inline void normalize<long double>(int &exponent, __uint128_t &mantissa) {
	// Use binary search to shift the leading 1 bit similar to float.			const uint64_t hi_bits = static_cast<uint64_t>(mantissa >> 64);
				lntueUnsubmitted Not Done Reply Inline Actions Can you do a simple perf test to see if using clz for 64 bits is faster than binary search? Something like: uint64_t hi_bits = static_cast<uint64_t>(mantissa >> 64); int shift = hi_bits ? (clz(hi_bits) - 15) : (clz(static_cast<uint64_t>(mantissa)) + 49); exponent -= shift; mantissa <<= shift; lntue: Can you do a simple perf test to see if using clz for 64 bits is faster than binary search?
				cratonicaAuthorUnsubmitted Not Done Reply Inline Actions Unfortunately there aren't any differential or performance tests for sqrt or sqrtl, so I'll need to add those first in a separate PR. It won't be too much work, just clone the ones for sqrtf. Also, I don't actually have a machine on which I can test 128-bit floats -- my machine uses the 80-bit x87 format, and aarch64 uses 64-bit for long double. cratonica: # Unfortunately there aren't any differential or performance tests for sqrt or sqrtl, so I'll…
				cratonicaAuthorUnsubmitted Done Reply Inline Actions Following up here: re: #1: It seems as though we only have perf tests for the float32 variants of math functions, because the input domain grows exponentially with bit width and becomes impractical to cover for 64-bit inputs. Trying to run an exhaustive performance test using float64 never completed even after an hour of waiting, and I can't imagine an exhaustive test for 128-bit inputs would complete even after days. So I'm going to write a one-off performance test that terminates after 2^24 iterations to test this, but I won't be checking it in. re: #2: The x87 80-bit variant uses a 64-bit mantissa, which means that __builtin_clzll will still work after truncation, so this is trivial to implement (and I have confirmed a slight performance increase here using the method mentioned above). I was incorrect that aarch64 uses 64-bit floats for long double, and I have access to some hardware with an aarch64 Cortex-A53 that I can run these performance tests on with the changes you mentioned. If performance is improved, then I will update the patch accordingly. cratonica: Following up here: re: #1: It seems as though we only have perf tests for the float32 variants…
				cratonicaAuthorUnsubmitted Done Reply Inline Actions Results for 128-bit floats (long double) from the aarch64 Cortex-A53 core in the denormal range: clz: 28894915309ns; binary search: 29253929397ns. So, just over a 1% performance improvement, which is in line with what I'm seeing on the 32-bit float sqrtf function. Therefore, I'm patching in that change (as well as the x87 80-bit specialization). cratonica: Results for 128-bit floats (long double) from the aarch64 Cortex-A53 core in denormal range…
	// With MantissaWidth<long double> = 112, it will take			const int shift = hi_bits ? (clz(hi_bits) - 15)
	// ceil(log2(112)) = 7 steps checking the mantissa bits.			: (clz(static_cast<uint64_t>(mantissa)) + 49);
	constexpr int NSTEPS = 7; // = ceil(log2(MantissaWidth))			exponent -= shift;
	constexpr __uint128_t BOUNDS[NSTEPS] = {			mantissa <<= shift;
	__uint128_t(1) << 56, __uint128_t(1) << 84, __uint128_t(1) << 98,
	__uint128_t(1) << 105, __uint128_t(1) << 109, __uint128_t(1) << 111,
	__uint128_t(1) << 112};
	constexpr int SHIFTS[NSTEPS] = {57, 29, 15, 8, 4, 2, 1};

	for (int i = 0; i < NSTEPS; ++i) {
	if (mantissa < BOUNDS[i]) {
	exponent -= SHIFTS[i];
	mantissa <<= SHIFTS[i];
	}
	}
	}			}
	#endif			#endif

	} // namespace internal			} // namespace internal

	// Correctly rounded IEEE 754 SQRT for all rounding modes.			// Correctly rounded IEEE 754 SQRT for all rounding modes.
	// Shift-and-add algorithm.			// Shift-and-add algorithm.
	template <typename T>			template <typename T>
	▲ Show 20 Lines • Show All 107 Lines • Show Last 20 Lines

libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h

	Show All 12 Lines
	#include "src/__support/FPUtil/FPBits.h"			#include "src/__support/FPUtil/FPBits.h"
	#include "src/__support/FPUtil/PlatformDefs.h"			#include "src/__support/FPUtil/PlatformDefs.h"

	namespace __llvm_libc {			namespace __llvm_libc {
	namespace fputil {			namespace fputil {
	namespace x86 {			namespace x86 {

	inline void normalize(int &exponent, __uint128_t &mantissa) {			inline void normalize(int &exponent, __uint128_t &mantissa) {
	// Use binary search to shift the leading 1 bit similar to float.			const int shift =
	// With MantissaWidth<long double> = 63, it will take			__builtin_clzll(static_cast<uint64_t>(mantissa)) -
	// ceil(log2(63)) = 6 steps checking the mantissa bits.			(8 * sizeof(uint64_t) - 1 - MantissaWidth<long double>::VALUE);
	constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth))			exponent -= shift;
	constexpr __uint128_t BOUNDS[NSTEPS] = {			mantissa <<= shift;
	__uint128_t(1) << 32, __uint128_t(1) << 48, __uint128_t(1) << 56,
	__uint128_t(1) << 60, __uint128_t(1) << 62, __uint128_t(1) << 63};
	constexpr int SHIFTS[NSTEPS] = {32, 16, 8, 4, 2, 1};

	for (int i = 0; i < NSTEPS; ++i) {
	if (mantissa < BOUNDS[i]) {
	exponent -= SHIFTS[i];
	mantissa <<= SHIFTS[i];
	}
	}
	}			}

	// if constexpr statement in sqrt.h still requires x86::sqrt to be declared			// if constexpr statement in sqrt.h still requires x86::sqrt to be declared
	// even when it's not used.			// even when it's not used.
	static inline long double sqrt(long double x);			static inline long double sqrt(long double x);

	// Correctly rounded SQRT for all rounding modes.			// Correctly rounded SQRT for all rounding modes.
	// Shift-and-add algorithm.			// Shift-and-add algorithm.
	▲ Show 20 Lines • Show All 108 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

Use __builtin_clz to find leading 1 in generic sqrt (where possible)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 411902

libc/src/__support/FPUtil/generic/sqrt.h

libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h

This is an archive of the discontinued LLVM Phabricator instance.

Use __builtin_clz to find leading 1 in generic sqrt (where possible)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 411902

libc/src/__support/FPUtil/generic/sqrt.h

libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h

Use __builtin_clz to find leading 1 in generic sqrt (where possible)
ClosedPublic