diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -342,6 +342,126 @@ return remainder; } + // Efficiently perform UInt / (x * 2^e), where x is a 32-bit unsigned integer, + // and return the remainder. + // The main idea is as follows: + // Let q = y / (x * 2^e) be the quotient, and + // r = y % (x * 2^e) be the remainder. + // First, notice that: + // r % (2^e) = y % (2^e), + // so we just need to focus on all the bits of y that are >= 2^e. + // To speed up the shift-and-add steps, we only use x as the divisor, and + // perform 32-bit shifts instead of bit-by-bit shifts. + // Since the remainder of each division step < x < 2^32, the computation of + // each step is now properly contained within uint64_t. + // And finally we perform some extra alignment steps for the remaining bits. + constexpr optional<UInt<Bits>> div_uint32_times_pow_2(uint32_t x, size_t e) { + UInt remainder(0); + + if (x == 0) { + return nullopt; + } + if (e >= Bits) { + remainder = *this; + *this = UInt(0); + return remainder; + } + + UInt quotient(0); + uint64_t x64 = static_cast<uint64_t>(x); + // lower64 = smallest multiple of 64 that is >= e. + size_t lower64 = ((e >> 6) + ((e & 63) != 0)) << 6; + // lower_pos is the index of the closest 64-bit chunk >= 2^e. + size_t lower_pos = lower64 / 64; + // Keep track of current remainder mod x * 2^(32*i) + uint64_t rem = 0; + // pos is the index of the current 64-bit chunk that we are processing. + size_t pos = WORDCOUNT; + + for (size_t q_pos = WORDCOUNT - lower_pos; q_pos > 0; --q_pos) { + // q_pos is 1 + the index of the current 64-bit chunk of the quotient + // being processed. + // Performing the division / modulus with divisor: + // x * 2^(64*q_pos - 32), + // i.e. using the upper 32-bit of the current 64-bit chunk. 
+ rem <<= 32; + rem += val[--pos] >> 32; + uint64_t q_tmp = rem / x64; + rem %= x64; + + // Performing the division / modulus with divisor: + // x * 2^(64*(q_pos - 1)), + // i.e. using the lower 32-bit of the current 64-bit chunk. + rem <<= 32; + rem += val[pos] & MASK32; + quotient.val[q_pos - 1] = (q_tmp << 32) + rem / x64; + rem %= x64; + } + + // So far, what we have is: + // quotient = y / (x * 2^lower64), and + // rem = (y % (x * 2^lower64)) / 2^lower64. + // If (lower64 > e), we will need to perform an extra adjustment of the + // quotient and remainder, namely: + // y / (x * 2^e) = [ y / (x * 2^lower64) ] * 2^(lower64 - e) + + // + (rem * 2^(lower64 - e)) / x + // (y % (x * 2^e)) / 2^e = (rem * 2^(lower64 - e)) % x + size_t last_shift = lower64 - e; + + if (last_shift > 0) { + // quotient * 2^(lower64 - e) + quotient <<= last_shift; + uint64_t q_tmp = 0; + uint64_t d = val[--pos]; + if (last_shift >= 32) { + // The shifting (rem * 2^(lower64 - e)) might overflow uint64_t, so we + // perform a 32-bit shift first. + rem <<= 32; + rem += d >> 32; + d &= MASK32; + q_tmp = rem / x64; + rem %= x64; + last_shift -= 32; + } else { + // Only use the upper 32-bit of the current 64-bit chunk. + d >>= 32; + } + + if (last_shift > 0) { + rem <<= 32; + rem += d; + q_tmp <<= last_shift; + x64 <<= 32 - last_shift; + q_tmp += rem / x64; + rem %= x64; + } + + quotient.val[0] += q_tmp; + + if (lower64 - e <= 32) { + // The remainder rem * 2^(lower64 - e) might overflow to the higher + // 64-bit chunk. + if (pos < WORDCOUNT - 1) { + remainder[pos + 1] = rem >> 32; + } + remainder[pos] = (rem << 32) + (val[pos] & MASK32); + } else { + remainder[pos] = rem; + } + + } else { + remainder[pos] = rem; + } + + // Set the remaining lower bits of the remainder. 
+ for (; pos > 0; --pos) { + remainder[pos - 1] = val[pos - 1]; + } + + *this = quotient; + return remainder; + } + constexpr UInt<Bits> operator/(const UInt<Bits> &other) const { UInt<Bits> result(*this); result.div(other); diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp --- a/libc/test/src/__support/uint_test.cpp +++ b/libc/test/src/__support/uint_test.cpp @@ -533,3 +533,30 @@ constexpr LL_UInt128 sub = LL_UInt128(5) - LL_UInt128(4); ASSERT_EQ(sub, LL_UInt128(1)); } + +#define TEST_QUICK_DIV_UINT32_POW2(x, e) \ + do { \ + LL_UInt320 y({0x8899aabbccddeeffULL, 0x0011223344556677ULL, \ + 0x583715f4d3b29171ULL, 0xffeeddccbbaa9988ULL, \ + 0x1f2f3f4f5f6f7f8fULL}); \ + LL_UInt320 d = LL_UInt320(x); \ + d <<= e; \ + LL_UInt320 q1 = y / d; \ + LL_UInt320 r1 = y % d; \ + LL_UInt320 r2 = *y.div_uint32_times_pow_2(x, e); \ + EXPECT_EQ(q1, y); \ + EXPECT_EQ(r1, r2); \ + } while (0) + +TEST(LlvmLibcUIntClassTest, DivUInt32TimesPow2Tests) { + for (size_t i = 0; i < 320; i += 32) { + TEST_QUICK_DIV_UINT32_POW2(1, i); + TEST_QUICK_DIV_UINT32_POW2(13151719, i); + } + + TEST_QUICK_DIV_UINT32_POW2(1, 75); + TEST_QUICK_DIV_UINT32_POW2(1, 101); + + TEST_QUICK_DIV_UINT32_POW2(1000000000, 75); + TEST_QUICK_DIV_UINT32_POW2(1000000000, 101); +}