diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c --- a/compiler-rt/lib/builtins/udivmodti4.c +++ b/compiler-rt/lib/builtins/udivmodti4.c @@ -14,84 +14,68 @@ #ifdef CRT_HAS_128BIT -// Returns the 128 bit division result by 64 bit. Result must fit in 64 bits. -// Remainder stored in r. -// Taken and adjusted from libdivide libdivide_128_div_64_to_64 division -// fallback. For a correctness proof see the reference for this algorithm -// in Knuth, Volume 2, section 4.3.1, Algorithm D. -UNUSED -static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, - du_int *r) { - const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; - const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits) - du_int un1, un0; // Norm. dividend LSD's - du_int vn1, vn0; // Norm. divisor digits - du_int q1, q0; // Quotient digits - du_int un64, un21, un10; // Dividend digit pairs - du_int rhat; // A remainder - si_int s; // Shift amount for normalization - - s = __builtin_clzll(v); - if (s > 0) { - // Normalize the divisor. - v = v << s; - un64 = (u1 << s) | (u0 >> (n_udword_bits - s)); - un10 = u0 << s; // Shift dividend left - } else { - // Avoid undefined behavior of (u0 >> 64). - un64 = u1; - un10 = u0; - } +#define RECIPROCAL_TABLE_ITEM(d) \ + (unsigned short)(0x7fd00 / (0x100 | (unsigned char)d)) - // Break divisor up into two 32-bit digits. - vn1 = v >> (n_udword_bits / 2); - vn0 = v & 0xFFFFFFFF; +#define REPEAT4(x) \ + RECIPROCAL_TABLE_ITEM((x) + 0), RECIPROCAL_TABLE_ITEM((x) + 1), \ + RECIPROCAL_TABLE_ITEM((x) + 2), RECIPROCAL_TABLE_ITEM((x) + 3) - // Break right half of dividend into two digits. - un1 = un10 >> (n_udword_bits / 2); - un0 = un10 & 0xFFFFFFFF; +#define REPEAT32(x) \ + REPEAT4((x) + 4 * 0), REPEAT4((x) + 4 * 1), REPEAT4((x) + 4 * 2), \ + REPEAT4((x) + 4 * 3), REPEAT4((x) + 4 * 4), REPEAT4((x) + 4 * 5), \ + REPEAT4((x) + 4 * 6), REPEAT4((x) + 4 * 7) - // Compute the first quotient digit, q1. 
- q1 = un64 / vn1; - rhat = un64 - q1 * vn1; +#define REPEAT256() \ + REPEAT32(32 * 0), REPEAT32(32 * 1), REPEAT32(32 * 2), REPEAT32(32 * 3), \ + REPEAT32(32 * 4), REPEAT32(32 * 5), REPEAT32(32 * 6), REPEAT32(32 * 7) - // q1 has at most error 2. No more than 2 iterations. - while (q1 >= b || q1 * vn0 > b * rhat + un1) { - q1 = q1 - 1; - rhat = rhat + vn1; - if (rhat >= b) - break; - } +// Reciprocal lookup table of 512 bytes. +static unsigned short kReciprocalTable[] = {REPEAT256()}; - un21 = un64 * b + un1 - q1 * v; +// Computes the reciprocal (2^128 - 1) / d - 2^64 for normalized d. +// Based on Algorithm 2 from "Improved division by invariant integers". +static inline du_int reciprocal2by1(du_int d) { - // Compute the second quotient digit. - q0 = un21 / vn1; - rhat = un21 - q0 * vn1; + const du_int d9 = d >> 55; + const su_int v0 = kReciprocalTable[d9 - 256]; - // q0 has at most error 2. No more than 2 iterations. - while (q0 >= b || q0 * vn0 > b * rhat + un0) { - q0 = q0 - 1; - rhat = rhat + vn1; - if (rhat >= b) - break; - } + const du_int d40 = (d >> 24) + 1; + const du_int v1 = (v0 << 11) - (su_int)(v0 * v0 * d40 >> 40) - 1; + + const du_int v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47); - *r = (un21 * b + un0 - q0 * v) >> s; - return q1 * b + q0; + const du_int d0 = d & 1; + const du_int d63 = (d >> 1) + d0; + const du_int e = ((v2 >> 1) & (0 - d0)) - v2 * d63; + const du_int v3 = (((tu_int)(v2)*e) >> 65) + (v2 << 31); + + const du_int v4 = v3 - ((((tu_int)(v3)*d) + d) >> 64) - d; + return v4; } -static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, - du_int *r) { -#if defined(__x86_64__) - du_int result; - __asm__("divq %[v]" - : "=a"(result), "=d"(*r) - : [ v ] "r"(v), "a"(u0), "d"(u1)); - return result; -#else - return udiv128by64to64default(u1, u0, v, r); -#endif +// Based on Algorithm 4 from "Improved division by invariant integers". 
+static inline du_int udivrem2by1(utwords dividend, du_int divisor, + du_int reciprocal, du_int *remainder) { + utwords quotient; + quotient.all = (tu_int)(reciprocal)*dividend.s.high; + quotient.all += dividend.all; + + ++quotient.s.high; + + *remainder = dividend.s.low - quotient.s.high * divisor; + + if (*remainder > quotient.s.low) { + --quotient.s.high; + *remainder += divisor; + } + + if (*remainder >= divisor) { + ++quotient.s.high; + *remainder -= divisor; + } + + return quotient.s.high; } // Effects: if rem != 0, *rem = a % b @@ -112,20 +96,26 @@ } // When the divisor fits in 64 bits, we can use an optimized path. if (divisor.s.high == 0) { + const du_int left_shift = __builtin_clzll(divisor.s.low); + const du_int right_shift = (64 - left_shift) % 64; + const du_int right_shift_mask = (du_int)(left_shift == 0) - 1; + utwords normalized_quotient; + normalized_quotient.s.low = + (dividend.s.high << left_shift) | + ((dividend.s.low >> right_shift) & right_shift_mask); + normalized_quotient.s.high = + (dividend.s.high >> right_shift) & right_shift_mask; + const du_int normalized_divisor = divisor.s.low << left_shift; + const du_int reciprocal = reciprocal2by1(normalized_divisor); + du_int normalized_remainder; + quotient.s.high = udivrem2by1(normalized_quotient, normalized_divisor, + reciprocal, &normalized_remainder); + normalized_quotient.s.high = normalized_remainder; + normalized_quotient.s.low = dividend.s.low << left_shift; remainder.s.high = 0; - if (dividend.s.high < divisor.s.low) { - // The result fits in 64 bits. - quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, - divisor.s.low, &remainder.s.low); - quotient.s.high = 0; - } else { - // First, divide with the high part to get the remainder in dividend.s.high. - // After that dividend.s.high < divisor.s.low. 
- quotient.s.high = dividend.s.high / divisor.s.low; - dividend.s.high = dividend.s.high % divisor.s.low; - quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, - divisor.s.low, &remainder.s.low); - } + quotient.s.low = udivrem2by1(normalized_quotient, normalized_divisor, + reciprocal, &remainder.s.low); + remainder.s.low >>= left_shift; if (rem) *rem = remainder.all; return quotient.all;