diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c --- a/compiler-rt/lib/builtins/udivmodti4.c +++ b/compiler-rt/lib/builtins/udivmodti4.c @@ -14,182 +14,145 @@ #ifdef CRT_HAS_128BIT +// Returns the 128 bit division result by 64 bit. Result must fit in 64 bits. +// Remainder stored in r. +// Taken and adjusted from libdivide libdivide_128_div_64_to_64 division +// fallback. For a correctness proof see the reference for this algorithm +// in Knuth, Volume 2, section 4.3.1, Algorithm D. +UNUSED +static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, + du_int *r) { + const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; + const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits) + du_int un1, un0; // Norm. dividend LSD's + du_int vn1, vn0; // Norm. divisor digits + du_int q1, q0; // Quotient digits + du_int un64, un21, un10; // Dividend digit pairs + du_int rhat; // A remainder + si_int s; // Shift amount for normalization + + s = __builtin_clzll(v); + if (s > 0) { + // Normalize the divisor. + v = v << s; + un64 = (u1 << s) | (u0 >> (n_udword_bits - s)); + un10 = u0 << s; // Shift dividend left + } else { + // Avoid undefined behavior of (u0 >> 64). + un64 = u1; + un10 = u0; + } + + // Break divisor up into two 32-bit digits. + vn1 = v >> (n_udword_bits / 2); + vn0 = v & 0xFFFFFFFF; + + // Break right half of dividend into two digits. + un1 = un10 >> (n_udword_bits / 2); + un0 = un10 & 0xFFFFFFFF; + + // Compute the first quotient digit, q1. + q1 = un64 / vn1; + rhat = un64 - q1 * vn1; + + // q1 has at most error 2. No more than 2 iterations. + while (q1 >= b || q1 * vn0 > b * rhat + un1) { + q1 = q1 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + un21 = un64 * b + un1 - q1 * v; + + // Compute the second quotient digit. + q0 = un21 / vn1; + rhat = un21 - q0 * vn1; + + // q0 has at most error 2. No more than 2 iterations. + while (q0 >= b || q0 * vn0 > b * rhat + un0) { + q0 = q0 - 1; + rhat = rhat + vn1; + if (rhat >= b) + break; + } + + *r = (un21 * b + un0 - q0 * v) >> s; + return q1 * b + q0; +} + +static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, + du_int *r) { +#if defined(__x86_64__) + du_int result; + __asm__("divq %[v]" + : "=a"(result), "=d"(*r) + : [ v ] "r"(v), "a"(u0), "d"(u1)); + return result; +#else + return udiv128by64to64default(u1, u0, v, r); +#endif +} + // Effects: if rem != 0, *rem = a % b // Returns: a / b -// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide - COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) { - const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT; - utwords n; - n.all = a; - utwords d; - d.all = b; - utwords q; - utwords r; - unsigned sr; - // special cases, X is unknown, K != 0 - if (n.s.high == 0) { - if (d.s.high == 0) { - // 0 X - // --- - // 0 X - if (rem) - *rem = n.s.low % d.s.low; - return n.s.low / d.s.low; - } - // 0 X - // --- - // K X + utwords dividend; + dividend.all = a; + utwords divisor; + divisor.all = b; + utwords quotient; + utwords remainder; + if (divisor.all > dividend.all) { if (rem) - *rem = n.s.low; + *rem = dividend.all; return 0; } - // n.s.high != 0 - if (d.s.low == 0) { - if (d.s.high == 0) { - // K X - // --- - // 0 0 - if (rem) - *rem = n.s.high % d.s.low; - return n.s.high / d.s.low; - } - // d.s.high != 0 - if (n.s.low == 0) { - // K 0 - // --- - // K 0 - if (rem) { - r.s.high = n.s.high % d.s.high; - r.s.low = 0; - *rem = r.all; - } - return n.s.high / d.s.high; - } - // K K - // --- - // K 0 - if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ { - if (rem) { - r.s.low = n.s.low; - r.s.high = n.s.high & (d.s.high - 1); - *rem = r.all; - } - return n.s.high >> __builtin_ctzll(d.s.high); - } - // K K - // --- - // K 0 - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 2 or sr large - if (sr > n_udword_bits - 2) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - // r.all = n.all >> sr; - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* d.s.low != 0 */ { - if (d.s.high == 0) { - // K X - // --- - // 0 K - if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ { - if (rem) - *rem = n.s.low & (d.s.low - 1); - if (d.s.low == 1) - return n.all; - sr = __builtin_ctzll(d.s.low); - q.s.high = n.s.high >> sr; - q.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - return q.all; - } - // K X - // --- - // 0 K - sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) - - __builtin_clzll(n.s.high); - // 2 <= sr <= n_utword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - if (sr == n_udword_bits) { - q.s.low = 0; - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ { - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ { - q.s.low = n.s.low << (n_utword_bits - sr); - q.s.high = (n.s.high << (n_utword_bits - sr)) | - (n.s.low >> (sr - n_udword_bits)); - r.s.high = 0; - r.s.low = n.s.high >> (sr - n_udword_bits); - } + // When the divisor fits in 64 bits, we can use an optimized path. + if (divisor.s.high == 0) { + remainder.s.high = 0; + if (dividend.s.high < divisor.s.low) { + // The result fits in 64 bits. + quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, + divisor.s.low, &remainder.s.low); + quotient.s.high = 0; } else { - // K X - // --- - // K K - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 1 or sr large - if (sr > n_udword_bits - 1) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - q.s.low = 0; - if (sr == n_udword_bits) { - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else { - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - q.s.high = n.s.low << (n_udword_bits - sr); - } + // First, divide with the high part to get the remainder in dividend.s.high. + // After that dividend.s.high < divisor.s.low. + quotient.s.high = dividend.s.high / divisor.s.low; + dividend.s.high = dividend.s.high % divisor.s.low; + quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low, + divisor.s.low, &remainder.s.low); } + if (rem) + *rem = remainder.all; + return quotient.all; } - // Not a special case - // q and r are initialized with: - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - // 1 <= sr <= n_utword_bits - 1 - su_int carry = 0; - for (; sr > 0; --sr) { - // r:q = ((r:q) << 1) | carry - r.s.high = (r.s.high << 1) | (r.s.low >> (n_udword_bits - 1)); - r.s.low = (r.s.low << 1) | (q.s.high >> (n_udword_bits - 1)); - q.s.high = (q.s.high << 1) | (q.s.low >> (n_udword_bits - 1)); - q.s.low = (q.s.low << 1) | carry; - // carry = 0; - // if (r.all >= d.all) + // 0 <= shift <= 63. + si_int shift = + __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high); + divisor.all <<= shift; + quotient.s.high = 0; + quotient.s.low = 0; + for (; shift >= 0; --shift) { + quotient.s.low <<= 1; + // Branch free version of. + // if (dividend.all >= divisor.all) // { - // r.all -= d.all; - // carry = 1; + // dividend.all -= divisor.all; + // carry = 1; // } - const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1); - carry = s & 1; - r.all -= d.all & s; + const ti_int s = + (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1); + quotient.s.low |= s & 1; + dividend.all -= divisor.all & s; + divisor.all >>= 1; } - q.all = (q.all << 1) | carry; if (rem) - *rem = r.all; - return q.all; + *rem = dividend.all; + return quotient.all; } #endif // CRT_HAS_128BIT