diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c --- a/compiler-rt/lib/builtins/udivmodti4.c +++ b/compiler-rt/lib/builtins/udivmodti4.c @@ -14,182 +14,147 @@ #ifdef CRT_HAS_128BIT +// Returns the 128 bit division result by 64 bit. Result must fit in 64 bits. +// Remainder stored in r. +// Taken and adjusted from libdivide libdivide_128_div_64_to_64 division +// fallback. +UNUSED +static inline du_int udiv128by64to64simple(du_int U1, du_int U0, du_int V, + du_int *R) { + const unsigned NUdwordBits = sizeof(du_int) * CHAR_BIT; + const du_int B = (1ULL << (NUdwordBits / 2)); // Number base (32 bits) + du_int Un1, Un0; // Norm. dividend LSD's + du_int Vn1, Vn0; // Norm. divisor digits + du_int Q1, Q0; // Quotient digits + du_int Un64, Un21, Un10; // Dividend digit pairs + du_int Rhat; // A remainder + si_int S; // Shift amount for norm + + // count leading zeros + S = __builtin_clzll(V); + if (S > 0) { + // Normalize divisor + V = V << S; + Un64 = (U1 << S) | (U0 >> (NUdwordBits - S)); + Un10 = U0 << S; // Shift dividend left + } else { + // Avoid undefined behavior of (U0 >> 64). + // The behavior is undefined if the right operand is + // negative, or greater than or equal to the length + // in bits of the promoted left operand. + Un64 = U1; + Un10 = U0; + } + + // Break divisor up into two 32-bit digits + Vn1 = V >> (NUdwordBits / 2); + Vn0 = V & 0xFFFFFFFF; + + // Break right half of dividend into two digits + Un1 = Un10 >> (NUdwordBits / 2); + Un0 = Un10 & 0xFFFFFFFF; + + // Compute the first quotient digit, q1 + Q1 = Un64 / Vn1; + Rhat = Un64 - Q1 * Vn1; + + while (Q1 >= B || Q1 * Vn0 > B * Rhat + Un1) { + Q1 = Q1 - 1; + Rhat = Rhat + Vn1; + if (Rhat >= B) + break; + } + + // Multiply and subtract + Un21 = Un64 * B + Un1 - Q1 * V; + + // Compute the second quotient digit + Q0 = Un21 / Vn1; + Rhat = Un21 - Q0 * Vn1; + + while (Q0 >= B || Q0 * Vn0 > B * Rhat + Un0) { + Q0 = Q0 - 1; + Rhat = Rhat + Vn1; + if (Rhat >= B) + break; + } + + *R = (Un21 * B + Un0 - Q0 * V) >> S; + return Q1 * B + Q0; +} + +static inline du_int udiv128by64to64(du_int U1, du_int U0, du_int V, + du_int *R) { +#if defined(__x86_64__) + du_int result; + __asm__("divq %[v]" + : "=a"(result), "=d"(*R) + : [ v ] "r"(V), "a"(U0), "d"(U1)); + return result; +#else + return udiv128by64to64simple(U1, U0, V, R); +#endif +} + // Effects: if rem != 0, *rem = a % b // Returns: a / b -// Translated from Figure 3-40 of The PowerPC Compiler Writer's Guide - -COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) { - const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT; - const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT; - utwords n; - n.all = a; - utwords d; - d.all = b; - utwords q; - utwords r; - unsigned sr; - // special cases, X is unknown, K != 0 - if (n.s.high == 0) { - if (d.s.high == 0) { - // 0 X - // --- - // 0 X - if (rem) - *rem = n.s.low % d.s.low; - return n.s.low / d.s.low; - } - // 0 X - // --- - // K X - if (rem) - *rem = n.s.low; +COMPILER_RT_ABI tu_int __udivmodti4(tu_int A, tu_int B, tu_int *Rem) { + const unsigned NUtwordBits = sizeof(tu_int) * CHAR_BIT; + utwords Dividend; + Dividend.all = A; + utwords Divisor; + Divisor.all = B; + utwords Quotient; + utwords Remainder; + if (Divisor.all > Dividend.all) { + if (Rem) + *Rem = Dividend.all; return 0; } - // n.s.high != 0 - if (d.s.low == 0) { - if (d.s.high == 0) { - // K X - // --- - // 0 0 - if (rem) - *rem = n.s.high % d.s.low; - return n.s.high / d.s.low; - } - // d.s.high != 0 - if (n.s.low == 0) { - // K 0 - // --- - // K 0 - if (rem) { - r.s.high = n.s.high % d.s.high; - r.s.low = 0; - *rem = r.all; - } - return n.s.high / d.s.high; - } - // K K - // --- - // K 0 - if ((d.s.high & (d.s.high - 1)) == 0) /* if d is a power of 2 */ { - if (rem) { - r.s.low = n.s.low; - r.s.high = n.s.high & (d.s.high - 1); - *rem = r.all; - } - return n.s.high >> __builtin_ctzll(d.s.high); - } - // K K - // --- - // K 0 - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 2 or sr large - if (sr > n_udword_bits - 2) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - // r.all = n.all >> sr; - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* d.s.low != 0 */ { - if (d.s.high == 0) { - // K X - // --- - // 0 K - if ((d.s.low & (d.s.low - 1)) == 0) /* if d is a power of 2 */ { - if (rem) - *rem = n.s.low & (d.s.low - 1); - if (d.s.low == 1) - return n.all; - sr = __builtin_ctzll(d.s.low); - q.s.high = n.s.high >> sr; - q.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - return q.all; - } - // K X - // --- - // 0 K - sr = 1 + n_udword_bits + __builtin_clzll(d.s.low) - - __builtin_clzll(n.s.high); - // 2 <= sr <= n_utword_bits - 1 - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - if (sr == n_udword_bits) { - q.s.low = 0; - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else if (sr < n_udword_bits) /* 2 <= sr <= n_udword_bits - 1 */ { - q.s.low = 0; - q.s.high = n.s.low << (n_udword_bits - sr); - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - } else /* n_udword_bits + 1 <= sr <= n_utword_bits - 1 */ { - q.s.low = n.s.low << (n_utword_bits - sr); - q.s.high = (n.s.high << (n_utword_bits - sr)) | - (n.s.low >> (sr - n_udword_bits)); - r.s.high = 0; - r.s.low = n.s.high >> (sr - n_udword_bits); - } + // Divide 128 bit by 64 bit. + if (Divisor.s.high == 0) { + Remainder.s.high = 0; + if (Dividend.s.high < Divisor.s.low) { + // The result fits in 64 bits. + Quotient.s.low = udiv128by64to64(Dividend.s.high, Dividend.s.low, + Divisor.s.low, &Remainder.s.low); + Quotient.s.high = 0; } else { - // K X - // --- - // K K - sr = __builtin_clzll(d.s.high) - __builtin_clzll(n.s.high); - // 0 <= sr <= n_udword_bits - 1 or sr large - if (sr > n_udword_bits - 1) { - if (rem) - *rem = n.all; - return 0; - } - ++sr; - // 1 <= sr <= n_udword_bits - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - q.s.low = 0; - if (sr == n_udword_bits) { - q.s.high = n.s.low; - r.s.high = 0; - r.s.low = n.s.high; - } else { - r.s.high = n.s.high >> sr; - r.s.low = (n.s.high << (n_udword_bits - sr)) | (n.s.low >> sr); - q.s.high = n.s.low << (n_udword_bits - sr); - } + // First, divide with the high part to get the remaider in Dividend.s.high + // After that Dividend.s.high < Divisor.s.low. + Quotient.s.high = Dividend.s.high / Divisor.s.low; + Dividend.s.high = Dividend.s.high % Divisor.s.low; + Quotient.s.low = udiv128by64to64(Dividend.s.high, Dividend.s.low, + Divisor.s.low, &Remainder.s.low); } + if (Rem) + *Rem = Remainder.all; + return Quotient.all; } - // Not a special case - // q and r are initialized with: - // q.all = n.all << (n_utword_bits - sr); - // r.all = n.all >> sr; - // 1 <= sr <= n_utword_bits - 1 - su_int carry = 0; - for (; sr > 0; --sr) { - // r:q = ((r:q) << 1) | carry - r.s.high = (r.s.high << 1) | (r.s.low >> (n_udword_bits - 1)); - r.s.low = (r.s.low << 1) | (q.s.high >> (n_udword_bits - 1)); - q.s.high = (q.s.high << 1) | (q.s.low >> (n_udword_bits - 1)); - q.s.low = (q.s.low << 1) | carry; - // carry = 0; - // if (r.all >= d.all) + // 0 <= Shift <= 63 + si_int Shift = + __builtin_clzll(Divisor.s.high) - __builtin_clzll(Dividend.s.high); + Divisor.all <<= Shift; + Quotient.s.high = 0; + Quotient.s.low = 0; + for (; Shift >= 0; --Shift) { + Quotient.s.low <<= 1; + // Branch free version of. + // if (Dividend.all >= Divisor.all) // { - // r.all -= d.all; - // carry = 1; + // Dividend.all -= Divisor.all; + // carry = 1; // } - const ti_int s = (ti_int)(d.all - r.all - 1) >> (n_utword_bits - 1); - carry = s & 1; - r.all -= d.all & s; + const ti_int S = + (ti_int)(Divisor.all - Dividend.all - 1) >> (NUtwordBits - 1); + Quotient.s.low |= S & 1; + Dividend.all -= Divisor.all & S; + Divisor.all >>= 1; } - q.all = (q.all << 1) | carry; - if (rem) - *rem = r.all; - return q.all; + if (Rem) + *Rem = Dividend.all; + return Quotient.all; } #endif // CRT_HAS_128BIT