Index: compiler-rt/trunk/lib/arm/udivmodsi4.S =================================================================== --- compiler-rt/trunk/lib/arm/udivmodsi4.S +++ compiler-rt/trunk/lib/arm/udivmodsi4.S @@ -8,89 +8,161 @@ *===----------------------------------------------------------------------===// * * This file implements the __udivmodsi4 (32-bit unsigned integer divide and - * modulus) function for the ARM architecture. A naive digit-by-digit - * computation is employed for simplicity. + * modulus) function for the ARM 32-bit architecture. * *===----------------------------------------------------------------------===*/ #include "../assembly.h" -#define ESTABLISH_FRAME \ - push {r4, r7, lr} ;\ - add r7, sp, #4 -#define CLEAR_FRAME_AND_RETURN \ - pop {r4, r7, pc} - -#define a r0 -#define b r1 -#define i r3 -#define r r4 -#define q ip -#define one lr + .syntax unified -.syntax unified -.align 3 +#ifdef ARM_HAS_BX +#define JMP(r) bx r +#else +#define JMP(r) mov pc, r +#endif + + .text + .arm + .p2align 2 DEFINE_COMPILERRT_FUNCTION(__udivmodsi4) #if __ARM_ARCH_EXT_IDIV__ tst r1, r1 - beq LOCAL_LABEL(divzero) + beq LOCAL_LABEL(divby0) mov r3, r0 udiv r0, r3, r1 mls r1, r0, r1, r3 str r1, [r2] bx lr -LOCAL_LABEL(divzero): - mov r0, #0 - bx lr #else -// We use a simple digit by digit algorithm; before we get into the actual -// divide loop, we must calculate the left-shift amount necessary to align -// the MSB of the divisor with that of the dividend (If this shift is -// negative, then the result is zero, and we early out). We also conjure a -// bit mask of 1 to use in constructing the quotient, and initialize the -// quotient to zero. - ESTABLISH_FRAME - clz r4, a - tst b, b // detect divide-by-zero - clz r3, b - mov q, #0 - beq LOCAL_LABEL(return) // return 0 if b is zero. 
- mov one, #1 - subs i, r3, r4 - blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b) - -LOCAL_LABEL(mainLoop): -// This loop basically implements the following: -// -// do { -// if (a >= b << i) { -// a -= b << i; -// q |= 1 << i; -// if (a == 0) break; -// } -// } while (--i) -// -// Note that this does not perform the final iteration (i == 0); by doing it -// this way, we can merge the two branches which is a substantial win for -// such a tight loop on current ARM architectures. - subs r, a, b, lsl i - itt hs - orrhs q, q,one, lsl i - movhs a, r - it ne - subsne i, i, #1 - bhi LOCAL_LABEL(mainLoop) - -// Do the final test subtraction and update of quotient (i == 0), as it is -// not performed in the main loop. - subs r, a, b - itt hs - orrhs q, #1 - movhs a, r - -LOCAL_LABEL(return): -// Store the remainder, and move the quotient to r0, then return. - str a, [r2] - mov r0, q - CLEAR_FRAME_AND_RETURN + cmp r1, #1 + bcc LOCAL_LABEL(divby0) + beq LOCAL_LABEL(divby1) + cmp r0, r1 + bcc LOCAL_LABEL(quotient0) + /* + * Implement division using binary long division algorithm. + * + * r0 is the numerator, r1 the denominator. + * + * The code before JMP computes the correct shift I, so that + * r0 and (r1 << I) have the highest bit set in the same position. + * At the time of JMP, ip := .Ldiv0block - 12 * I. + * This depends on the fixed instruction size of block. + * + * block(shift) implements the test-and-update-quotient core. + * It assumes (r0 << shift) can be computed without overflow and + * that (r0 << shift) < 2 * r1. The quotient is stored in r3. + */ + +# ifdef __ARM_FEATURE_CLZ + clz ip, r0 + clz r3, r1 + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */ + sub r3, r3, ip + adr ip, LOCAL_LABEL(div0block) + sub ip, ip, r3, lsl #2 + sub ip, ip, r3, lsl #3 + mov r3, #0 + bx ip +# else + str r4, [sp, #-8]! 
+ + mov r4, r0 + adr ip, LOCAL_LABEL(div0block) + + lsr r3, r4, #16 + cmp r3, r1 + movhs r4, r3 + subhs ip, ip, #(16 * 12) + + lsr r3, r4, #8 + cmp r3, r1 + movhs r4, r3 + subhs ip, ip, #(8 * 12) + + lsr r3, r4, #4 + cmp r3, r1 + movhs r4, r3 + subhs ip, #(4 * 12) + + lsr r3, r4, #2 + cmp r3, r1 + movhs r4, r3 + subhs ip, ip, #(2 * 12) + + /* Last block, no need to update r3 or r4. */ + cmp r1, r4, lsr #1 + subls ip, ip, #(1 * 12) + + ldr r4, [sp], #8 /* restore r4, we are done with it. */ + mov r3, #0 + + JMP(ip) +# endif + +#define IMM # + +#define block(shift) \ + cmp r0, r1, lsl IMM shift; \ + addhs r3, r3, IMM (1 << shift); \ + subhs r0, r0, r1, lsl IMM shift + + block(31) + block(30) + block(29) + block(28) + block(27) + block(26) + block(25) + block(24) + block(23) + block(22) + block(21) + block(20) + block(19) + block(18) + block(17) + block(16) + block(15) + block(14) + block(13) + block(12) + block(11) + block(10) + block(9) + block(8) + block(7) + block(6) + block(5) + block(4) + block(3) + block(2) + block(1) +LOCAL_LABEL(div0block): + block(0) + + str r0, [r2] + mov r0, r3 + JMP(lr) + +LOCAL_LABEL(quotient0): + str r0, [r2] + mov r0, #0 + JMP(lr) + +LOCAL_LABEL(divby1): + mov r3, #0 + str r3, [r2] + JMP(lr) +#endif /* __ARM_ARCH_EXT_IDIV__ */ + +LOCAL_LABEL(divby0): + mov r0, #0 +#ifdef __ARM_EABI__ + b __aeabi_idiv0 +#else + JMP(lr) #endif + +END_COMPILERRT_FUNCTION(__udivmodsi4) Index: compiler-rt/trunk/lib/arm/udivsi3.S =================================================================== --- compiler-rt/trunk/lib/arm/udivsi3.S +++ compiler-rt/trunk/lib/arm/udivsi3.S @@ -1,4 +1,4 @@ -/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===// +/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===// * * The LLVM Compiler Infrastructure * @@ -7,87 +7,151 @@ * *===----------------------------------------------------------------------===// * - * This file implements the __udivsi3 (32-bit unsigned integer
divide) - * function for the ARM architecture. A naive digit-by-digit computation is - * employed for simplicity. + * This file implements the __udivsi3 (32-bit unsigned integer divide) + * function for the ARM 32-bit architecture. * *===----------------------------------------------------------------------===*/ #include "../assembly.h" -#define ESTABLISH_FRAME \ - push {r7, lr} ;\ - mov r7, sp -#define CLEAR_FRAME_AND_RETURN \ - pop {r7, pc} - -#define a r0 -#define b r1 -#define r r2 -#define i r3 -#define q ip -#define one lr - -.syntax unified -.align 3 -// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine. + .syntax unified + +#ifdef ARM_HAS_BX +#define JMP(r) bx r +#define JMPc(r,c) bx##c r +#else +#define JMP(r) mov pc, r +#define JMPc(r,c) mov##c pc, r +#endif + + .text + .arm + .p2align 2 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3) DEFINE_COMPILERRT_FUNCTION(__udivsi3) #if __ARM_ARCH_EXT_IDIV__ - tst r1,r1 - beq LOCAL_LABEL(divzero) - udiv r0, r0, r1 + tst r1, r1 + beq LOCAL_LABEL(divby0) + mov r3, r0 + udiv r0, r3, r1 + mls r1, r0, r1, r3 bx lr - LOCAL_LABEL(divzero): - mov r0,#0 - bx lr #else -// We use a simple digit by digit algorithm; before we get into the actual -// divide loop, we must calculate the left-shift amount necessary to align -// the MSB of the divisor with that of the dividend (If this shift is -// negative, then the result is zero, and we early out). We also conjure a -// bit mask of 1 to use in constructing the quotient, and initialize the -// quotient to zero. - ESTABLISH_FRAME - clz r2, a - tst b, b // detect divide-by-zero - clz r3, b - mov q, #0 - beq LOCAL_LABEL(return) // return 0 if b is zero. 
- mov one, #1 - subs i, r3, r2 - blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b) - -LOCAL_LABEL(mainLoop): -// This loop basically implements the following: -// -// do { -// if (a >= b << i) { -// a -= b << i; -// q |= 1 << i; -// if (a == 0) break; -// } -// } while (--i) -// -// Note that this does not perform the final iteration (i == 0); by doing it -// this way, we can merge the two branches which is a substantial win for -// such a tight loop on current ARM architectures. - subs r, a, b, lsl i - itt hs - orrhs q, q,one, lsl i - movhs a, r - it ne - subsne i, i, #1 - bhi LOCAL_LABEL(mainLoop) - -// Do the final test subtraction and update of quotient (i == 0), as it is -// not performed in the main loop. - subs r, a, b - it hs - orrhs q, #1 - -LOCAL_LABEL(return): -// Move the quotient to r0 and return. - mov r0, q - CLEAR_FRAME_AND_RETURN + cmp r1, #1 + bcc LOCAL_LABEL(divby0) + JMPc(lr, eq) + cmp r0, r1 + movcc r0, #0 + JMPc(lr, cc) + /* + * Implement division using binary long division algorithm. + * + * r0 is the numerator, r1 the denominator. + * + * The code before JMP computes the correct shift I, so that + * r0 and (r1 << I) have the highest bit set in the same position. + * At the time of JMP, ip := .Ldiv0block - 12 * I. + * This depends on the fixed instruction size of block. + * + * block(shift) implements the test-and-update-quotient core. + * It assumes (r0 << shift) can be computed without overflow and + * that (r0 << shift) < 2 * r1. The quotient is stored in r3. + */ + +# ifdef __ARM_FEATURE_CLZ + clz ip, r0 + clz r3, r1 + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. 
*/ + sub r3, r3, ip + adr ip, LOCAL_LABEL(div0block) + sub ip, ip, r3, lsl #2 + sub ip, ip, r3, lsl #3 + mov r3, #0 + bx ip +# else + mov r2, r0 + adr ip, LOCAL_LABEL(div0block) + + lsr r3, r2, #16 + cmp r3, r1 + movhs r2, r3 + subhs ip, ip, #(16 * 12) + + lsr r3, r2, #8 + cmp r3, r1 + movhs r2, r3 + subhs ip, ip, #(8 * 12) + + lsr r3, r2, #4 + cmp r3, r1 + movhs r2, r3 + subhs ip, #(4 * 12) + + lsr r3, r2, #2 + cmp r3, r1 + movhs r2, r3 + subhs ip, ip, #(2 * 12) + + /* Last block, no need to update r2 or r3. */ + cmp r1, r2, lsr #1 + subls ip, ip, #(1 * 12) + + mov r3, #0 + + JMP(ip) +# endif + +#define IMM # + +#define block(shift) \ + cmp r0, r1, lsl IMM shift; \ + addhs r3, r3, IMM (1 << shift); \ + subhs r0, r0, r1, lsl IMM shift + + block(31) + block(30) + block(29) + block(28) + block(27) + block(26) + block(25) + block(24) + block(23) + block(22) + block(21) + block(20) + block(19) + block(18) + block(17) + block(16) + block(15) + block(14) + block(13) + block(12) + block(11) + block(10) + block(9) + block(8) + block(7) + block(6) + block(5) + block(4) + block(3) + block(2) + block(1) +LOCAL_LABEL(div0block): + block(0) + + mov r0, r3 + JMP(lr) +#endif /* __ARM_ARCH_EXT_IDIV__ */ + +LOCAL_LABEL(divby0): + mov r0, #0 +#ifdef __ARM_EABI__ + b __aeabi_idiv0 +#else + JMP(lr) #endif + +END_COMPILERRT_FUNCTION(__udivsi3) Index: compiler-rt/trunk/lib/arm/umodsi3.S =================================================================== --- compiler-rt/trunk/lib/arm/umodsi3.S +++ compiler-rt/trunk/lib/arm/umodsi3.S @@ -1,4 +1,4 @@ -/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===// +/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===// * * The LLVM Compiler Infrastructure * @@ -7,66 +7,144 @@ * *===----------------------------------------------------------------------===// * - * This file implements the __umodsi3 (32-bit unsigned integer modulus) - * function for the ARM architecture.
A naive digit-by-digit computation is - * employed for simplicity. + * This file implements the __umodsi3 (32-bit unsigned integer modulus) + * function for the ARM 32-bit architecture. * *===----------------------------------------------------------------------===*/ #include "../assembly.h" -#define a r0 -#define b r1 -#define r r2 -#define i r3 + .syntax unified -.syntax unified -.align 3 +#ifdef ARM_HAS_BX +#define JMP(r) bx r +#define JMPc(r,c) bx##c r +#else +#define JMP(r) mov pc, r +#define JMPc(r,c) mov##c pc, r +#endif + + .text + .arm + .p2align 2 DEFINE_COMPILERRT_FUNCTION(__umodsi3) #if __ARM_ARCH_EXT_IDIV__ tst r1, r1 - beq LOCAL_LABEL(divzero) - udiv r2, r0, r1 - mls r0, r2, r1, r0 - bx lr -LOCAL_LABEL(divzero): - mov r0, #0 - bx lr + beq LOCAL_LABEL(divby0) + mov r3, r0 + udiv r0, r3, r1 + mls r1, r0, r1, r3 + str r1, [r2] /* NOTE(review): __umodsi3(a, b) takes no remainder pointer, so r2 is an uninitialized scratch register here, and r0 is left holding the quotient rather than the modulus. This IDIV path looks copy-pasted from __udivmodsi4; the removed lines (udiv r2, r0, r1 / mls r0, r2, r1, r0 / bx lr) were correct -- confirm and restore. */ + bx lr +#else + cmp r1, #1 + bcc LOCAL_LABEL(divby0) + moveq r0, #0 + JMPc(lr, eq) + cmp r0, r1 + JMPc(lr, cc) + /* + * Implement modulus using the binary long division algorithm. + * + * r0 is the numerator, r1 the denominator. + * + * The code before JMP computes the correct shift I, so that + * r0 and (r1 << I) have the highest bit set in the same position. + * At the time of JMP, ip := .Ldiv0block - 8 * I. + * This depends on the fixed instruction size of block. + * + * block(shift) implements the test-and-subtract core: no quotient + * is accumulated here, only the running remainder is kept in r0. It + * assumes (r0 << shift) can be computed without overflow and that (r0 << shift) < 2 * r1. + */ + +# ifdef __ARM_FEATURE_CLZ + clz ip, r0 + clz r3, r1 + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. 
*/ + sub r3, r3, ip + adr ip, LOCAL_LABEL(div0block) + sub ip, ip, r3, lsl #3 + bx ip +# else + mov r2, r0 + adr ip, LOCAL_LABEL(div0block) + + lsr r3, r2, #16 + cmp r3, r1 + movhs r2, r3 + subhs ip, ip, #(16 * 8) + + lsr r3, r2, #8 + cmp r3, r1 + movhs r2, r3 + subhs ip, ip, #(8 * 8) + + lsr r3, r2, #4 + cmp r3, r1 + movhs r2, r3 + subhs ip, #(4 * 8) + + lsr r3, r2, #2 + cmp r3, r1 + movhs r2, r3 + subhs ip, ip, #(2 * 8) + + /* Last block, no need to update r2 or r3. */ + cmp r1, r2, lsr #1 + subls ip, ip, #(1 * 8) + + JMP(ip) +# endif + +#define IMM # + +#define block(shift) \ + cmp r0, r1, lsl IMM shift; \ + subhs r0, r0, r1, lsl IMM shift + + block(31) + block(30) + block(29) + block(28) + block(27) + block(26) + block(25) + block(24) + block(23) + block(22) + block(21) + block(20) + block(19) + block(18) + block(17) + block(16) + block(15) + block(14) + block(13) + block(12) + block(11) + block(10) + block(9) + block(8) + block(7) + block(6) + block(5) + block(4) + block(3) + block(2) + block(1) +LOCAL_LABEL(div0block): + block(0) + JMP(lr) +#endif /* __ARM_ARCH_EXT_IDIV__ */ + +LOCAL_LABEL(divby0): + mov r0, #0 +#ifdef __ARM_EABI__ + b __aeabi_idiv0 #else -// We use a simple digit by digit algorithm; before we get into the actual -// divide loop, we must calculate the left-shift amount necessary to align -// the MSB of the divisor with that of the dividend. - clz r2, a - tst b, b // detect b == 0 - clz r3, b - bxeq lr // return a if b == 0 - subs i, r3, r2 - bxlt lr // return a if MSB(a) < MSB(b) - -LOCAL_LABEL(mainLoop): -// This loop basically implements the following: -// -// do { -// if (a >= b << i) { -// a -= b << i; -// if (a == 0) break; -// } -// } while (--i) -// -// Note that this does not perform the final iteration (i == 0); by doing it -// this way, we can merge the two branches which is a substantial win for -// such a tight loop on current ARM architectures. 
- subs r, a, b, lsl i - it hs - movhs a, r - it ne - subsne i, i, #1 - bhi LOCAL_LABEL(mainLoop) - -// Do the final test subtraction and update of remainder (i == 0), as it is -// not performed in the main loop. - subs r, a, b - it hs - movhs a, r - bx lr + JMP(lr) #endif + +END_COMPILERRT_FUNCTION(__umodsi3)