Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3695,6 +3695,9 @@
                          SDValue N1, ISD::CondCode Cond,
                          DAGCombinerInfo &DCI, const SDLoc &DL) const;
+
+  SDValue BuildUREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+                          DAGCombinerInfo &DCI, const SDLoc &DL) const;
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2774,6 +2774,18 @@
       return V;
   }
 
+  // Fold remainder of division by a constant.
+  if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
+      (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+    AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+
+    // When division is cheap or optimizing for minimum size,
+    // fall through to DIVREM creation by skipping this fold.
+    if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
+      if (SDValue Folded = BuildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
+        return Folded;
+  }
+
   // Fold away ALL boolean setcc's.
   SDValue Temp;
   if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
@@ -3703,6 +3715,80 @@
   return DAG.getSelect(dl, VT, IsOne, N0, Q);
 }
 
+/// Given an ISD::UREM node used only by an ISD::SETEQ or ISD::SETNE node,
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::BuildUREMEqFold(EVT VT, SDValue REMNode,
+                                        SDValue CompNode, ISD::CondCode Cond,
+                                        DAGCombinerInfo &DCI,
+                                        const SDLoc &DL) const {
+  // fold (seteq/ne (urem N, D), 0) -> (setule/setugt (rotr (mul N, P), K), Q)
+  // - D must be constant with D = D0 * 2^K where D0 is odd
+  // - P is the multiplicative inverse of D0 modulo 2^W
+  // - Q = floor((2^W - 1) / D0)
+  // where W is the width of the common type of N and D
+  SelectionDAG &DAG = DCI.DAG;
+  EVT REMVT = REMNode->getValueType(0);
+  bool IsEq = (Cond == ISD::SETEQ);
+
+  if (!isTypeLegal(REMVT))
+    return SDValue();
+
+  // TODO: Add non-uniform constant support.
+  ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
+  ConstantSDNode *CompTarget = isConstOrConstSplat(CompNode);
+  if (!Divisor || !CompTarget || Divisor->isNullValue() ||
+      !CompTarget->isNullValue())
+    return SDValue();
+
+  APInt D = Divisor->getAPIntValue();
+  unsigned W = D.getBitWidth();
+
+  // Rewrite D = D0 * 2^K.
+  unsigned K = D.countTrailingZeros();
+  APInt D0 = D.lshr(K);
+
+  // If D0 == 1, we cannot build this fold. Otherwise, D0 > 1, as needed.
+  if (D0.isOneValue())
+    return SDValue();
+
+  // Calculate the multiplicative inverse P of D0 using Newton's method.
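+  // Starting from P = D0 is valid because D0 * D0 == 1 (mod 8) for any odd
+  // D0, so P begins with at least three correct low-order bits, and each
+  // step P <- P * (2 - D0 * P) doubles the number of correct bits. The loop
+  // therefore terminates after O(log2 W) iterations.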
+  APInt tmp;
+  APInt P = D0;
+  while ((tmp = D0 * P) != 1)
+    P *= APInt(D0.getBitWidth(), 2) - tmp;
+
+  // Q = floor((2^W - 1) / D0)
+  APInt Q = APInt::getAllOnesValue(W);
+  Q = Q.udiv(D0);
+
+  SDValue PVal = DAG.getConstant(P, DL, REMVT);
+  SDValue QVal = DAG.getConstant(Q, DL, REMVT);
+  // (mul N, P)
+  SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal);
+  DCI.AddToWorklist(Op1.getNode());
+  // Will change in the signed case.
+  SDValue Op2 = QVal;
+
+  // Rotate right only if D was even and thus shifted (K > 0).
+  if (K) {
+    SDValue ShAmt =
+        DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout()));
+    SDNodeFlags Flags;
+    Flags.setExact(true);
+    // UREM: (rotr (mul N, P), K)
+    // SREM: (rotr (add (mul N, P), Q'), K)
+    Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt, Flags);
+    DCI.AddToWorklist(Op1.getNode());
+  }
+
+  // UREM: (setule/setugt (rotr (mul N, P), K), Q)
+  Op1 = DAG.getSetCC(DL, VT, Op1, Op2, (IsEq ? ISD::SETULE : ISD::SETUGT));
+  return Op1;
+}
+
 bool TargetLowering::
 verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
   if (!isa<ConstantSDNode>(Op.getOperand(0))) {
Index: test/CodeGen/AArch64/urem-seteq-optsize.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/urem-seteq-optsize.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=aarch64-eabi | FileCheck %s
+
+; On AArch64, division is expensive. BuildUREMEqFold should therefore run even
+; when optimizing for size. Only optimizing for minimum size retains a plain div.
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; CHECK: udiv
+; CHECK-NOT: mul
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; CHECK: mul
+; CHECK-NOT: udiv
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
Index: test/CodeGen/AArch64/urem-seteq.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/urem-seteq.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-eabi | FileCheck %s
+
+; This tests the BuildUREMEqFold optimization with UREM, i32, odd divisor, SETEQ.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)]
+; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460]
+define i32 @test_urem_odd(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #52429
+; CHECK-NEXT:    movk w8, #52428, lsl #16
+; CHECK-NEXT:    mov w9, #13108
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    movk w9, #13107, lsl #16
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 30 set.
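+; Here Q = floor((2^32 - 1) / 1073741827) = 3, so the expected code compares
+; against 4 rather than against a large constant.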
+define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd_bit30:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    movk w8, #27306, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    cmp w8, #4 // =4
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 1073741827
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 31 set.
+define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd_bit31:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    movk w8, #10922, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    cmp w8, #2 // =2
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 2147483651
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildUREMEqFold optimization with UREM, i16, even divisor, SETNE.
+; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087]
+; Q <- [Q >>rot K] <=> [Q >>rot 1]
+; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362]
+define i16 @test_urem_even(i16 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #28087
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    movk w9, #46811, lsl #16
+; CHECK-NEXT:    mul w8, w8, w9
+; CHECK-NEXT:    mov w9, #18724
+; CHECK-NEXT:    ror w8, w8, #1
+; CHECK-NEXT:    movk w9, #9362, lsl #16
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = zext i1 %cmp to i16
+  ret i16 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 30 set.
+define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even_bit30:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #20165
+; CHECK-NEXT:    movk w8, #64748, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    ror w8, w8, #3
+; CHECK-NEXT:    cmp w8, #32 // =32
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 31 set.
+define i32 @test_urem_even_bit31(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even_bit31:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #64251
+; CHECK-NEXT:    movk w8, #47866, lsl #16
+; CHECK-NEXT:    mul w8, w0, w8
+; CHECK-NEXT:    ror w8, w8, #1
+; CHECK-NEXT:    cmp w8, #4 // =4
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 2147483750
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We should not proceed with this fold if the divisor is 1 or -1.
+define i32 @test_urem_one(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_one:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr w0, wzr, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 1
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere;
+; also, BuildUREMEqFold does not work when the only odd factor of the divisor is 1.
+; This ensures we don't touch powers of two.
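+; (16 = 1 * 2^4, so D0 = 1 here, which is exactly the case BuildUREMEqFold
+; rejects.)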
+define i32 @test_urem_pow2(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_pow2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    tst w0, #0xf
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %0 = urem i32 %X, 16
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -236,13 +236,11 @@
 ; CHECK-NEXT:    jne .LBB12_8
 ; CHECK-NEXT:  # %bb.4: # %if.end29
 ; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT:    shrl $19, %ecx
-; CHECK-NEXT:    addl %ecx, %ecx
-; CHECK-NEXT:    leal (%ecx,%ecx,4), %ecx
-; CHECK-NEXT:    cmpw %cx, %ax
-; CHECK-NEXT:    jne .LBB12_5
+; CHECK-NEXT:    imull $-13107, %eax, %eax
+; CHECK-NEXT:    rorw $1, %ax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    cmpl $13108, %eax
+; CHECK-NEXT:    jae .LBB12_5
 ; CHECK-NEXT:  .LBB12_8: # %if.then44
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
Index: test/CodeGen/X86/urem-seteq-optsize.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/urem-seteq-optsize.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64,NOBMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi2 < %s | FileCheck %s --check-prefixes=CHECK,X64,BMI2
+
+; On X86, division is expensive. BuildUREMEqFold should therefore run even
+; when optimizing for size. Only optimizing for minimum size retains a plain div.
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; CHECK: divl
+; CHECK-NOT: imull
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; CHECK: imull
+; CHECK-NOT: divl
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
Index: test/CodeGen/X86/urem-seteq.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/urem-seteq.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,X64,NOBMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi2 < %s | FileCheck %s --check-prefixes=CHECK,X64,BMI2
+
+; This tests the BuildUREMEqFold optimization with UREM, i32, odd divisor, SETEQ.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)]
+; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460]
+define i32 @test_urem_odd(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_odd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $858993460, %ecx # imm = 0x33333334
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_odd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $858993460, %ecx # imm = 0x33333334
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 30 set.
+define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_odd_bit30:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $4, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_odd_bit30:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $1789569707, %edi, %ecx # imm = 0x6AAAAAAB
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $4, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 1073741827
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 31 set.
+define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_odd_bit31:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $715827883, {{[0-9]+}}(%esp), %ecx # imm = 0x2AAAAAAB
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $2, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_odd_bit31:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $715827883, %edi, %ecx # imm = 0x2AAAAAAB
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $2, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 2147483651
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildUREMEqFold optimization with UREM, i16, even divisor, SETNE.
+; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087]
+; Q <- [Q >>rot K] <=> [Q >>rot 1]
+; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362]
+define i16 @test_urem_even(i16 %X) nounwind readnone {
+; X86-LABEL: test_urem_even:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7
+; X86-NEXT:    rorw $1, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $9362, %ecx # imm = 0x2492
+; X86-NEXT:    seta %al
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_even:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $28087, %edi, %eax # imm = 0x6DB7
+; X64-NEXT:    rorw $1, %ax
+; X64-NEXT:    movzwl %ax, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $9362, %ecx # imm = 0x2492
+; X64-NEXT:    seta %al
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = zext i1 %cmp to i16
+  ret i16 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 30 set.
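+; 1073741928 = 134217741 * 2^3, so K = 3 (the rotate amount below) and
+; Q = floor((2^32 - 1) / 134217741) = 31 (hence the compare against 32).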
+define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_even_bit30:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5
+; X86-NEXT:    rorl $3, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $32, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_even_bit30:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $-51622203, %edi, %ecx # imm = 0xFCEC4EC5
+; X64-NEXT:    rorl $3, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $32, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 31 set.
+define i32 @test_urem_even_bit31(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_even_bit31:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    imull $-1157956869, {{[0-9]+}}(%esp), %ecx # imm = 0xBAFAFAFB
+; X86-NEXT:    rorl $1, %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $4, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_even_bit31:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    imull $-1157956869, %edi, %ecx # imm = 0xBAFAFAFB
+; X64-NEXT:    rorl $1, %ecx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $4, %ecx
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+
+entry:
+  %0 = urem i32 %X, 2147483750
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We should not proceed with this fold if the divisor is 1 or -1.
+define i32 @test_urem_one(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_one:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = urem i32 %X, 1
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere;
+; also, BuildUREMEqFold does not work when the only odd factor of the divisor is 1.
+; This ensures we don't touch powers of two.
+define i32 @test_urem_pow2(i32 %X) nounwind readnone {
+; X86-LABEL: test_urem_pow2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testb $15, {{[0-9]+}}(%esp)
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_urem_pow2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $15, %dil
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+entry:
+  %0 = urem i32 %X, 16
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
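
As a sanity check on the constants above, here is a small standalone C++ sketch (not part of the patch; written for this review, with names of my choosing) that brute-forces the i16 even-divisor case from test_urem_even, D = 14 = 7 * 2^1. It reproduces the multiplier 28087 and the bound 9362 that appear in the CHECK lines:

// Brute-force check of: (N % 14 == 0)  <=>  (rotr(N * P, 1) <= Q)  in 16 bits.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t W = 16, Mask = 0xFFFF;
  const uint32_t D0 = 7, K = 1; // D = D0 * 2^K = 14

  // Multiplicative inverse P of D0 modulo 2^16, computed with the same
  // Newton iteration BuildUREMEqFold uses.
  uint32_t P = D0;
  while (((D0 * P) & Mask) != 1)
    P = (P * (2 - D0 * P)) & Mask;
  assert(P == 28087); // the multiplier the tests expect

  const uint32_t Q = Mask / D0; // floor((2^16 - 1) / 7) = 9362

  for (uint32_t N = 0; N <= Mask; ++N) {
    uint32_t R = (N * P) & Mask;
    R = ((R >> K) | (R << (W - K))) & Mask; // rotate right by K
    assert((N % 14 == 0) == (R <= Q));
  }
  return 0;
}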