Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3495,6 +3495,13 @@
                     bool IsAfterLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
 
+  //===--------------------------------------------------------------------===//
+  // REM equality fold
+  //
+  SDValue BuildREMEqFold(SDNode *N, SDNode *CondNode,
+                         const APInt &Divisor, SelectionDAG &DAG,
+                         std::vector<SDNode *> &Created) const;
+
   /// Targets may override this function to provide custom SDIV lowering for
   /// power-of-2 denominators. If the target returns an empty SDValue, LLVM
   /// assumes SDIV is expensive and replaces it with a series of other integer
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -431,6 +431,7 @@
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
     SDValue BuildUDIV(SDNode *N);
+    SDValue BuildREMEqFold(SDNode *REMNode, SDNode *CondNode);
     SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
     SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
     SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
@@ -3368,6 +3369,33 @@
     }
   }
 
+  // fold (rem lhs, rhs) -> lhs when |lhs| < |rhs|
+  // FIXME: this fold is otherwise missed if BuildREMEqFold below succeeds,
+  // since the REM is then folded into its SETCC use. Ref: test1 in
+  // test/CodeGen/X86/jump_sign.ll
+  if (N1C) {
+    KnownBits LHSBits;
+    DAG.computeKnownBits(N0, LHSBits);
+    // For SREM, known leading zeros bound |LHS| only if LHS is non-negative.
+    if (!isSigned || LHSBits.isNonNegative()) {
+      unsigned TypeWidth = N0.getScalarValueSizeInBits();
+      unsigned LeadingZeros = LHSBits.countMinLeadingZeros();
+      // |RHS|, interpreted as an unsigned value (correct even for INT_MIN).
+      APInt N1Value = N1C->getAPIntValue();
+      if (isSigned && N1Value.isNegative())
+        N1Value = -N1Value;
+      // LHS is at most 2^(TypeWidth - LeadingZeros) - 1. If that is still
+      // smaller than |RHS|, the remainder is just LHS.
+      if (TypeWidth - LeadingZeros < N1Value.getActiveBits())
+        return N0;
+    }
+  }
+
+  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::SETCC) {
+    if (SDValue Folded = BuildREMEqFold(N, *N->use_begin()))
+      return Folded;
+  }
+
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
   // If X/C can be simplified by the division-by-constant logic, lower
@@ -18096,6 +18124,47 @@
   return S;
 }
 
+/// Given an ISD::UREM or ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
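+///
+/// For example, at W = 32 this turns (seteq (urem N, 5), 0) into
+/// (setule (mul N, 0xCCCCCCCD), 0x33333333): N is a multiple of 5 exactly
+/// when N * inverse(5) mod 2^32 lands in the bottom fifth of the range.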
+SDValue DAGCombiner::BuildREMEqFold(SDNode *REMNode, SDNode *CondNode) {
+  // A plain UREM/SREM is shorter than anything we would build here, so
+  // don't bother when optimizing for minimum size.
+  if (DAG.getMachineFunction().getFunction().optForMinSize())
+    return SDValue();
+
+  ConstantSDNode *N1C = isConstOrConstSplat(REMNode->getOperand(1));
+  ConstantSDNode *C = isConstOrConstSplat(CondNode->getOperand(1));
+  // Divisor and comparison target must be constant
+  if (!N1C || !C)
+    return SDValue();
+
+  // Divisor must be non-zero, comparison target must be zero
+  if (N1C->isNullValue() || !C->isNullValue())
+    return SDValue();
+
+  SDValue CondCodeNode = CondNode->getOperand(2);
+  ISD::CondCode CondCode = cast<CondCodeSDNode>(CondCodeNode)->get();
+  if (CondCode != ISD::SETEQ && CondCode != ISD::SETNE)
+    return SDValue();
+
+  std::vector<SDNode *> Built;
+  SDValue NewCond = TLI.BuildREMEqFold(REMNode, CondNode,
+                                       N1C->getAPIntValue(), DAG, Built);
+
+  if (NewCond) {
+    SDNode *Last = Built.back();
+    for (SDNode *N : Built)
+      AddToWorklist(N);
+    AddToWorklist(NewCond.getNode());
+    DAG.ReplaceAllUsesWith(SDValue(CondNode, 0), NewCond);
+    return SDValue(Last, 0);
+  }
+
+  return SDValue();
+}
+
 /// Given an ISD::UDIV node expressing a divide by constant, return a DAG
 /// expression that will generate the same value by multiplying by a magic
 /// number.
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3596,6 +3596,95 @@
   }
 }
 
+/// Given an ISD::UREM or ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::BuildREMEqFold(SDNode *REMNode, SDNode *CondNode,
+                                       const APInt &D, SelectionDAG &DAG,
+                                       std::vector<SDNode *> &Created) const {
+  // fold (seteq/ne (urem N, D), 0) -> (setule/setugt (rotr (mul N, P), K), Q)
+  // fold (seteq/ne (srem N, D), 0)
+  //   -> (setule/setugt (rotr (add (mul N, P), Q''), K), (srl (shl Q'', 1), K))
+  // - D must be constant with D = D0 * 2^K where D0 is odd
+  // - P is the multiplicative inverse of D0 modulo 2^W
+  // - Q = floor((2^W - 1) / D)
+  // - Q' = floor((2^(W-1) - 1) / D0)
+  // - Q'' = Q' & -2^K
+  // where W is the width of the common type of N and D
+  EVT REMVT = REMNode->getValueType(0);
+  EVT CondVT = CondNode->getValueType(0);
+  SDLoc DL(REMNode);
+  bool IsEq =
+      cast<CondCodeSDNode>(CondNode->getOperand(2))->get() == ISD::SETEQ;
+  bool IsSigned = (REMNode->getOpcode() == ISD::SREM);
+
+  if (!isTypeLegal(REMVT))
+    return SDValue();
+
+  // Rewrite D = D0 * 2^K
+  unsigned K = D.countTrailingZeros();
+  APInt D0 = D.lshr(K);
+
+  // The signed fold below needs a positive divisor that is not a power of
+  // two (D0 != 1); in particular, this rules out INT_MIN.
+  if (IsSigned && (D.isNegative() || D0.isOneValue()))
+    return SDValue();
+
+  // Calculate the multiplicative inverse P of D0 modulo 2^W using
+  // Newton's method.
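+  // Each iteration doubles the number of correct low bits: if D0 * P == 1
+  // (mod 2^N) then D0 * (P * (2 - D0 * P)) == 1 (mod 2^2N), and P = D0 is
+  // already correct modulo 8 because D0 is odd, so a few steps suffice.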
+  APInt Tmp, P = D0;
+  while ((Tmp = D0 * P) != 1)
+    P *= APInt(D0.getBitWidth(), 2) - Tmp;
+
+  // UREM: Q = floor((2^W - 1) / D)
+  // SREM: Q'' = floor((2^(W-1) - 1) / D0) & -2^K
+  APInt Q = APInt::getAllOnesValue(D.getBitWidth());
+  if (IsSigned) {
+    // Q' = floor((2^(W-1) - 1) / D0)
+    Q.clearSignBit();
+    Q = Q.udiv(D0);
+    // Q'' = Q' & -2^K
+    APInt Neg2K = APInt(D.getBitWidth(), 0);
+    Neg2K.setHighBits(D.getBitWidth() - K);
+    Q &= Neg2K;
+  } else {
+    Q = Q.udiv(D);
+  }
+
+  SDValue PVal = DAG.getConstant(P, DL, REMVT);
+  SDValue QVal = DAG.getConstant(Q, DL, REMVT);
+  // (mul N, P)
+  SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal);
+  Created.push_back(Op1.getNode());
+  // Will change in the signed case
+  SDValue Op2 = QVal;
+
+  if (IsSigned) {
+    // (add (mul N, P), Q'')
+    Op1 = DAG.getNode(ISD::ADD, DL, REMVT, Op1, QVal);
+    Created.push_back(Op1.getNode());
+  }
+
+  // Rotate right only if D was even and thus shifted (K > 0)
+  if (K) {
+    SDValue ShAmt =
+        DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout()));
+    // UREM: (rotr (mul N, P), K)
+    // SREM: (rotr (add (mul N, P), Q''), K)
+    Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt);
+    Created.push_back(Op1.getNode());
+  }
+
+  // SREM bound: floor(2 * Q'' / 2^K), i.e. (shl Q'', 1) for K == 0 and
+  // (srl Q'', K - 1) otherwise. Don't insert a useless node if K == 1,
+  // where the bound is Q'' itself.
+  if (IsSigned && K != 1) {
+    if (K != 0)
+      Op2 = DAG.getConstant(Q.lshr(K - 1), DL, REMVT);
+    else
+      Op2 = DAG.getConstant(Q.shl(1), DL, REMVT);
+  }
+
+  // UREM: (setule/setugt (rotr (mul N, P), K), Q)
+  // SREM: (setule/setugt (rotr (add (mul N, P), Q''), K), (srl (shl Q'', 1), K))
+  Op1 = DAG.getSetCC(DL, CondVT, Op1, Op2, (IsEq ? ISD::SETULE : ISD::SETUGT));
+  return Op1;
+}
+
 bool TargetLowering::
 verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
   if (!isa<ConstantSDNode>(Op.getOperand(0))) {
Index: test/CodeGen/Hexagon/swp-const-tc2.ll
===================================================================
--- test/CodeGen/Hexagon/swp-const-tc2.ll
+++ test/CodeGen/Hexagon/swp-const-tc2.ll
@@ -6,8 +6,8 @@
 ; In the bug, the final CFG contains two iterations of the loop.
 
 ; CHECK-NOT: loop0
-; CHECK: = mpy
-; CHECK-NOT: = mpy
+; CHECK: = memw
+; CHECK-NOT: = memw
 
 define void @f0() {
 b0:
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -236,13 +236,11 @@
 ; CHECK-NEXT:    jne .LBB12_8
 ; CHECK-NEXT:  # %bb.4: # %if.end29
 ; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT:    shrl $19, %ecx
-; CHECK-NEXT:    addl %ecx, %ecx
-; CHECK-NEXT:    leal (%ecx,%ecx,4), %ecx
-; CHECK-NEXT:    cmpw %cx, %ax
-; CHECK-NEXT:    jne .LBB12_5
+; CHECK-NEXT:    imull $-13107, %eax, %eax # imm = 0xCCCD
+; CHECK-NEXT:    rorw $1, %ax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    cmpl $6554, %eax
+; CHECK-NEXT:    jae .LBB12_5
 ; CHECK-NEXT:  .LBB12_8: # %if.then44
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
Index: test/CodeGen/X86/rem.ll
===================================================================
--- test/CodeGen/X86/rem.ll
+++ test/CodeGen/X86/rem.ll
@@ -78,3 +78,87 @@
   ret i32 %0
 }
 
+define i32 @test6(i32 %X) nounwind readnone {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl $858993460, %eax
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    jb [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movl $-10, %eax
+; CHECK-NEXT:  [[LOC]]: # %entry
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i16 @test7(i16 %X) nounwind readnone {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $28087, {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    rorw $1, %ax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    cmpl $4681, %eax
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    ja [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movl $65526, %eax
+; CHECK-NEXT:  [[LOC]]: # %entry
+; CHECK-NEXT:    # kill: {{.*}}
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = select i1 %cmp, i16 42, i16 -10
+  ret i16 %ret
+}
+
+define i32 @test8(i32 %X) nounwind readnone {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-1108378657, {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl $69273666, %eax
+; CHECK-NEXT:    cmpl $138547333, %eax
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    jb [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movl $-10, %eax
+; CHECK-NEXT:  [[LOC]]: # %entry
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i32 %X, 31
+  %cmp = icmp eq i32 %0, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i8 @test9(i8 %X) nounwind readnone {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    movb $-85, %cl
+; CHECK-NEXT:    mulb %cl
+; CHECK-NEXT:    addb $40, %al
+; CHECK-NEXT:    rorb $2, %al
+; CHECK-NEXT:    cmpb $20, %al
+; CHECK-NEXT:    movb $42, %al
+; CHECK-NEXT:    ja [[LOC:\.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movb $-10, %al
+; CHECK-NEXT:  [[LOC]]: # %entry
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i8 %X, 12
+  %cmp = icmp ne i8 %0, 0
+  %ret = select i1 %cmp, i8 42, i8 -10
+  ret i8 %ret
+}
+
Index: test/CodeGen/X86/vselect-avx.ll
===================================================================
--- test/CodeGen/X86/vselect-avx.ll
+++ test/CodeGen/X86/vselect-avx.ll
@@ -85,17 +85,9 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
 ; AVX1-LABEL: test3:
 ; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
-; AVX1-NEXT:    vpmuldq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmuldq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpsrld $31, %xmm3, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpmulld LCPI{{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd LCPI{{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpminud LCPI{{.*}}(%rip), %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
@@ -106,19 +98,12 @@
 ;
 ; AVX2-LABEL: test3:
 ; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpmuldq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT:    vpmuldq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT:    vpsrld $31, %xmm3, %xmm4
-; AVX2-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]
-; AVX2-NEXT:    vpmulld %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764]
+; AVX2-NEXT:    vpminud %xmm3, %xmm0, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
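
Reviewer note, not part of the patch: the P/Q/Q'' constants above can be
cross-checked by brute force. The sketch below exhaustively verifies both
folds at W = 16 for the divisors exercised by the new tests (5, 14, 31, 12),
using the same formulas as TargetLowering::BuildREMEqFold. The file name and
the GCC/Clang __builtin_ctz builtin are illustrative assumptions.

// remfold-check.cpp: standalone sanity check, not part of the patch.
// Build: c++ -std=c++11 -O2 remfold-check.cpp && ./a.out
#include <cstdint>
#include <cstdio>

// Rotate a 16-bit value right by K (K < 16).
static uint16_t rotr16(uint16_t X, unsigned K) {
  return K ? (uint16_t)((X >> K) | (X << (16 - K))) : X;
}

int main() {
  unsigned Failures = 0;
  for (uint32_t D : {5u, 14u, 31u, 12u}) {
    // D = D0 * 2^K with D0 odd.
    unsigned K = (unsigned)__builtin_ctz(D);
    uint16_t D0 = (uint16_t)(D >> K);
    // P = multiplicative inverse of D0 modulo 2^16, by Newton's method.
    uint16_t P = D0;
    while ((uint16_t)((uint32_t)D0 * P) != 1)
      P = (uint16_t)(P * (2u - (uint32_t)D0 * P));
    uint16_t Q = (uint16_t)(0xFFFFu / D);                        // UREM bound
    uint16_t QQ = (uint16_t)((0x7FFFu / D0) & (0u - (1u << K))); // Q''
    uint16_t SQ = (uint16_t)((2u * QQ) >> K);                    // SREM bound
    for (uint32_t I = 0; I <= 0xFFFFu; ++I) {
      uint16_t N = (uint16_t)I;
      // UREM: N % D == 0  <=>  rotr(N * P, K) <= Q
      bool UFold = rotr16((uint16_t)((uint32_t)N * P), K) <= Q;
      Failures += UFold != (N % D == 0);
      // SREM: N % D == 0 (signed)  <=>  rotr(N * P + Q'', K) <= (2 * Q'') >> K
      int32_t SN = (int16_t)N;
      bool SFold = rotr16((uint16_t)((uint32_t)N * P + QQ), K) <= SQ;
      Failures += SFold != (SN % (int32_t)D == 0);
    }
  }
  printf("%u failures\n", Failures);
  return Failures != 0;
}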