Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -4055,11 +4055,13 @@ DAGCombinerInfo &DCI, const SDLoc &DL) const; - SDValue prepareUREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, - DAGCombinerInfo &DCI, const SDLoc &DL, + SDValue prepareUREMEqFold(EVT VT, SDValue N0, SDValue CompTargetNode, + ISD::CondCode Cond, DAGCombinerInfo &DCI, + const SDLoc &DL, SmallVectorImpl &Created) const; - SDValue buildUREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, - DAGCombinerInfo &DCI, const SDLoc &DL) const; + SDValue buildUREMEqFold(EVT VT, SDValue REMNode, SDValue CompNodeTargetNode, + ISD::CondCode Cond, DAGCombinerInfo &DCI, + const SDLoc &DL) const; }; /// Given an LLVM IR type and return type attributes, compute the return value Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4463,12 +4463,13 @@ /// using only multiplications, additions and shifts/rotations. /// Ref: "Hacker's Delight" 10-17. 
SDValue TargetLowering::buildUREMEqFold(EVT VT, SDValue REMNode, - SDValue CompNode, ISD::CondCode Cond, + SDValue CompNodeTargetNode, + ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const { SmallVector Built; - if (SDValue Folded = - prepareUREMEqFold(VT, REMNode, CompNode, Cond, DCI, DL, Built)) { + if (SDValue Folded = prepareUREMEqFold(VT, REMNode, CompNodeTargetNode, Cond, + DCI, DL, Built)) { for (SDNode *N : Built) DCI.AddToWorklist(N); return Folded; @@ -4478,79 +4479,115 @@ } SDValue -TargetLowering::prepareUREMEqFold(EVT VT, SDValue REMNode, SDValue CompNode, - ISD::CondCode Cond, DAGCombinerInfo &DCI, - const SDLoc &DL, +TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, + SDValue CompTargetNode, ISD::CondCode Cond, + DAGCombinerInfo &DCI, const SDLoc &DL, SmallVectorImpl &Created) const { // fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q) - // - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1 + // - D must be constant, bigger than 1, with D = D0 * 2^K where D0 is odd // - P is the multiplicative inverse of D0 modulo 2^W // - Q = floor((2^W - 1) / D0) // where W is the width of the common type of N and D. assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Only applicable for (in)equality comparisons."); - EVT REMVT = REMNode->getValueType(0); - - // If MUL is unavailable, we cannot proceed in any case. - if (!isOperationLegalOrCustom(ISD::MUL, REMVT)) - return SDValue(); - - // TODO: Add non-uniform constant support. - ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1)); - ConstantSDNode *CompTarget = isConstOrConstSplat(CompNode); - if (!Divisor || !CompTarget || Divisor->isNullValue() || - !CompTarget->isNullValue()) - return SDValue(); - - const APInt &D = Divisor->getAPIntValue(); - - // Decompose D into D0 * 2^K - unsigned K = D.countTrailingZeros(); - bool DivisorIsEven = (K != 0); - APInt D0 = D.lshr(K); - - // The fold is invalid when D0 == 1. 
- // This is reachable because visitSetCC happens before visitREM. - if (D0.isOneValue()) - return SDValue(); - - // P = inv(D0, 2^W) - // 2^W requires W + 1 bits, so we have to extend and then truncate. - unsigned W = D.getBitWidth(); - APInt P = D0.zext(W + 1) - .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) - .trunc(W); - assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable - assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); - - // Q = floor((2^W - 1) / D) - APInt Q = APInt::getAllOnesValue(W).udiv(D); - SelectionDAG &DAG = DCI.DAG; - SDValue PVal = DAG.getConstant(P, DL, REMVT); - SDValue QVal = DAG.getConstant(Q, DL, REMVT); - // (mul N, P) - SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal); - Created.push_back(Op1.getNode()); + EVT VT = REMNode->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); - // Rotate right only if D was even. - if (DivisorIsEven) { + // If MUL is unavailable, we cannot proceed in any case. + if (!isOperationLegalOrCustom(ISD::MUL, VT)) + return SDValue(); + + // TODO: Could support comparing with non-zero too. + ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode); + if (!CompTarget || !CompTarget->isNullValue()) + return SDValue(); + + bool HadEvenDivisor = false; + bool AllDivisorsArePowerOfTwo = true; + SmallVector PAmts, KAmts, QAmts; + + auto BuildUREMPattern = [&](ConstantSDNode *C) { + // This fold is only valid for D > 1. (power-of-two is ok.) + if (C->isNullValue() || C->isOne()) + return false; + + const APInt &D = C->getAPIntValue(); + + // Decompose D into D0 * 2^K + unsigned K = D.countTrailingZeros(); + APInt D0 = D.lshr(K); + + // D is even if it has trailing zeros. + HadEvenDivisor |= (K != 0); + // D is a power-of-two if D0 is one. + // If all divisors are power-of-two, we will prefer to avoid the fold. 
+ AllDivisorsArePowerOfTwo &= D0.isOneValue(); + + // P = inv(D0, 2^W) + // 2^W requires W + 1 bits, so we have to extend and then truncate. + unsigned W = D.getBitWidth(); + APInt P = D0.zext(W + 1) + .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) + .trunc(W); + assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable + assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check."); + + // Q = floor((2^W - 1) / D) + APInt Q = APInt::getAllOnesValue(W).udiv(D); + + PAmts.push_back(DAG.getConstant(P, DL, SVT)); + KAmts.push_back(DAG.getConstant(K, DL, ShSVT)); + QAmts.push_back(DAG.getConstant(Q, DL, SVT)); + return true; + }; + + SDValue N = REMNode->getOperand(0); + SDValue D = REMNode->getOperand(1); + + // Collect the values from each element. + if (!ISD::matchUnaryPredicate(D, BuildUREMPattern)) + return SDValue(); + + // If this is a urem by a power-of-two, avoid the fold since it can be + // best implemented as a bit test. + if (AllDivisorsArePowerOfTwo) + return SDValue(); + + SDValue PVal, KVal, QVal; + if (VT.isVector()) { + PVal = DAG.getBuildVector(VT, DL, PAmts); + KVal = DAG.getBuildVector(ShVT, DL, KAmts); + QVal = DAG.getBuildVector(VT, DL, QAmts); + } else { + PVal = PAmts[0]; + KVal = KAmts[0]; + QVal = QAmts[0]; + } + + // (mul N, P) + SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal); + Created.push_back(Op0.getNode()); + + // Rotate right only if any divisor was even. We avoid rotates for all-odd + // divisors as a performance improvement, since rotating by 0 is a no-op. + if (HadEvenDivisor) { // We need ROTR to do this. 
- if (!isOperationLegalOrCustom(ISD::ROTR, REMVT)) + if (!isOperationLegalOrCustom(ISD::ROTR, VT)) return SDValue(); - SDValue ShAmt = - DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout())); SDNodeFlags Flags; Flags.setExact(true); // UREM: (rotr (mul N, P), K) - Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt, Flags); - Created.push_back(Op1.getNode()); + Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags); + Created.push_back(Op0.getNode()); } // UREM: (setule/setugt (rotr (mul N, P), K), Q) - return DAG.getSetCC(DL, VT, Op1, QVal, + return DAG.getSetCC(DL, SETCCVT, Op0, QVal, ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT)); } Index: test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll =================================================================== --- test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -; At the moment, BuildUREMEqFold does not handle nonsplat vectors. 
- ; Odd+Even divisors define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_even: @@ -38,18 +36,11 @@ ; CHECK-LABEL: test_urem_odd_allones: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: adrp x9, .LCPI1_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: adrp x8, .LCPI1_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] -; CHECK-NEXT: adrp x8, .LCPI1_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret Index: test/CodeGen/X86/urem-seteq-vec-nonsplat.ll =================================================================== --- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -5,8 +5,6 @@ ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512VL -; At the moment, BuildUREMEqFold does not handle nonsplat vectors. 
- ; Odd+Even divisors define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even: @@ -115,18 +113,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -140,97 +129,33 @@ define <4 x i32> @test_urem_odd_allones(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] +; CHECK-SSE41-NEXT: 
pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_odd_allones: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_urem_odd_allones: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_urem_odd_allones: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq 
%xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_urem_odd_allones: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -328,17 +253,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; 
CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -456,18 +373,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -555,16 +463,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld 
{{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -661,17 +562,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -786,18 +679,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -902,17 +786,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,3435973837] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1027,18 +903,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,2147483649,2454267027] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; 
CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1147,17 +1014,9 @@ ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,2147483649,1374389535] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq Index: unittests/ADT/APIntTest.cpp 
=================================================================== --- unittests/ADT/APIntTest.cpp +++ unittests/ADT/APIntTest.cpp @@ -2514,10 +2514,13 @@ .multiplicativeInverse(APInt::getSignedMinValue(BitWidth + 1)) .trunc(BitWidth); APInt One = V * MulInv; - EXPECT_TRUE(MulInv.isNullValue() || One.isOneValue()) - << " bitwidth = " << BitWidth << ", value = " << Value - << ", computed multiplicative inverse = " << MulInv - << ", value * multiplicative inverse = " << One << " (should be 1)"; + if (!V.isNullValue() && V.countTrailingZeros() == 0) { + // Multiplicative inverse exists for all odd numbers. + EXPECT_TRUE(One.isOneValue()); + } else { + // Multiplicative inverse does not exist for even numbers (and 0). + EXPECT_TRUE(MulInv.isNullValue()); + } } } }