Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5517,19 +5517,39 @@
                                 SDValue &Overflow, SelectionDAG &DAG) const {
   SDLoc dl(Node);
   EVT VT = Node->getValueType(0);
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  bool isSigned = Node->getOpcode() == ISD::SMULO;
+
+  // For power-of-two multiplications we can use a simpler shift expansion.
+  if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
+    const APInt &C = RHSC->getAPIntValue();
+    if (C.isPowerOf2()) {
+      // smulo(x, signed_min) is the same as umulo(x, signed_min).
+      bool UseArithShift = isSigned && !C.isMinSignedValue();
+      EVT ShiftAmtTy = getShiftAmountTy(VT, DAG.getDataLayout());
+      SDValue ShiftAmt = DAG.getConstant(C.logBase2(), dl, ShiftAmtTy);
+      Result = DAG.getNode(ISD::SHL, dl, VT, LHS, ShiftAmt);
+
+      // Check whether ((X << Shift) >> Shift) == X.
+      SDValue ShiftedResult = DAG.getNode(
+          UseArithShift ? ISD::SRA : ISD::SRL, dl, VT, Result, ShiftAmt);
+      Overflow = DAG.getSetCC(dl, SetCCVT, ShiftedResult, LHS, ISD::SETNE);
+      return true;
+    }
+  }
+
   EVT WideVT =
       EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
   if (VT.isVector())
     WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
                               VT.getVectorNumElements());
-  SDValue LHS = Node->getOperand(0);
-  SDValue RHS = Node->getOperand(1);
   SDValue BottomHalf;
   SDValue TopHalf;
   static const unsigned Ops[2][3] =
       { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
        { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
-  bool isSigned = Node->getOpcode() == ISD::SMULO;
   if (isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
     BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
     TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
@@ -5616,7 +5636,6 @@
     }
   }
 
-  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   Result = BottomHalf;
   if (isSigned) {
     SDValue ShiftAmt = DAG.getConstant(
Index: llvm/test/CodeGen/X86/mulo-pow2.ll
===================================================================
--- llvm/test/CodeGen/X86/mulo-pow2.ll
+++ llvm/test/CodeGen/X86/mulo-pow2.ll
@@ -19,15 +19,6 @@
 define <4 x i32> @umul_v4i32_1(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: umul_v4i32_1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   %y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
@@ -54,14 +45,8 @@
 define <4 x i32> @umul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: umul_v4i32_8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8]
-; AVX-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm2
 ; AVX-NEXT:    vpslld $3, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
@@ -75,14 +60,8 @@
 define <4 x i32> @umul_v4i32_2pow31(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: umul_v4i32_2pow31:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm2
 ; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
@@ -108,15 +87,6 @@
 define <4 x i32> @smul_v4i32_1(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: smul_v4i32_1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX-NEXT:    vpmuldq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmuldq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT:    vpsrad $31, %xmm0, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   %y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
@@ -148,16 +118,10 @@
 define <4 x i32> @smul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: smul_v4i32_8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8]
-; AVX-NEXT:    vpmuldq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmuldq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT:    vpslld $3, %xmm0, %xmm0
-; AVX-NEXT:    vpsrad $31, %xmm0, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpslld $3, %xmm0, %xmm2
+; AVX-NEXT:    vpsrad $3, %xmm2, %xmm3
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm3, %xmm0
+; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
   %y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
@@ -169,16 +133,10 @@
 define <4 x i32> @smul_v4i32_2pow31(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX-LABEL: smul_v4i32_2pow31:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX-NEXT:    vpmuldq %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vpmuldq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vpsrad $31, %xmm0, %xmm3
-; AVX-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm2
+; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>)
   %y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
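
Note (reviewer illustration, not part of the patch): below is a minimal
standalone C++ sketch of the scalar logic the new expansion emits, assuming
32-bit lanes. The function names are hypothetical, and `Result >> S` on a
signed int is assumed to be an arithmetic shift (which ISD::SRA guarantees at
the DAG level, and which C++20 guarantees for this sketch).

#include <cstdint>
#include <cstdio>

// mulo(X, 1 << S) -> { X << S, ((X << S) >> S) != X }
// The shift back is ISD::SRL for umulo and ISD::SRA for smulo, except when
// the multiplier is signed-min (S == 31 for i32): smulo(x, signed_min) is
// the same as umulo(x, signed_min), so SRL must be used there too.
static bool umulo_pow2(uint32_t X, unsigned S, uint32_t &Result) {
  Result = X << S;
  return (Result >> S) != X; // logical shift right (ISD::SRL)
}

static bool smulo_pow2(int32_t X, unsigned S, int32_t &Result) {
  Result = (int32_t)((uint32_t)X << S); // shift as unsigned to avoid UB
  bool UseArithShift = S != 31;         // 1 << 31 is signed-min for i32
  int32_t Back = UseArithShift ? Result >> S                       // ISD::SRA
                               : (int32_t)((uint32_t)Result >> S); // ISD::SRL
  return Back != X;
}

int main() {
  uint32_t U;
  bool Ovf = umulo_pow2(3, 3, U);
  printf("umulo(3 * 8): ovf=%d res=%u\n", Ovf, U);                   // ovf=0
  int32_t T;
  printf("smulo((1<<28) * 8): ovf=%d\n", smulo_pow2(1 << 28, 3, T)); // ovf=1
  printf("smulo(1 * signed_min): ovf=%d\n", smulo_pow2(1, 31, T));   // ovf=0
}

The shift form trades the widened multiply (or MULHU/MULHS plus the
shuffle/blend sequence) for one shift each way and a compare, which is why
the vector tests above collapse to short vpslld/vpand/vpcmpeqd sequences.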