Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -26184,6 +26184,25 @@ Results.push_back(Res); return; } + case X86ISD::PMULDQ: + case X86ISD::PMULUDQ: { + // Legalize by splitting. + EVT VT = N->getValueType(0); + assert(TLI.getTypeAction(VT) == TypeSplitVector && + "Unexpected type action!"); + SDValue Op0Lo, Op0Hi; + SDValue Op1Lo, Op1Hi; + std::tie(Op0Lo, Op0Hi) = DAG.SplitVectorOperand(N, 0); + std::tie(Op1Lo, Op1Hi) = DAG.SplitVectorOperand(N, 1); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + SDLoc dl(N); + SDValue Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Op0Lo, Op1Lo); + SDValue Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Op0Hi, Op1Hi); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } case ISD::SETCC: { // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when // setCC result type is v2i1 because type legalzation will end up with @@ -35032,25 +35051,13 @@ // MULDQ returns the 64-bit result of the signed multiplication of the lower // 32-bits. We can lower with this if the sign bits stretch that far. if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 && - DAG.ComputeNumSignBits(N1) > 32) { - auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops); - }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, - PMULDQBuilder, /*CheckBWI*/false); - } + DAG.ComputeNumSignBits(N1) > 32) + return DAG.getNode(X86ISD::PMULDQ, SDLoc(N), VT, { N0, N1 }); // If the upper bits are zero we can use a single pmuludq. APInt Mask = APInt::getHighBitsSet(64, 32); - if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) { - auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops); - }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, - PMULUDQBuilder, /*CheckBWI*/false); - } + if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) + return DAG.getNode(X86ISD::PMULUDQ, SDLoc(N), VT, { N0, N1 }); return SDValue(); } @@ -41165,7 +41172,8 @@ // Simplify PMULDQ and PMULUDQ operations. static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -41192,6 +41200,11 @@ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI)) return SDValue(N, 0); + // If this is a 256-bit vector, but we don't have AVX2, we need to split. 
+ if (!DCI.isBeforeLegalizeOps() && N->getValueType(0) == MVT::v4i64 && + !Subtarget.hasAVX2()) + return split256IntArith(SDValue(N, 0), DAG); + return SDValue(); } @@ -41322,7 +41335,7 @@ case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: - case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI); + case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); } return SDValue(); Index: test/CodeGen/X86/combine-pmuldq.ll =================================================================== --- test/CodeGen/X86/combine-pmuldq.ll +++ test/CodeGen/X86/combine-pmuldq.ll @@ -91,18 +91,15 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { ; SSE-LABEL: combine_zext_pmuludq_256: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [715827883,715827883] -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [715827883,715827883] +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: pmuludq %xmm4, %xmm1 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX2-LABEL: combine_zext_pmuludq_256: Index: test/CodeGen/X86/mulvi32.ll =================================================================== --- test/CodeGen/X86/mulvi32.ll +++ test/CodeGen/X86/mulvi32.ll @@ -131,35 +131,35 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSE42-NEXT: pmuludq %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmuludq %xmm4, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: _mul4xi32toi64a: Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1016,30 +1016,24 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-LABEL: mul_v4i64_zero_upper: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v4i64_zero_upper: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pmuludq %xmm2, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pmuludq %xmm3, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: mul_v4i64_zero_upper: @@ -1171,48 +1165,37 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_zero_upper: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm7, %xmm4 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] 
-; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm5 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] -; SSE2-NEXT: movaps %xmm4, %xmm0 -; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm7, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pmuludq %xmm4, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero -; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE41-NEXT: pmuludq %xmm5, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: pmuludq %xmm7, %xmm1 +; SSE41-NEXT: pmuludq %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE41-NEXT: pmuludq %xmm7, %xmm2 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1326,15 +1309,13 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] ; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; SSE41-NEXT: pmuldq %xmm4, %xmm3 -; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; SSE41-NEXT: pmuldq %xmm5, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; SSE41-NEXT: pmuldq %xmm6, %xmm4 -; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq Index: test/CodeGen/X86/vector-mul.ll =================================================================== --- 
test/CodeGen/X86/vector-mul.ll +++ test/CodeGen/X86/vector-mul.ll @@ -1270,9 +1270,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; X86-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X86-NEXT: pmuludq %xmm2, %xmm1 -; X86-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-NEXT: movdqa (%eax), %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: retl ; @@ -1280,9 +1281,10 @@ ; X64: # %bb.0: ; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X64-NEXT: pmuludq %xmm2, %xmm1 -; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X64-NEXT: movdqa (%rsi), %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; X64-NEXT: pmuludq %xmm3, %xmm1 +; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; X64-NEXT: pmuludq %xmm2, %xmm0 ; X64-NEXT: retq ; @@ -1291,12 +1293,13 @@ ; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero ; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; X64-XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X64-XOP-NEXT: vmovdqa (%rsi), %xmm1 +; X64-XOP-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; X64-XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; X64-XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-XOP-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X64-XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; X64-XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; X64-XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v4i64_zext_cross_bb: Index: test/CodeGen/X86/xop-ifma.ll =================================================================== --- test/CodeGen/X86/xop-ifma.ll +++ test/CodeGen/X86/xop-ifma.ll @@ -67,12 +67,10 @@ define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i64> %a2) { ; XOP-AVX1-LABEL: test_mulx_v4i32_add_v4i64: ; XOP-AVX1: # %bb.0: -; XOP-AVX1-NEXT: vpmovsxdq %xmm0, %xmm3 -; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; XOP-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; XOP-AVX1-NEXT: vpmovsxdq %xmm1, %xmm4 -; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; XOP-AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; XOP-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; XOP-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; XOP-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; XOP-AVX1-NEXT: vpmacsdql %xmm5, %xmm1, %xmm0, %xmm0 ; XOP-AVX1-NEXT: vpmacsdql %xmm2, %xmm4, %xmm3, %xmm1
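
Note (reviewer sketch, not part of the patch): the new ReplaceNodeResults case and the combinePMULDQ change handle 256-bit PMULDQ/PMULUDQ nodes on targets without AVX2 by splitting them into two 128-bit halves and concatenating the results. A minimal IR example of the kind of input that exercises this path, closely mirroring the existing _mul4xi32toi64a test; the function name below is illustrative only.

  ; Both operands have zero upper 32 bits after the zext, so the multiply is
  ; recognized as X86ISD::PMULUDQ; without AVX2 the v4i64 node is split into
  ; two 128-bit pmuludq operations.
  define <4 x i64> @pmuludq_v4i64_sketch(<4 x i32> %a, <4 x i32> %b) {
    %za = zext <4 x i32> %a to <4 x i64>
    %zb = zext <4 x i32> %b to <4 x i64>
    %r = mul <4 x i64> %za, %zb
    ret <4 x i64> %r
  }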