Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26117,30 +26117,35 @@
     }
     return;
   }
+  case X86ISD::VPMADDWD:
   case X86ISD::ADDUS:
   case X86ISD::SUBUS:
   case X86ISD::AVG: {
-    // Legalize types for X86ISD::AVG/ADDUS/SUBUS by widening.
+    // Legalize types for X86ISD::AVG/ADDUS/SUBUS/VPMADDWD by widening.
     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
 
-    auto InVT = N->getValueType(0);
-    assert(InVT.getSizeInBits() < 128);
-    assert(128 % InVT.getSizeInBits() == 0);
+    EVT VT = N->getValueType(0);
+    EVT InVT = N->getOperand(0).getValueType();
+    assert(VT.getSizeInBits() < 128);
+    assert(128 % VT.getSizeInBits() == 0);
     unsigned NumConcat = 128 / InVT.getSizeInBits();
 
-    EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
-                                 InVT.getVectorElementType(),
-                                 NumConcat * InVT.getVectorNumElements());
+    EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
+                                    InVT.getVectorElementType(),
+                                    NumConcat * InVT.getVectorNumElements());
+    EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
+                                  VT.getVectorElementType(),
+                                  NumConcat * VT.getVectorNumElements());
 
     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
     Ops[0] = N->getOperand(0);
-    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
     Ops[0] = N->getOperand(1);
-    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
 
-    SDValue Res = DAG.getNode(N->getOpcode(), dl, RegVT, InVec0, InVec1);
-    if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
-      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+    SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
+    if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
+      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                         DAG.getIntPtrConstant(0, dl));
     Results.push_back(Res);
     return;
@@ -34399,7 +34404,8 @@
   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
 
-  if (NumElts >= OpsVT.getVectorNumElements()) {
+  if (ExperimentalVectorWideningLegalization ||
+      NumElts >= OpsVT.getVectorNumElements()) {
     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
     // lower part is needed.
     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
@@ -34588,8 +34594,10 @@
     return SDValue();
 
   // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
+  // Also allow v2i32 if it will be widened.
   MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+  if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
+        DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
Index: test/CodeGen/X86/shrink_vmul-widen.ll
===================================================================
--- test/CodeGen/X86/shrink_vmul-widen.ll
+++ test/CodeGen/X86/shrink_vmul-widen.ll
@@ -24,15 +24,16 @@
 ; X86-SSE-NEXT: movl c, %esi
 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
 ; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT: movd %eax, %xmm1
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: movd %eax, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2
+; X86-SSE-NEXT: movd %xmm2, (%esi,%ecx,4)
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
@@ -53,7 +54,7 @@
 ; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax
 ; X86-AVX-NEXT: vmovd %eax, %xmm1
 ; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4)
 ; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
@@ -65,14 +66,15 @@
 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
 ; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm1
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: movd %ecx, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2
+; X64-SSE-NEXT: movq %xmm2, (%rax,%rdx,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi8:
@@ -86,7 +88,7 @@
 ; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi
 ; X64-AVX-NEXT: vmovd %esi, %xmm1
 ; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -485,8 +487,8 @@
 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
@@ -1026,9 +1028,9 @@
 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4)
+; X86-SSE-NEXT: movd %xmm1, 4(%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
 ;
@@ -1124,8 +1126,8 @@
 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
@@ -1468,8 +1470,8 @@
 ; X86-SSE-NEXT: movd %ecx, %xmm0
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
@@ -1487,7 +1489,7 @@
 ; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4)
 ; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: popl %esi
@@ -1501,8 +1503,8 @@
 ; X64-SSE-NEXT: movd %ecx, %xmm0
 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
@@ -1513,7 +1515,7 @@
 ; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx
 ; X64-AVX-NEXT: vmovd %edx, %xmm0
 ; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1622,11 +1624,8 @@
 ; X86-SSE-NEXT: movd %ecx, %xmm0
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
@@ -1644,7 +1643,7 @@
 ; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT: vmovd %ecx, %xmm0
 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4)
 ; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: popl %esi
@@ -1658,11 +1657,8 @@
 ; X64-SSE-NEXT: movd %ecx, %xmm0
 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
@@ -1673,7 +1669,7 @@
 ; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx
 ; X64-AVX-NEXT: vmovd %edx, %xmm0
 ; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1709,9 +1705,9 @@
 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
+; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst4:
@@ -1791,9 +1787,9 @@
 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
+; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst5:
@@ -1873,9 +1869,9 @@
 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
+; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst6:
@@ -1952,9 +1948,9 @@
 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
+; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst1:
@@ -2019,9 +2015,9 @@
 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4)
+; X86-SSE-NEXT: movd %xmm1, 4(%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst2:
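
For reference only (not part of the patch): a minimal sketch of the kind of IR the mul_2xi8 cases above exercise, two i8 elements zero-extended to i32 and multiplied as <2 x i32>. When the ExperimentalVectorWideningLegalization path touched above is taken, this v2i32 multiply can be selected as pmaddwd, which is what the updated CHECK lines verify. The function name, signature, and value names below are illustrative assumptions, not copied from the test file.

; Illustrative sketch only; names and signature are assumptions.
define void @mul_2xi8_sketch(<2 x i8>* %a, <2 x i8>* %b, <2 x i32>* %dst) {
entry:
  %la = load <2 x i8>, <2 x i8>* %a, align 1
  %lb = load <2 x i8>, <2 x i8>* %b, align 1
  %za = zext <2 x i8> %la to <2 x i32>
  %zb = zext <2 x i8> %lb to <2 x i32>
  %mul = mul nuw nsw <2 x i32> %za, %zb
  store <2 x i32> %mul, <2 x i32>* %dst, align 4
  ret void
}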