diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44460,26 +44460,41 @@ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); - // Make sure the type is legal or will be widened to a legal type. - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + // Make sure the type is legal or can split/widen to a legal type. + // With AVX512 but without BWI, we would need to split v32i16. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); - MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); + EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts); - // Without BWI, we would need to split v32i16. - if (WVT == MVT::v32i16 && !Subtarget.hasBWI()) + // With AVX512 but without BWI, we would need to split v32i16. + if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - // If we are zero extending two steps without SSE4.1, its better to reduce + // If we are zero/sign extending two steps without SSE4.1, its better to + // reduce the vmul width instead. + if (!Subtarget.hasSSE41() && + (((N0.getOpcode() == ISD::ZERO_EXTEND && + N0.getOperand(0).getScalarValueSizeInBits() <= 8) && + (N1.getOpcode() == ISD::ZERO_EXTEND && + N1.getOperand(0).getScalarValueSizeInBits() <= 8)) || + ((N0.getOpcode() == ISD::SIGN_EXTEND && + N0.getOperand(0).getScalarValueSizeInBits() <= 8) && + (N1.getOpcode() == ISD::SIGN_EXTEND && + N1.getOperand(0).getScalarValueSizeInBits() <= 8)))) + return SDValue(); + + // If we are sign extending a wide vector without SSE4.1, its better to reduce // the vmul width instead. 
if (!Subtarget.hasSSE41() && - (N0.getOpcode() == ISD::ZERO_EXTEND && - N0.getOperand(0).getScalarValueSizeInBits() <= 8) && - (N1.getOpcode() == ISD::ZERO_EXTEND && - N1.getOperand(0).getScalarValueSizeInBits() <= 8)) + (N0.getOpcode() == ISD::SIGN_EXTEND && + N0.getOperand(0).getValueSizeInBits() > 128) && + (N1.getOpcode() == ISD::SIGN_EXTEND && + N1.getOperand(0).getValueSizeInBits() > 128)) return SDValue(); // Sign bits must extend through the upper 17 bits. @@ -44496,12 +44511,18 @@ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) return DAG.getNode(ISD::AND, SDLoc(N), VT, Op, DAG.getConstant(0xFFFF, SDLoc(N), VT)); - // Convert sext(vXi16) to zext(vXi16). - if (Op.getOpcode() == ISD::SIGN_EXTEND && VT.getSizeInBits() <= 128 && - N->isOnlyUserOf(Op.getNode())) { + if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) { SDValue Src = Op.getOperand(0); - if (Src.getScalarValueSizeInBits() == 16) + // Convert sext(vXi16) to zext(vXi16). + if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src); + // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets + // which will expand the extension. + if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) { + EVT ExtVT = VT.changeVectorElementType(MVT::i16); + Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src); + } } // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG. 
if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -296,29 +296,27 @@ ; SSE-LABEL: pmaddubsw_bad_extend: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psllw $8, %xmm3 -; SSE-NEXT: psraw $8, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pmulhw %xmm2, %xmm4 -; SSE-NEXT: pmullw %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pmulhw %xmm0, %xmm4 -; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2],zero,xmm4[4],zero,xmm4[6],zero,xmm4[u,u,u,u,u,u,u,u] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrlw $8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pmaddwd %xmm4, %xmm0 +; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8],zero,xmm2[10],zero,xmm2[12],zero,xmm2[14],zero,xmm2[u,u,u,u,u,u,u,u] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[9],zero,xmm1[11],zero,xmm1[13],zero,xmm1[15],zero,xmm1[u,u,u,u,u,u,u,u] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: pmaddwd %xmm2, %xmm6 +; SSE-NEXT: packssdw %xmm6, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_bad_extend: @@ -395,30 +393,22 @@ ; SSE-LABEL: pmaddubsw_bad_indices: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] -; SSE-NEXT: psraw $8, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pmulhw %xmm2, %xmm4 -; SSE-NEXT: pmullw %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] +; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] ; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pmulhw %xmm0, %xmm4 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: paddd 
%xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pmaddwd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_bad_indices: diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -326,32 +326,27 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 -; SSE2-NEXT: pmulhw %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: pmulhw %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: packssdw %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: packssdw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pmaddwd %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm8 +; SSE2-NEXT: pmaddwd %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pmaddwd %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pmaddwd %xmm6, %xmm0 +; SSE2-NEXT: psrld $16, %xmm7 +; SSE2-NEXT: psrld $16, %xmm8 +; SSE2-NEXT: packssdw %xmm7, %xmm8 +; SSE2-NEXT: psrld $16, %xmm5 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: @@ -382,12 +377,12 @@ ; AVX2-LABEL: and_mulhuw_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll 
b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -985,16 +985,16 @@ ; X86-SSE-NEXT: movl c, %ecx ; X86-SSE-NEXT: movzwl (%esi,%eax), %esi ; X86-SSE-NEXT: movd %esi, %xmm0 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: movzwl (%edx,%eax), %edx ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1 -; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4) +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1021,16 +1021,16 @@ ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm1 ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1 -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: pmaddwd %xmm1, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_sext_zext: diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -113,50 +113,42 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SLM32-LABEL: test_mul_v8i32_v8i8: ; SLM32: # %bb.0: -; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM32-NEXT: movdqa %xmm1, %xmm2 -; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhw %xmm0, %xmm2 -; SLM32-NEXT: movdqa %xmm1, %xmm0 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM32-NEXT: pmaddwd %xmm2, %xmm0 +; SLM32-NEXT: pmaddwd %xmm2, %xmm1 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v8i32_v8i8: ; SLM64: # %bb.0: -; 
SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM64-NEXT: movdqa %xmm1, %xmm2 -; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhw %xmm0, %xmm2 -; SLM64-NEXT: movdqa %xmm1, %xmm0 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM64-NEXT: pmaddwd %xmm2, %xmm0 +; SLM64-NEXT: pmaddwd %xmm2, %xmm1 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v8i32_v8i8: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW32-NEXT: movdqa %xmm1, %xmm2 -; SLOW32-NEXT: pmulhw %xmm0, %xmm2 -; SLOW32-NEXT: pmullw %xmm0, %xmm1 -; SLOW32-NEXT: movdqa %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 +; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v8i32_v8i8: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW64-NEXT: movdqa %xmm1, %xmm2 -; SLOW64-NEXT: pmulhw %xmm0, %xmm2 -; SLOW64-NEXT: pmullw %xmm0, %xmm1 -; SLOW64-NEXT: movdqa %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 +; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v8i32_v8i8: @@ -164,7 +156,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 ; 
SSE4-32-NEXT: retl @@ -174,7 +166,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-64-NEXT: retq @@ -248,86 +240,66 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM32-LABEL: test_mul_v16i32_v16i8: ; SLM32: # %bb.0: -; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: movdqa %xmm0, %xmm3 -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM32-NEXT: pxor %xmm4, %xmm4 -; SLM32-NEXT: movdqa %xmm1, %xmm2 -; SLM32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLM32-NEXT: movdqa %xmm3, %xmm4 -; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhw %xmm0, %xmm2 -; SLM32-NEXT: pmullw %xmm0, %xmm3 -; SLM32-NEXT: pmulhw %xmm0, %xmm4 -; SLM32-NEXT: movdqa %xmm1, %xmm0 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLM32-NEXT: movdqa %xmm3, %xmm2 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLM32-NEXT: movdqa {{.*#+}} xmm5 = 
[18778,0,18778,0,18778,0,18778,0] +; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SLM32-NEXT: pmaddwd %xmm5, %xmm0 +; SLM32-NEXT: pmaddwd %xmm5, %xmm1 +; SLM32-NEXT: pmaddwd %xmm5, %xmm2 +; SLM32-NEXT: pmaddwd %xmm5, %xmm3 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v16i32_v16i8: ; SLM64: # %bb.0: -; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: movdqa %xmm0, %xmm3 -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM64-NEXT: pxor %xmm4, %xmm4 -; SLM64-NEXT: movdqa %xmm1, %xmm2 -; SLM64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLM64-NEXT: movdqa %xmm3, %xmm4 -; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhw %xmm0, %xmm2 -; SLM64-NEXT: pmullw %xmm0, %xmm3 -; SLM64-NEXT: pmulhw %xmm0, %xmm4 -; SLM64-NEXT: movdqa %xmm1, %xmm0 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLM64-NEXT: movdqa %xmm3, %xmm2 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SLM64-NEXT: punpcklwd {{.*#+}} 
xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] +; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SLM64-NEXT: pmaddwd %xmm5, %xmm0 +; SLM64-NEXT: pmaddwd %xmm5, %xmm1 +; SLM64-NEXT: pmaddwd %xmm5, %xmm2 +; SLM64-NEXT: pmaddwd %xmm5, %xmm3 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v16i32_v16i8: ; SLOW32: # %bb.0: -; SLOW32-NEXT: movdqa %xmm0, %xmm3 -; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW32-NEXT: movdqa %xmm1, %xmm4 -; SLOW32-NEXT: pmulhw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm1 -; SLOW32-NEXT: movdqa %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW32-NEXT: pxor %xmm4, %xmm4 -; SLOW32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLOW32-NEXT: movdqa %xmm3, %xmm4 -; SLOW32-NEXT: pmulhw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm3 
-; SLOW32-NEXT: movdqa %xmm3, %xmm2 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SLOW32-NEXT: pmaddwd %xmm4, %xmm0 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm1 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm2 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm3 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v16i32_v16i8: ; SLOW64: # %bb.0: -; SLOW64-NEXT: movdqa %xmm0, %xmm3 -; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW64-NEXT: movdqa %xmm1, %xmm4 -; SLOW64-NEXT: pmulhw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm1 -; SLOW64-NEXT: movdqa %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW64-NEXT: pxor %xmm4, %xmm4 -; SLOW64-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLOW64-NEXT: movdqa %xmm3, %xmm4 -; SLOW64-NEXT: pmulhw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm3 -; SLOW64-NEXT: movdqa %xmm3, %xmm2 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SLOW64-NEXT: pmaddwd %xmm4, %xmm0 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm1 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm2 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm3 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v16i32_v16i8: @@ -339,7 +311,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 ; 
SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 @@ -355,7 +327,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 @@ -364,18 +336,12 @@ ; ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-SLOW-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: ret{{[l|q]}} ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8: @@ -852,7 +818,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { ; SLM32-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -862,7 +828,7 @@ ; ; SLM64-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -875,7 +841,7 @@ ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW32-NEXT: retl @@ -885,7 +851,7 @@ ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW64-NEXT: retq @@ -895,7 +861,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-32-NEXT: retl @@ -905,7 +871,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] 
+; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-64-NEXT: retq @@ -980,7 +946,7 @@ ; SLM32-LABEL: test_mul_v16i32_v16i8_minsize: ; SLM32: # %bb.0: ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -996,7 +962,7 @@ ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize: ; SLM64: # %bb.0: ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1018,7 +984,7 @@ ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SLOW32-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm2 @@ -1034,7 +1000,7 @@ ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pmovzxbd {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SLOW64-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm2 @@ -1050,7 +1016,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 @@ -1066,7 +1032,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll --- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll @@ -11,38 +11,37 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = 
xmm1[1,3,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-NEXT: psrld $1, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; CHECK-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] ; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: pslld $15, %xmm2 -; CHECK-NEXT: psrad $16, %xmm2 -; CHECK-NEXT: pslld $15, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 -; CHECK-NEXT: packssdw %xmm2, %xmm4 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; CHECK-NEXT: pmullw %xmm4, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128] -; CHECK-NEXT: paddd %xmm2, %xmm1 -; CHECK-NEXT: paddd %xmm2, %xmm0 +; CHECK-NEXT: psrld $1, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; CHECK-NEXT: pand %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; CHECK-NEXT: movdqa %xmm0, %xmm5 +; CHECK-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; CHECK-NEXT: pmaddwd %xmm2, %xmm5 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-NEXT: pmaddwd %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128] +; CHECK-NEXT: paddd %xmm1, %xmm5 +; CHECK-NEXT: paddd %xmm1, %xmm0 ; CHECK-NEXT: psrld $8, %xmm0 -; CHECK-NEXT: psrld $8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; CHECK-NEXT: pand %xmm2, %xmm1 -; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: packuswb %xmm1, %xmm0 +; CHECK-NEXT: psrld $8, %xmm5 +; CHECK-NEXT: pand %xmm3, %xmm5 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: packuswb %xmm5, %xmm0 ; CHECK-NEXT: packuswb %xmm0, %xmm0 ; CHECK-NEXT: retq bb: