diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44358,14 +44358,16 @@ if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); - // Make sure the type is legal or will be widened to a legal type. - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + // Make sure the type is legal or can split/widen to a legal type. + // Bail on single-element vectors and non-power-of-2 element counts. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); - MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); + EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts); - // Without BWI, we would need to split v32i16. - if (WVT == MVT::v32i16 && !Subtarget.hasBWI()) + // With AVX512 but without BWI, we would need to split v32i16. + if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return SDValue(); SDValue N0 = N->getOperand(0); @@ -44394,12 +44396,18 @@ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) return DAG.getNode(ISD::AND, SDLoc(N), VT, Op, DAG.getConstant(0xFFFF, SDLoc(N), VT)); - // Convert sext(vXi16) to zext(vXi16). - if (Op.getOpcode() == ISD::SIGN_EXTEND && VT.getSizeInBits() <= 128 && - N->isOnlyUserOf(Op.getNode())) { + if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) { SDValue Src = Op.getOperand(0); - if (Src.getScalarValueSizeInBits() == 16) + // Convert sext(vXi16) to zext(vXi16). + if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128) + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src); + // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets + // which will expand the extension. 
+ if (Src.getScalarValueSizeInBits() <= 16 && !Subtarget.hasSSE41()) { + EVT ExtVT = VT.changeVectorElementType(MVT::i16); + Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src); + } } // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG. if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -576,28 +576,29 @@ ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB4_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX-LABEL: _Z9test_charPcS_i_128: @@ -1859,18 +1860,22 @@ ret <4 x i32> %ret } +; FIXME: SHUFFLE(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z),SHUFFLE(Y,W)) define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: larger_mul: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pmulhw %xmm2, %xmm1 -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: pmaddwd %xmm3, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pmaddwd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -296,29 +296,27 @@ ; SSE-LABEL: pmaddubsw_bad_extend: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 
= [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psllw $8, %xmm3 -; SSE-NEXT: psraw $8, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pmulhw %xmm2, %xmm4 -; SSE-NEXT: pmullw %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pmulhw %xmm0, %xmm4 -; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2],zero,xmm4[4],zero,xmm4[6],zero,xmm4[u,u,u,u,u,u,u,u] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrlw $8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pmaddwd %xmm4, %xmm0 +; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8],zero,xmm2[10],zero,xmm2[12],zero,xmm2[14],zero,xmm2[u,u,u,u,u,u,u,u] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[9],zero,xmm1[11],zero,xmm1[13],zero,xmm1[15],zero,xmm1[u,u,u,u,u,u,u,u] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: pmaddwd %xmm2, %xmm6 +; SSE-NEXT: packssdw %xmm6, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_bad_extend: @@ -395,30 +393,22 @@ ; SSE-LABEL: pmaddubsw_bad_indices: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] -; SSE-NEXT: psraw $8, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pmulhw %xmm2, %xmm4 -; SSE-NEXT: pmullw %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] +; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] ; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pmulhw %xmm0, %xmm4 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pmaddwd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_bad_indices: diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -329,28 +329,22 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] ; SSE2-NEXT: pand %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pmaddwd %xmm3, %xmm7 ; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 -; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pmaddwd %xmm2, %xmm6 ; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pmaddwd %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: pmulhw %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: packssdw %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: packssdw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pmaddwd %xmm4, %xmm0 +; SSE2-NEXT: psrld $16, %xmm7 +; SSE2-NEXT: psrld $16, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: psrld $16, %xmm5 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: retq ; @@ -381,12 +375,12 @@ ; AVX2-LABEL: and_mulhuw_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -896,12 +896,13 @@ ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm1 -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1 +; X86-SSE-NEXT: 
movq %xmm1, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -932,12 +933,13 @@ ; X64-SSE-NEXT: movd %ecx, %xmm1 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: psraw $8, %xmm1 -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1 +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_sext: @@ -985,16 +987,16 @@ ; X86-SSE-NEXT: movl c, %ecx ; X86-SSE-NEXT: movzwl (%esi,%eax), %esi ; X86-SSE-NEXT: movd %esi, %xmm0 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: movzwl (%edx,%eax), %edx ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1 -; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4) +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0 +; X86-SSE-NEXT: movq %xmm0, 
(%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1021,16 +1023,16 @@ ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm1 ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1 -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: pmaddwd %xmm1, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi8_sext_zext: @@ -1244,26 +1246,31 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movl c, %ecx -; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 -; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm5 +; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm1 ; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm2, %xmm4 -; X86-SSE-NEXT: pmullw %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm3, %xmm4 -; X86-SSE-NEXT: pmullw %xmm3, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm2 +; X86-SSE-NEXT: pxor %xmm4, %xmm4 +; X86-SSE-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: pmaddwd %xmm6, %xmm3 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: pmaddwd %xmm5, %xmm0 +; X86-SSE-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X86-SSE-NEXT: pmaddwd %xmm5, %xmm6 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: pmaddwd %xmm1, %xmm2 +; X86-SSE-NEXT: movdqu %xmm2, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm6, 48(%ecx,%eax,4) ; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 16(%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1319,22 +1326,27 @@ ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-SSE-NEXT: movdqa 
%xmm2, %xmm4 -; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 -; X64-SSE-NEXT: pmullw %xmm0, %xmm2 -; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; X64-SSE-NEXT: pmaddwd %xmm5, %xmm6 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X64-SSE-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 -; X64-SSE-NEXT: pmullw %xmm1, %xmm3 -; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; X64-SSE-NEXT: pmaddwd %xmm0, %xmm5 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: pmaddwd %xmm1, %xmm3 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm5, 48(%rax,%rdx,4) ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm6, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: 
mul_16xi16_sext: @@ -1468,8 +1480,8 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE-NEXT: psrad $24, %xmm0 +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -1492,8 +1504,8 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE-NEXT: psrad $24, %xmm0 +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq @@ -1666,8 +1678,8 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE-NEXT: psrad $24, %xmm0 +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -1690,8 +1702,8 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE-NEXT: psrad $24, %xmm0 +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq @@ -1732,8 
+1744,8 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE-NEXT: psrad $24, %xmm0 +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -1756,8 +1768,8 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE-NEXT: psrad $24, %xmm0 +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -113,50 +113,42 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SLM32-LABEL: test_mul_v8i32_v8i8: ; SLM32: # %bb.0: -; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM32-NEXT: movdqa %xmm1, %xmm2 -; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhw %xmm0, %xmm2 -; SLM32-NEXT: movdqa %xmm1, %xmm0 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLM32-NEXT: 
pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM32-NEXT: pmaddwd %xmm2, %xmm0 +; SLM32-NEXT: pmaddwd %xmm2, %xmm1 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v8i32_v8i8: ; SLM64: # %bb.0: -; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM64-NEXT: movdqa %xmm1, %xmm2 -; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhw %xmm0, %xmm2 -; SLM64-NEXT: movdqa %xmm1, %xmm0 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM64-NEXT: pmaddwd %xmm2, %xmm0 +; SLM64-NEXT: pmaddwd %xmm2, %xmm1 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v8i32_v8i8: ; SLOW32: # %bb.0: -; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW32-NEXT: movdqa %xmm1, %xmm2 -; SLOW32-NEXT: pmulhw %xmm0, %xmm2 -; SLOW32-NEXT: pmullw %xmm0, %xmm1 -; SLOW32-NEXT: movdqa %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 
-; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 +; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v8i32_v8i8: ; SLOW64: # %bb.0: -; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW64-NEXT: movdqa %xmm1, %xmm2 -; SLOW64-NEXT: pmulhw %xmm0, %xmm2 -; SLOW64-NEXT: pmullw %xmm0, %xmm1 -; SLOW64-NEXT: movdqa %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] +; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 +; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v8i32_v8i8: @@ -164,7 +156,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd 
{{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-32-NEXT: retl @@ -174,7 +166,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-64-NEXT: retq @@ -248,86 +240,66 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM32-LABEL: test_mul_v16i32_v16i8: ; SLM32: # %bb.0: -; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: movdqa %xmm0, %xmm3 -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM32-NEXT: pxor %xmm4, %xmm4 -; SLM32-NEXT: movdqa %xmm1, %xmm2 -; SLM32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLM32-NEXT: movdqa %xmm3, %xmm4 -; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhw %xmm0, %xmm2 -; SLM32-NEXT: pmullw %xmm0, %xmm3 -; SLM32-NEXT: pmulhw %xmm0, %xmm4 -; SLM32-NEXT: movdqa %xmm1, %xmm0 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; 
SLM32-NEXT: movdqa %xmm3, %xmm2 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] +; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SLM32-NEXT: pmaddwd %xmm5, %xmm0 +; SLM32-NEXT: pmaddwd %xmm5, %xmm1 +; SLM32-NEXT: pmaddwd %xmm5, %xmm2 +; SLM32-NEXT: pmaddwd %xmm5, %xmm3 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v16i32_v16i8: ; SLM64: # %bb.0: -; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: movdqa %xmm0, %xmm3 -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM64-NEXT: pxor %xmm4, %xmm4 -; SLM64-NEXT: movdqa %xmm1, %xmm2 -; SLM64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLM64-NEXT: movdqa %xmm3, %xmm4 -; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhw %xmm0, %xmm2 -; SLM64-NEXT: pmullw %xmm0, %xmm3 -; SLM64-NEXT: pmulhw %xmm0, %xmm4 -; SLM64-NEXT: movdqa %xmm1, %xmm0 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SLM64-NEXT: movdqa %xmm3, %xmm2 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] +; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SLM64-NEXT: pmaddwd %xmm5, %xmm0 +; SLM64-NEXT: pmaddwd %xmm5, %xmm1 +; SLM64-NEXT: pmaddwd %xmm5, %xmm2 +; SLM64-NEXT: pmaddwd %xmm5, %xmm3 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v16i32_v16i8: ; SLOW32: # %bb.0: -; SLOW32-NEXT: movdqa %xmm0, %xmm3 -; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW32-NEXT: movdqa %xmm1, %xmm4 -; SLOW32-NEXT: pmulhw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm1 -; SLOW32-NEXT: movdqa %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW32-NEXT: pxor %xmm4, %xmm4 -; SLOW32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLOW32-NEXT: movdqa %xmm3, %xmm4 -; SLOW32-NEXT: pmulhw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm3 -; SLOW32-NEXT: movdqa %xmm3, %xmm2 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SLOW32-NEXT: pmaddwd %xmm4, %xmm0 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm1 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm2 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm3 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v16i32_v16i8: ; SLOW64: # %bb.0: -; SLOW64-NEXT: movdqa %xmm0, %xmm3 -; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW64-NEXT: movdqa %xmm1, %xmm4 -; SLOW64-NEXT: pmulhw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, 
%xmm1 -; SLOW64-NEXT: movdqa %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW64-NEXT: pxor %xmm4, %xmm4 -; SLOW64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLOW64-NEXT: movdqa %xmm3, %xmm4 -; SLOW64-NEXT: pmulhw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm3 -; SLOW64-NEXT: movdqa %xmm3, %xmm2 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] +; SLOW64-NEXT: pmaddwd %xmm4, %xmm0 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm1 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm2 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm3 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v16i32_v16i8: @@ -339,7 +311,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 @@ -355,7 +327,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 @@ -364,18 +336,12 @@ ; ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-SLOW-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: 
vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: ret{{[l|q]}} ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8: @@ -852,7 +818,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize { ; SLM32-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -862,7 +828,7 @@ ; ; SLM64-LABEL: test_mul_v8i32_v8i8_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero @@ -875,7 +841,7 @@ ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW32-NEXT: retl @@ -885,7 +851,7 @@ ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW64-NEXT: retq @@ -895,7 +861,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-32-NEXT: retl @@ -905,7 +871,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-64-NEXT: retq @@ -980,7 +946,7 @@ ; SLM32-LABEL: test_mul_v16i32_v16i8_minsize: ; SLM32: # %bb.0: ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -996,7 +962,7 @@ ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize: ; SLM64: # %bb.0: ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0] ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1018,7 +984,7 @@ ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; 
SLOW32-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm2 @@ -1034,7 +1000,7 @@ ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SLOW64-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm2 @@ -1050,7 +1016,7 @@ ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 @@ -1066,7 +1032,7 @@ ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll 
--- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll @@ -11,38 +11,37 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-NEXT: psrld $1, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; CHECK-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] ; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: pslld $15, %xmm2 -; CHECK-NEXT: psrad $16, %xmm2 -; CHECK-NEXT: pslld $15, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 -; CHECK-NEXT: packssdw %xmm2, %xmm4 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; CHECK-NEXT: pmullw %xmm4, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128] -; CHECK-NEXT: paddd %xmm2, %xmm1 -; CHECK-NEXT: paddd %xmm2, %xmm0 +; CHECK-NEXT: psrld $1, %xmm2 +; 
CHECK-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; CHECK-NEXT: pand %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; CHECK-NEXT: movdqa %xmm0, %xmm5 +; CHECK-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; CHECK-NEXT: pmaddwd %xmm2, %xmm5 +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; CHECK-NEXT: pmaddwd %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128] +; CHECK-NEXT: paddd %xmm1, %xmm5 +; CHECK-NEXT: paddd %xmm1, %xmm0 ; CHECK-NEXT: psrld $8, %xmm0 -; CHECK-NEXT: psrld $8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; CHECK-NEXT: pand %xmm2, %xmm1 -; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: packuswb %xmm1, %xmm0 +; CHECK-NEXT: psrld $8, %xmm5 +; CHECK-NEXT: pand %xmm3, %xmm5 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: packuswb %xmm5, %xmm0 ; CHECK-NEXT: packuswb %xmm0, %xmm0 ; CHECK-NEXT: retq bb: