Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -6452,8 +6452,8 @@ if (ThisElt) { if (1 == i) { - V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) - : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); + V = (NumZero && ((NonZeros & 0xc) != 0xc)) ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) + : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(MVT::v8i16, V); @@ -30992,6 +30992,21 @@ : DAG.getZExtOrTrunc(N00, dl, MVT::i32); return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00); } + + // Otherwise widen the BUILD_VECTOR to an XMM register and extract the + // lower 64 bits into an MMX register. Without this we end up promoting + // the build vector instead. + if (Subtarget.hasSSE2()) { + SDLoc dl(N); + unsigned NumElts = SrcVT.getSimpleVT().getVectorNumElements(); + MVT WideVT = + MVT::getVectorVT(SrcVT.getSimpleVT().getVectorElementType(), + NumElts * 2); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, + N0, DAG.getUNDEF(SrcVT)); + return DAG.getNode(X86ISD::MOVDQ2Q, dl, VT, + DAG.getBitcast(MVT::v2i64, Concat)); + } } // Detect bitcasts between element or subvector extraction to x86mmx. 
Index: test/CodeGen/X86/mmx-build-vector.ll =================================================================== --- test/CodeGen/X86/mmx-build-vector.ll +++ test/CodeGen/X86/mmx-build-vector.ll @@ -35,18 +35,11 @@ ; ; X86-SSE-LABEL: build_v2i32_01: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movlps %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: build_v2i32_01: @@ -54,8 +47,7 @@ ; X64-SSE-NEXT: movd %edx, %xmm0 ; X64-SSE-NEXT: movd %esi, %xmm1 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: movdq2q %xmm1, %mm0 ; X64-SSE-NEXT: paddd %mm0, %mm0 ; X64-SSE-NEXT: movq %mm0, (%rdi) ; X64-SSE-NEXT: retq @@ -64,8 +56,7 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovd %esi, %xmm0 ; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -119,27 +110,19 @@ ; ; X86-SSE-LABEL: build_v2i32_u1: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; 
X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: build_v2i32_u1: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 ; X64-SSE-NEXT: paddd %mm0, %mm0 ; X64-SSE-NEXT: movq %mm0, (%rdi) ; X64-SSE-NEXT: retq @@ -147,9 +130,8 @@ ; X64-AVX1-LABEL: build_v2i32_u1: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovd %edx, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-AVX1-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX1-NEXT: paddd %mm0, %mm0 ; X64-AVX1-NEXT: movq %mm0, (%rdi) ; X64-AVX1-NEXT: retq @@ -158,8 +140,7 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovd %edx, %xmm0 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX2-NEXT: paddd %mm0, %mm0 ; X64-AVX2-NEXT: movq %mm0, (%rdi) ; X64-AVX2-NEXT: retq @@ -168,8 +149,7 @@ ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovd %edx, %xmm0 ; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX512-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX512-NEXT: paddd %mm0, %mm0 ; X64-AVX512-NEXT: movq %mm0, (%rdi) ; X64-AVX512-NEXT: retq @@ -201,41 +181,28 @@ ; ; X86-SSE-LABEL: build_v2i32_z1: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, 
%esp -; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: build_v2i32_z1: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: # kill: def $edx killed $edx def $rdx -; X64-SSE-NEXT: movq %rdx, %xmm0 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 ; X64-SSE-NEXT: paddd %mm0, %mm0 ; X64-SSE-NEXT: movq %mm0, (%rdi) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: build_v2i32_z1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $edx killed $edx def $rdx -; X64-AVX-NEXT: vmovq %rdx, %xmm0 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -267,27 +234,19 @@ ; ; X86-SSE-LABEL: build_v2i32_00: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: build_v2i32_00: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movd %esi, %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 ; X64-SSE-NEXT: paddd %mm0, %mm0 ; X64-SSE-NEXT: movq %mm0, (%rdi) ; X64-SSE-NEXT: retq @@ -296,8 +255,7 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovd %esi, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX1-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX1-NEXT: paddd %mm0, %mm0 ; X64-AVX1-NEXT: movq %mm0, (%rdi) ; X64-AVX1-NEXT: retq @@ -306,8 +264,7 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovd %esi, %xmm0 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX2-NEXT: paddd %mm0, %mm0 ; X64-AVX2-NEXT: movq %mm0, (%rdi) ; X64-AVX2-NEXT: retq @@ -316,8 +273,7 @@ ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovd %esi, %xmm0 ; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX512-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX512-NEXT: paddd %mm0, %mm0 ; X64-AVX512-NEXT: movq %mm0, (%rdi) ; X64-AVX512-NEXT: retq @@ -360,66 +316,34 @@ ; ; X86-SSE-LABEL: build_v4i16_0123: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax +; 
X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0 -; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0 -; X86-SSE-NEXT: pinsrw $3, 24(%ebp), %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; X86-SSE-NEXT: pinsrw $1, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pinsrw $2, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X64-SSE2-LABEL: build_v4i16_0123: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %r8d, %xmm0 -; X64-SSE2-NEXT: movd %ecx, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movd %edx, %xmm0 -; X64-SSE2-NEXT: movd %esi, %xmm2 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq -; -; X64-SSSE3-LABEL: build_v4i16_0123: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %r8d, %xmm0 -; X64-SSSE3-NEXT: movd %ecx, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSSE3-NEXT: movd %edx, %xmm0 -; X64-SSSE3-NEXT: movd %esi, %xmm2 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-SSSE3-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; 
X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v4i16_0123: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movd %esi, %xmm0 +; X64-SSE-NEXT: pinsrw $1, %edx, %xmm0 +; X64-SSE-NEXT: pinsrw $2, %ecx, %xmm0 +; X64-SSE-NEXT: pinsrw $3, %r8d, %xmm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: build_v4i16_0123: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovd %esi, %xmm0 -; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrd $3, %r8d, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -454,82 +378,33 @@ ; X86-MMX-NEXT: popl %ebp ; X86-MMX-NEXT: retl ; -; X86-SSE2-LABEL: build_v4i16_01zz: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, 
%mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v4i16_01zz: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSSE3-NEXT: movq %xmm1, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v4i16_01zz: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %edx, %xmm0 -; X64-SSE2-NEXT: movd %esi, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: build_v4i16_01zz: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pinsrw $1, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl ; -; X64-SSSE3-LABEL: build_v4i16_01zz: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: 
movd %edx, %xmm0 -; X64-SSSE3-NEXT: movd %esi, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v4i16_01zz: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: pinsrw $0, %esi, %xmm0 +; X64-SSE-NEXT: pinsrw $1, %edx, %xmm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: build_v4i16_01zz: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd %edx, %xmm0 -; X64-AVX-NEXT: vmovd %esi, %xmm1 -; X64-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -621,33 +496,21 @@ ; ; X86-SSE-LABEL: build_v4i16_012u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0 -; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; X86-SSE-NEXT: pinsrw $1, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pinsrw $2, 
{{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: build_v4i16_012u: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: movd %esi, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: movd %esi, %xmm0 +; X64-SSE-NEXT: pinsrw $1, %edx, %xmm0 +; X64-SSE-NEXT: pinsrw $2, %ecx, %xmm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 ; X64-SSE-NEXT: paddd %mm0, %mm0 ; X64-SSE-NEXT: movq %mm0, (%rdi) ; X64-SSE-NEXT: retq @@ -655,12 +518,9 @@ ; X64-AVX-LABEL: build_v4i16_012u: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovd %esi, %xmm0 -; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -697,70 +557,30 @@ ; X86-MMX-NEXT: popl %ebp ; X86-MMX-NEXT: retl ; -; X86-SSE2-LABEL: build_v4i16_0u00: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,1,0,0] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, %mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v4i16_0u00: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3] -; X86-SSSE3-NEXT: movq %xmm0, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v4i16_0u00: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %esi, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: build_v4i16_0u00: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7] +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl ; -; X64-SSSE3-LABEL: build_v4i16_0u00: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %esi, %xmm0 -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3] -; X64-SSSE3-NEXT: movq %xmm0, 
-{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v4i16_0u00: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movd %esi, %xmm0 +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7] +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: build_v4i16_0u00: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovd %esi, %xmm0 -; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3] -; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,0,4,5,6,7] +; X64-AVX1-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX1-NEXT: paddd %mm0, %mm0 ; X64-AVX1-NEXT: movq %mm0, (%rdi) ; X64-AVX1-NEXT: retq @@ -768,11 +588,8 @@ ; X64-AVX2-LABEL: build_v4i16_0u00: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovd %esi, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX2-NEXT: paddd %mm0, %mm0 ; X64-AVX2-NEXT: movq %mm0, (%rdi) ; X64-AVX2-NEXT: retq @@ -780,11 +597,8 @@ ; X64-AVX512-LABEL: build_v4i16_0u00: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovd %esi, %xmm0 -; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; X64-AVX512-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX512-NEXT: paddd %mm0, %mm0 ; X64-AVX512-NEXT: movq %mm0, 
(%rdi) ; X64-AVX512-NEXT: retq @@ -845,39 +659,32 @@ ; ; X86-SSE-LABEL: build_v8i8_01234567: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movl 24(%ebp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: shll $8, %ecx -; X86-SSE-NEXT: movzbl 20(%ebp), %edx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: orl %ecx, %edx -; X86-SSE-NEXT: movl 16(%ebp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: shll $8, %ecx -; X86-SSE-NEXT: movzbl 12(%ebp), %esi +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: orl %ecx, %esi ; X86-SSE-NEXT: movd %esi, %xmm0 ; X86-SSE-NEXT: pinsrw $1, %edx, %xmm0 -; X86-SSE-NEXT: movl 32(%ebp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: shll $8, %ecx -; X86-SSE-NEXT: movzbl 28(%ebp), %edx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: orl %ecx, %edx ; X86-SSE-NEXT: pinsrw $2, %edx, %xmm0 -; X86-SSE-NEXT: movl 40(%ebp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: shll $8, %ecx -; X86-SSE-NEXT: movzbl 36(%ebp), %edx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: orl %ecx, %edx ; X86-SSE-NEXT: pinsrw $3, %edx, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: leal -4(%ebp), %esp ; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: build_v8i8_01234567: @@ -900,8 +707,7 @@ ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; X64-SSE-NEXT: orl %eax, %ecx ; X64-SSE-NEXT: pinsrw $3, %ecx, %xmm0 -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: movdq2q %xmm0, 
%mm0 ; X64-SSE-NEXT: paddd %mm0, %mm0 ; X64-SSE-NEXT: movq %mm0, (%rdi) ; X64-SSE-NEXT: retq @@ -916,8 +722,7 @@ ; X64-AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 ; X64-AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 ; X64-AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -967,124 +772,62 @@ ; X86-MMX-NEXT: popl %ebp ; X86-MMX-NEXT: retl ; -; X86-SSE2-LABEL: build_v8i8_0u2345z7: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm0, %xmm0 -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, %mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl 
%ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v8i8_0u2345z7: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u] -; X86-SSSE3-NEXT: movq %xmm0, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v8i8_0u2345z7: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-SSE2-NEXT: movd %r9d, %xmm0 -; X64-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: movd %r8d, %xmm1 -; X64-SSE2-NEXT: movd %ecx, %xmm2 -; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X64-SSE2-NEXT: movd %esi, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; X64-SSE2-NEXT: packuswb %xmm1, %xmm1 -; X64-SSE2-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: build_v8i8_0u2345z7: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $8, %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: orl %ecx, %edx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $8, %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: orl %ecx, %esi +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pinsrw $1, %esi, %xmm0 +; X86-SSE-NEXT: pinsrw $2, %edx, %xmm0 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $8, %ecx +; X86-SSE-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-SSSE3-LABEL: build_v8i8_0u2345z7: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-SSSE3-NEXT: movd %r9d, %xmm0 -; X64-SSSE3-NEXT: movd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero -; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSSE3-NEXT: movd %r8d, %xmm1 -; X64-SSSE3-NEXT: movd %ecx, %xmm2 -; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X64-SSSE3-NEXT: movd %esi, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,u,4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u] -; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v8i8_0u2345z7: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-SSE-NEXT: shll $8, %eax +; X64-SSE-NEXT: movzbl %r9b, %edx +; X64-SSE-NEXT: orl %eax, %edx +; X64-SSE-NEXT: shll $8, %r8d +; X64-SSE-NEXT: movzbl %cl, %eax +; X64-SSE-NEXT: orl %r8d, %eax +; X64-SSE-NEXT: movzbl %sil, %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pinsrw $1, %eax, %xmm0 +; X64-SSE-NEXT: pinsrw $2, %edx, %xmm0 +; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-SSE-NEXT: shll $8, %eax +; X64-SSE-NEXT: pinsrw $3, %eax, %xmm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: build_v8i8_0u2345z7: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $4, %r9d, %xmm0, %xmm0 -; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax -; X64-AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax -; X64-AVX-NEXT: vpinsrw 
$7, %eax, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $4, %r9d, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -1132,96 +875,49 @@ ; X86-MMX-NEXT: popl %ebp ; X86-MMX-NEXT: retl ; -; X86-SSE2-LABEL: build_v8i8_0123zzzu: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movl 12(%ebp), %ecx -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: pinsrw $0, %ecx, %xmm0 -; X86-SSE2-NEXT: movl 16(%ebp), %ecx -; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0 -; X86-SSE2-NEXT: movl 20(%ebp), %ecx -; X86-SSE2-NEXT: pinsrw $2, %ecx, %xmm0 -; X86-SSE2-NEXT: movl 24(%ebp), %ecx -; X86-SSE2-NEXT: pinsrw $3, %ecx, %xmm0 -; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm0, %xmm0 -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, %mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v8i8_0123zzzu: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: movl 12(%ebp), %ecx -; X86-SSSE3-NEXT: pxor %xmm0, %xmm0 -; X86-SSSE3-NEXT: pinsrw $0, %ecx, %xmm0 -; X86-SSSE3-NEXT: movl 16(%ebp), 
%ecx -; X86-SSSE3-NEXT: pinsrw $1, %ecx, %xmm0 -; X86-SSSE3-NEXT: movl 20(%ebp), %ecx -; X86-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0 -; X86-SSSE3-NEXT: movl 24(%ebp), %ecx -; X86-SSSE3-NEXT: pinsrw $3, %ecx, %xmm0 -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; X86-SSSE3-NEXT: movq %xmm0, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v8i8_0123zzzu: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm0, %xmm0 -; X64-SSE2-NEXT: pinsrw $0, %esi, %xmm0 -; X64-SSE2-NEXT: pinsrw $1, %edx, %xmm0 -; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0 -; X64-SSE2-NEXT: pinsrw $3, %r8d, %xmm0 -; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: packuswb %xmm0, %xmm0 -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: build_v8i8_0123zzzu: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $8, %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: orl %ecx, %edx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $8, %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: orl %ecx, %esi +; X86-SSE-NEXT: movd %esi, %xmm0 +; X86-SSE-NEXT: pinsrw $1, %edx, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-SSSE3-LABEL: build_v8i8_0123zzzu: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: pxor %xmm0, %xmm0 -; X64-SSSE3-NEXT: pinsrw $0, %esi, %xmm0 -; X64-SSSE3-NEXT: pinsrw $1, %edx, %xmm0 -; X64-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0 -; X64-SSSE3-NEXT: 
pinsrw $3, %r8d, %xmm0 -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v8i8_0123zzzu: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shll $8, %r8d +; X64-SSE-NEXT: movzbl %cl, %eax +; X64-SSE-NEXT: orl %r8d, %eax +; X64-SSE-NEXT: shll $8, %edx +; X64-SSE-NEXT: movzbl %sil, %ecx +; X64-SSE-NEXT: orl %edx, %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pinsrw $1, %eax, %xmm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: build_v8i8_0123zzzu: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 ; X64-AVX-NEXT: movq %mm0, (%rdi) ; X64-AVX-NEXT: retq @@ -1323,75 +1019,33 @@ ; X86-MMX-NEXT: popl %ebp ; X86-MMX-NEXT: retl ; -; X86-SSE2-LABEL: build_v8i8_00000000: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm0, %xmm0 -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, %mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v8i8_00000000: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: pxor %xmm0, %xmm0 -; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: pshufb %xmm0, %xmm1 -; X86-SSSE3-NEXT: movq %xmm1, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v8i8_00000000: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %esi, %xmm0 -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: packuswb %xmm0, %xmm0 -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: build_v8i8_00000000: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl ; -; X64-SSSE3-LABEL: build_v8i8_00000000: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %esi, 
%xmm0 -; X64-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X64-SSSE3-NEXT: pshufb %xmm1, %xmm0 -; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v8i8_00000000: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movd %esi, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: build_v8i8_00000000: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovd %esi, %xmm0 -; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; X64-AVX1-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX1-NEXT: paddd %mm0, %mm0 ; X64-AVX1-NEXT: movq %mm0, (%rdi) ; X64-AVX1-NEXT: retq @@ -1400,8 +1054,7 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovd %esi, %xmm0 ; X64-AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX2-NEXT: paddd %mm0, %mm0 ; X64-AVX2-NEXT: movq %mm0, (%rdi) ; X64-AVX2-NEXT: retq @@ -1410,8 +1063,7 @@ ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovd %esi, %xmm0 ; X64-AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 -; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX512-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX512-NEXT: paddd %mm0, %mm0 ; X64-AVX512-NEXT: movq %mm0, (%rdi) ; X64-AVX512-NEXT: retq Index: test/CodeGen/X86/pr29222.ll 
=================================================================== --- test/CodeGen/X86/pr29222.ll +++ test/CodeGen/X86/pr29222.ll @@ -10,11 +10,10 @@ ; X86-SSE-NEXT: pushl %ebp ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: subl $8, %esp ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: packsswb %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%esp) ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero @@ -29,10 +28,9 @@ ; X86-AVX-NEXT: pushl %ebp ; X86-AVX-NEXT: movl %esp, %ebp ; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $16, %esp +; X86-AVX-NEXT: subl $8, %esp ; X86-AVX-NEXT: vbroadcastss 8(%ebp), %xmm0 -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; X86-AVX-NEXT: movdq2q %xmm0, %mm0 ; X86-AVX-NEXT: packsswb %mm0, %mm0 ; X86-AVX-NEXT: movq %mm0, (%esp) ; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero @@ -46,8 +44,7 @@ ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movd %edi, %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: movdq2q %xmm0, %mm0 ; X64-SSE-NEXT: packsswb %mm0, %mm0 ; X64-SSE-NEXT: movq2dq %mm0, %xmm0 ; X64-SSE-NEXT: packsswb %xmm0, %xmm0 @@ -58,8 +55,7 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovd %edi, %xmm0 ; X64-AVX-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-AVX-NEXT: movdq2q %xmm0, %mm0 ; X64-AVX-NEXT: packsswb %mm0, %mm0 ; X64-AVX-NEXT: movq2dq %mm0, %xmm0 ; X64-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 Index: test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- 
test/CodeGen/X86/vec_insert-mmx.ll +++ test/CodeGen/X86/vec_insert-mmx.ll @@ -6,12 +6,9 @@ define x86_mmx @t0(i32 %A) nounwind { ; X32-LABEL: t0: ; X32: ## %bb.0: -; X32-NEXT: subl $12, %esp ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X32-NEXT: movq %xmm0, (%esp) -; X32-NEXT: movq (%esp), %mm0 -; X32-NEXT: addl $12, %esp +; X32-NEXT: movdq2q %xmm0, %mm0 ; X32-NEXT: retl ; ; X64-LABEL: t0: