diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29864,22 +29864,29 @@
     unsigned InBits = InVT.getSizeInBits();
     if (128 % InBits == 0) {
-      // 128 bit and smaller inputs should avoid truncate all together and
-      // just use a build_vector that will become a shuffle.
-      // TODO: Widen and use a shuffle directly?
+      // 128 bit and smaller inputs should avoid truncate altogether and just
+      // use a shuffle.
       MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
-      EVT EltVT = VT.getVectorElementType();
-      unsigned WidenNumElts = WidenVT.getVectorNumElements();
-      SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
-      // Use the original element count so we don't do more scalar opts than
-      // necessary.
-      unsigned MinElts = VT.getVectorNumElements();
-      for (unsigned i=0; i < MinElts; ++i) {
-        SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
-                                  DAG.getIntPtrConstant(i, dl));
-        Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
-      }
-      Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
+      MVT EltVT = WidenVT.getVectorElementType();
+      // Bitcast to get the correct element type for the result.
+      MVT TempVT = MVT::getVectorVT(EltVT, InBits / EltVT.getSizeInBits());
+      SDValue Temp = DAG.getNode(ISD::BITCAST, dl, TempVT, In);
+      // Widen to the desired result type.
+      unsigned NumConcats =
+          WidenVT.getVectorNumElements() / TempVT.getVectorNumElements();
+      if (NumConcats != 1) {
+        SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(TempVT));
+        ConcatOps[0] = Temp;
+        Temp = DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
+      }
+      // Shuffle to pick out the correct elements for the result.
+      unsigned Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
+      SmallVector<int, 16> Mask(WidenVT.getVectorNumElements(), -1);
+      for (unsigned i = 0; i < InVT.getVectorNumElements(); ++i)
+        Mask[i] = i * Scale;
+      SDValue Undef = DAG.getUNDEF(Temp.getSimpleValueType());
+      SDValue Res = DAG.getVectorShuffle(WidenVT, dl, Temp, Undef, Mask);
+      Results.push_back(Res);
       return;
     }
     // With AVX512 there are some cases that can use a target specific
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1486,9 +1486,11 @@
 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x01]
 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1]
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ## encoding: [0x62,0xf2,0xfe,0x48,0x28,0xc0]
-; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
+; AVX512BW-NEXT: ## fixup A - offset: 5, value: LCPI68_0-4, kind: reloc_riprel_4byte
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
-; AVX512BW-NEXT: ## fixup A - offset: 4, value: LCPI68_0-4, kind: reloc_riprel_4byte
+; AVX512BW-NEXT: ## fixup A - offset: 4, value: LCPI68_1-4, kind: reloc_riprel_4byte
 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
 ; AVX512BW-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512BW-NEXT: retq ## encoding: [0xc3]
@@ -1517,7 +1519,7 @@
 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ## encoding: [0xc4,0xe3,0x79,0x30,0xc9,0x07]
 ; SKX-NEXT:
korw %k0, %k1, %k0 ## encoding: [0xc5,0xf4,0x45,0xc0] ; SKX-NEXT: vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0] -; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0] +; SKX-NEXT: vpmovwb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0] ; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] ; SKX-NEXT: ## fixup A - offset: 4, value: LCPI68_0-4, kind: reloc_riprel_4byte ; SKX-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00] diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -380,6 +380,7 @@ ; SSE2-SSSE3-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm8 ; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: psllw $15, %xmm8 ; SSE2-SSSE3-NEXT: packsswb %xmm8, %xmm8 ; SSE2-SSSE3-NEXT: pmovmskb %xmm8, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax @@ -577,6 +578,7 @@ ; SSE2-SSSE3-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8 ; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: psllw $15, %xmm8 ; SSE2-SSSE3-NEXT: packsswb %xmm8, %xmm8 ; SSE2-SSSE3-NEXT: pmovmskb %xmm8, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -14,19 +14,20 @@ ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtq %xmm7, %xmm3 ; SSE-NEXT: pcmpgtq %xmm6, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] ; SSE-NEXT: pcmpgtq %xmm5, %xmm1 ; SSE-NEXT: pcmpgtq %xmm4, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm11[0,2] ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: packssdw %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[0,2] ; SSE-NEXT: packssdw %xmm10, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: psllw $15, %xmm8 ; SSE-NEXT: packsswb %xmm8, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -110,19 +111,20 @@ ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltpd %xmm3, %xmm7 ; SSE-NEXT: cmpltpd %xmm2, %xmm6 -; SSE-NEXT: packssdw %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] ; SSE-NEXT: cmpltpd %xmm1, %xmm5 ; SSE-NEXT: cmpltpd %xmm0, %xmm4 -; SSE-NEXT: packssdw %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] ; SSE-NEXT: packssdw %xmm6, %xmm4 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm11[0,2] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: packssdw %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[0,2] ; SSE-NEXT: packssdw %xmm10, %xmm8 ; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: psllw $15, %xmm8 
; SSE-NEXT: packsswb %xmm8, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -320,6 +322,7 @@ ; SSE-NEXT: packssdw %xmm9, %xmm8 ; SSE-NEXT: packsswb %xmm10, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: psllw $7, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq @@ -422,6 +425,7 @@ ; SSE-NEXT: packssdw %xmm9, %xmm8 ; SSE-NEXT: packsswb %xmm10, %xmm8 ; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: psllw $7, %xmm8 ; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -9,22 +9,18 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00 -; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: foo: ; SSE42: # %bb.0: ; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movl $255, %eax ; SSE42-NEXT: pinsrb $3, %eax, %xmm0 ; SSE42-NEXT: movd %xmm0, (%rdi) @@ -33,7 +29,7 @@ ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -215,17 +215,15 @@ ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE2-NEXT: punpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -321,47 +321,48 @@ ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm11, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm11, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm10, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm13 -; SSE2-NEXT: por %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm2 ; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm9, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm9, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm2, %xmm11 ; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] @@ -370,20 +371,17 @@ ; SSE2-NEXT: pand %xmm3, %xmm7 ; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1485,16 +1485,16 @@ ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm6 ; SSE2-NEXT: pxor %xmm11, %xmm6 ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: por %xmm11, %xmm7 @@ -1507,9 +1507,9 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm13 ; SSE2-NEXT: pand %xmm13, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm13 +; SSE2-NEXT: pandn %xmm3, %xmm13 ; SSE2-NEXT: por %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm9, %xmm5 ; SSE2-NEXT: por %xmm11, %xmm5 @@ -1522,9 +1522,9 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm0 ; 
SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm11, %xmm5 ; SSE2-NEXT: movdqa %xmm10, %xmm7 ; SSE2-NEXT: por %xmm11, %xmm7 @@ -1537,9 +1537,9 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: por %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm11, %xmm5 ; SSE2-NEXT: por %xmm8, %xmm11 ; SSE2-NEXT: movdqa %xmm11, %xmm6 @@ -1551,30 +1551,28 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm8 -; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm6 ; SSE2-NEXT: por %xmm8, %xmm6 -; SSE2-NEXT: psubq %xmm3, %xmm6 -; SSE2-NEXT: psubq %xmm4, %xmm7 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: psubq %xmm2, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: psubq %xmm2, %xmm6 +; SSE2-NEXT: psubq %xmm1, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2] +; SSE2-NEXT: psubq %xmm4, %xmm0 +; SSE2-NEXT: psubq %xmm3, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: psubus_8i64_max: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pxor %xmm11, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] ; SSSE3-NEXT: movdqa %xmm8, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 @@ -1585,59 +1583,54 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm6 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm9, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm8, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm9, %xmm2 
-; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm8, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pandn %xmm9, %xmm6 ; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm11, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: pandn %xmm9, %xmm4 ; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSSE3-NEXT: psubusw %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[0,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm11, %xmm5 +; SSSE3-NEXT: movdqa %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pandn %xmm9, %xmm6 +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm11 +; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm9, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] +; SSSE3-NEXT: pshufb %xmm3, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-NEXT: psubusw %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: psubus_8i64_max: diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll --- a/llvm/test/CodeGen/X86/vec_cast.ll +++ b/llvm/test/CodeGen/X86/vec_cast.ll @@ -131,10 +131,10 @@ ; ; CHECK-WIN-LABEL: g: ; CHECK-WIN: # %bb.0: -; CHECK-WIN-NEXT: movdqa 
(%rcx), %xmm0 ; CHECK-WIN-NEXT: movdqa (%rdx), %xmm1 ; CHECK-WIN-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-WIN-NEXT: pshufb %xmm2, %xmm1 +; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0 ; CHECK-WIN-NEXT: pshufb %xmm2, %xmm0 ; CHECK-WIN-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-WIN-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -499,20 +499,18 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) { ; SSE2-LABEL: trunc_v8i64_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: psllw $15, %xmm2 -; SSE2-NEXT: packsswb %xmm2, %xmm2 -; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -1620,12 +1620,10 @@ ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -486,19 +486,17 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) { ; SSE2-LABEL: trunc_v8i64_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; 
SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: psllw $15, %xmm2 -; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -511,20 +511,18 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) { ; SSE2-LABEL: trunc_v8i64_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: psllw $15, %xmm2 -; SSE2-NEXT: packsswb %xmm2, %xmm2 -; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: testb %al, %al ; SSE2-NEXT: setnp %al ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -61,21 +61,19 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_add_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm4, %xmm0 ; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw 
{{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_v8i64_v8i16: @@ -537,17 +535,15 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -905,21 +901,19 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_sub_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: psubq %xmm6, %xmm2 -; SSE-NEXT: psubq %xmm7, %xmm3 -; SSE-NEXT: psubq %xmm4, %xmm0 ; SSE-NEXT: psubq %xmm5, %xmm1 +; SSE-NEXT: psubq %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: psubq %xmm7, %xmm3 +; SSE-NEXT: psubq %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_sub_v8i64_v8i16: @@ -1349,17 +1343,15 @@ define <8 x i16> 
@trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_sub_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -1776,29 +1768,25 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: pmullw %xmm6, %xmm0 +; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: pmullw %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: @@ -2357,17 +2345,15 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -2813,21 +2799,19 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_and_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: andps %xmm5, %xmm1 +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps %xmm7, %xmm3 +; SSE-NEXT: andps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_v8i64_v8i16: @@ -3210,18 +3194,16 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: @@ -3576,21 +3558,19 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_xor_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: xorps %xmm5, %xmm1 +; SSE-NEXT: xorps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: xorps %xmm7, %xmm3 +; SSE-NEXT: xorps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_v8i64_v8i16: @@ -3973,18 +3953,16 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: pxor {{.*}}(%rip), 
%xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: @@ -4339,21 +4317,19 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_or_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: orps %xmm7, %xmm3 +; SSE-NEXT: orps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_v8i64_v8i16: @@ -4736,18 +4712,16 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i64_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: orpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -1859,22 +1859,22 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64>* %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_packus_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm7 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa 32(%rdi), %xmm9 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm9 +; SSE2-NEXT: movdqa 32(%rdi), 
%xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm11, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm8, %xmm1 @@ -1885,26 +1885,26 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm7 ; SSE2-NEXT: pandn %xmm8, %xmm2 ; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm11, %xmm3 ; SSE2-NEXT: movdqa %xmm10, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm5 ; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: movdqa %xmm9, %xmm3 ; SSE2-NEXT: pxor %xmm11, %xmm3 ; SSE2-NEXT: movdqa %xmm10, %xmm4 @@ -1924,12 +1924,24 @@ ; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 @@ -1939,126 +1951,90 @@ ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd 
%xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm7 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm9 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm6 +; SSSE3-NEXT: movdqa (%rdi), %xmm5 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] ; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm11, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm10, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm11, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm10, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm7 +; SSSE3-NEXT: 
pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm3 -; SSSE3-NEXT: pxor %xmm11, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm11, %xmm0 +; SSSE3-NEXT: movdqa %xmm10, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm6, %xmm7 -; SSSE3-NEXT: movdqa %xmm9, %xmm3 -; SSSE3-NEXT: pxor %xmm11, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm11, %xmm0 ; SSSE3-NEXT: movdqa %xmm10, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm11, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm9, %xmm0 ; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 +; SSSE3-NEXT: movdqa %xmm10, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm9 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm11, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 @@ -2068,29 +2044,46 @@ ; SSSE3-NEXT: pand %xmm6, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: por 
%xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSSE3-NEXT: pxor %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm11, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm11, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -1301,148 +1301,141 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64>* %p0) { ; SSE2-LABEL: trunc_usat_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm6 -; SSE2-NEXT: movdqa 48(%rdi), %xmm7 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa 32(%rdi), %xmm9 +; SSE2-NEXT: movdqa 48(%rdi), %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa 
%xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm10, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; 
SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm9 ; SSE2-NEXT: pandn %xmm8, %xmm0 ; SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i64_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm4 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 +; SSSE3-NEXT: movdqa (%rdi), %xmm6 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm9 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pxor %xmm3, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm10, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm7, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm10, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm10, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm1, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm10, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} 
xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm10, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm6 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 ; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm10, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm9 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm9 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i16: diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -136,32 +136,25 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; SSE2-LABEL: trunc8i64_8i16: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc8i64_8i16: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-NEXT: pshufb %xmm3, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc8i64_8i16: