Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -16222,8 +16222,10 @@
     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
   }

-  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
-  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
+  // Truncate with PACKSS if we are truncating a vector with sign-bits that
+  // extend all the way to the packed/truncated value.
+  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+  if ((InVT.getScalarSizeInBits() - NumPackedBits) < DAG.ComputeNumSignBits(In))
     if (SDValue V = truncateVectorWithPACKSS(VT, In, DL, DAG, Subtarget))
       return V;

@@ -34422,7 +34424,7 @@
   return SDValue();
 }

-/// This function transforms vector truncation of 'all or none' bits values.
+/// This function transforms vector truncation of 'extended sign-bits' values.
 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                                SelectionDAG &DAG,
@@ -34444,12 +34446,6 @@
   MVT InVT = In.getValueType().getSimpleVT();
   MVT InSVT = InVT.getScalarType();

-  // Use PACKSS if the input is a splatted sign bit.
-  // e.g. Comparison result, sext_in_reg, etc.
-  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
-  if (NumSignBits != InSVT.getSizeInBits())
-    return SDValue();
-
   // Check we have a truncation suited for PACKSS.
   if (!VT.is128BitVector() && !VT.is256BitVector())
     return SDValue();
@@ -34458,6 +34454,13 @@
   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
     return SDValue();

+  // Use PACKSS if the input has sign-bits that extend all the way to the
+  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+  if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
+    return SDValue();
+
   return truncateVectorWithPACKSS(VT, In, DL, DAG, Subtarget);
 }

Index: test/CodeGen/X86/avg.ll
===================================================================
--- test/CodeGen/X86/avg.ll
+++ test/CodeGen/X86/avg.ll
@@ -2567,52 +2567,51 @@
 ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9
 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10
 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm9
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3
+; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8
 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
 ; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm7
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vmovdqu %ymm4, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
Index: test/CodeGen/X86/avx2-shift.ll
===================================================================
--- test/CodeGen/X86/avx2-shift.ll
+++ test/CodeGen/X86/avx2-shift.ll
@@ -556,9 +556,8 @@
 ; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT: vpmovsxwd %xmm0, %ymm0
 ; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; X32-NEXT: vzeroupper
 ; X32-NEXT: retl
 ;
@@ -567,9 +566,8 @@
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT: vpmovsxwd %xmm0, %ymm0
 ; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %res = ashr <8 x i16> %lhs, %rhs
Index: test/CodeGen/X86/avx2-vector-shifts.ll
===================================================================
--- test/CodeGen/X86/avx2-vector-shifts.ll
+++ test/CodeGen/X86/avx2-vector-shifts.ll
@@ -499,9 +499,8 @@
 ; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT: vpmovsxwd %xmm0, %ymm0
 ; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; X32-NEXT: vzeroupper
 ; X32-NEXT: retl
 ;
@@ -510,9 +509,8 @@
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT: vpmovsxwd %xmm0, %ymm0
 ; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %ashr = ashr <8 x i16> %r, %a
Index: test/CodeGen/X86/vector-shift-ashr-128.ll
===================================================================
--- test/CodeGen/X86/vector-shift-ashr-128.ll
+++ test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -318,9 +318,8 @@
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
 ; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1249,9 +1248,8 @@
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
 ; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -404,9 +404,8 @@
 ; AVX2-LABEL: trunc8i32_8i16_ashr:
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -695,62 +694,28 @@
 }

 define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i16_ashr:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i16_ashr:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i16_ashr:
-; SSE41: # BB#0: # %entry
-; SSE41-NEXT: psrad $16, %xmm2
-; SSE41-NEXT: psrad $16, %xmm3
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: movdqu %xmm2, (%rax)
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc16i32_16i16_ashr:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: movdqu %xmm2, (%rax)
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i16_ashr:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: vmovups %ymm0, (%rax)
 ; AVX1-NEXT: vzeroupper
@@ -760,11 +725,8 @@
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
 ; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -935,53 +897,39 @@
 define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
 ; SSE-LABEL: trunc16i32_16i8_ashr:
 ; SSE: # BB#0: # %entry
-; SSE-NEXT: psrad $24, %xmm0
 ; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
 ; SSE-NEXT: psrad $24, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
 ; SSE-NEXT: movdqu %xmm0, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i8_ashr:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: trunc16i32_16i8_ashr:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -1031,12 +979,12 @@
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
@@ -1153,12 +1101,10 @@
 ;
 ; AVX1-LABEL: trunc16i16_16i8_ashr:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
 ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -1167,10 +1113,7 @@
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
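
For reference only (not part of the patch, and the function name below is illustrative): a minimal IR sketch of the kind of input the relaxed sign-bits test is meant to accept. An ashr by 16 leaves at least 17 sign bits in every i32 lane, more than the InSVT.getSizeInBits() - NumPackedBits = 32 - 16 bits the new check requires, so on AVX2 the truncation should lower to vextracti128 + vpackssdw (as in the updated trunc8i32_8i16_ashr checks above) instead of the previous full-width vpackssdw + vpermq sequence.

define <8 x i16> @trunc8i32_8i16_ashr_example(<8 x i32> %a) {
  ; Each lane of %sra has at least 17 sign bits, so the trunc only discards
  ; redundant sign bits and a signed saturating pack produces the same result.
  %sra = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %trunc = trunc <8 x i32> %sra to <8 x i16>
  ret <8 x i16> %trunc
}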