Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14605,10 +14605,11 @@ if (NumUpperHalves == 1) { // AVX2 has efficient 32/64-bit element cross-lane shuffles. if (Subtarget.hasAVX2()) { - // extract128 + vunpckhps, is better than vblend + vpermps. - // TODO: Refine to account for unary shuffle, splat, and other masks? - if (EltWidth == 32 && NumLowerHalves && - HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask)) + // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. + if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && + !is128BitUnpackShuffleMask(HalfMask) && + (!isSingleSHUFPSMask(HalfMask) || + Subtarget.hasFastVariableShuffle())) return SDValue(); // If this is a unary shuffle (assume that the 2nd operand is // canonicalized to undef), then we can use vpermpd. Otherwise, we Index: llvm/test/CodeGen/X86/avx2-conversions.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-conversions.ll +++ llvm/test/CodeGen/X86/avx2-conversions.ll @@ -7,9 +7,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind { ; X32-SLOW-LABEL: trunc4: ; X32-SLOW: # %bb.0: -; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; X32-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X32-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X32-SLOW-NEXT: vzeroupper ; X32-SLOW-NEXT: retl ; @@ -23,9 +22,8 @@ ; ; X64-SLOW-LABEL: trunc4: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; X64-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; X64-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-SLOW-NEXT: vzeroupper ; X64-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/avx2-vector-shifts.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -376,10 +376,10 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X32-SLOW-LABEL: srl_trunc_and_v4i64: ; X32-SLOW: # %bb.0: -; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; X32-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] +; X32-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1 ; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X32-SLOW-NEXT: vzeroupper ; X32-SLOW-NEXT: retl @@ -396,10 +396,10 @@ ; ; X64-SLOW-LABEL: srl_trunc_and_v4i64: ; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] +; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1 ; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-SLOW-NEXT: vzeroupper ; X64-SLOW-NEXT: retq Index: llvm/test/CodeGen/X86/combine-shl.ll =================================================================== --- llvm/test/CodeGen/X86/combine-shl.ll +++ llvm/test/CodeGen/X86/combine-shl.ll @@ -138,9 +138,9 @@ ; ; AVX-SLOW-LABEL: combine_vec_shl_trunc_and: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq Index: llvm/test/CodeGen/X86/combine-sra.ll =================================================================== --- llvm/test/CodeGen/X86/combine-sra.ll +++ llvm/test/CodeGen/X86/combine-sra.ll @@ -168,9 +168,9 @@ ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -209,8 +209,8 @@ ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -248,8 +248,8 @@ ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq Index: llvm/test/CodeGen/X86/combine-srl.ll =================================================================== --- llvm/test/CodeGen/X86/combine-srl.ll +++ llvm/test/CodeGen/X86/combine-srl.ll @@ -205,8 +205,8 @@ ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -417,9 +417,9 @@ ; ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq Index: llvm/test/CodeGen/X86/oddshuffles.ll =================================================================== --- llvm/test/CodeGen/X86/oddshuffles.ll +++ llvm/test/CodeGen/X86/oddshuffles.ll @@ -621,16 +621,16 @@ ; ; AVX2-SLOW-LABEL: v12i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi) +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/reduce-trunc-shl.ll =================================================================== --- llvm/test/CodeGen/X86/reduce-trunc-shl.ll +++ llvm/test/CodeGen/X86/reduce-trunc-shl.ll @@ -13,11 +13,10 @@ ; ; AVX2-LABEL: trunc_shl_7_v4i32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovaps (%rsi), %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX2-NEXT: vpslld $7, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %val = load <4 x i64>, <4 x i64> addrspace(1)* %in %shl = shl <4 x i64> %val, Index: llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll +++ llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll @@ -225,10 +225,9 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: @@ -706,8 +705,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -771,8 +770,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq Index: llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -225,10 +225,9 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: @@ -672,8 +671,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -736,8 +735,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -801,8 +800,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -865,8 +864,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -928,8 +927,8 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1070,11 +1069,10 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: @@ -1202,11 +1200,10 @@ ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8: Index: llvm/test/CodeGen/X86/vector-trunc-math-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-math-widen.ll +++ llvm/test/CodeGen/X86/vector-trunc-math-widen.ll @@ -32,9 +32,8 @@ ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -102,13 +101,13 @@ ; ; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -255,25 +254,25 @@ ; ; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -517,8 +516,8 @@ ; ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -578,11 +577,11 @@ ; ; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 @@ -705,21 +704,21 @@ ; ; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -901,9 +900,8 @@ ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -971,13 +969,13 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1124,25 +1122,25 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -1354,8 +1352,8 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1415,11 +1413,11 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 @@ -1542,21 +1540,21 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -1772,10 +1770,10 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1878,19 +1876,19 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 @@ -2125,15 +2123,15 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] @@ -2141,15 +2139,15 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -2453,8 +2451,8 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -2514,11 +2512,11 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 @@ -2700,11 +2698,11 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] @@ -2712,11 +2710,11 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -2940,9 +2938,8 @@ ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3006,13 +3003,13 @@ ; ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3149,25 +3146,25 @@ ; ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -3354,8 +3351,8 @@ ; ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3415,11 +3412,11 @@ ; ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 @@ -3542,21 +3539,21 @@ ; ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -3736,9 +3733,8 @@ ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3802,13 +3798,13 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3945,25 +3941,25 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4150,8 +4146,8 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -4211,11 +4207,11 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 @@ -4338,21 +4334,21 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4532,9 +4528,8 @@ ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4598,13 +4593,13 @@ ; ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -4741,25 +4736,25 @@ ; ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4946,8 +4941,8 @@ ; ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -5007,11 +5002,11 @@ ; ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 @@ -5134,21 +5129,21 @@ ; ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-math.ll +++ llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -32,9 +32,8 @@ ; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -102,13 +101,13 @@ ; ; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -255,25 +254,25 @@ ; ; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -517,8 +516,8 @@ ; ; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -578,11 +577,11 @@ ; ; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 @@ -705,21 +704,21 @@ ; ; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -901,9 +900,8 @@ ; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -971,13 +969,13 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1124,25 +1122,25 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -1354,8 +1352,8 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1415,11 +1413,11 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 @@ -1542,21 +1540,21 @@ ; ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -1772,10 +1770,10 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1878,19 +1876,19 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 @@ -2125,15 +2123,15 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] @@ -2141,15 +2139,15 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -2453,8 +2451,8 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -2514,11 +2512,11 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 @@ -2700,11 +2698,11 @@ ; ; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] @@ -2712,11 +2710,11 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -2940,9 +2938,8 @@ ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3006,13 +3003,13 @@ ; ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3149,25 +3146,25 @@ ; ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -3354,8 +3351,8 @@ ; ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3415,11 +3412,11 @@ ; ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 @@ -3542,21 +3539,21 @@ ; ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -3736,9 +3733,8 @@ ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3802,13 +3798,13 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3945,25 +3941,25 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4150,8 +4146,8 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -4211,11 +4207,11 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 @@ -4338,21 +4334,21 @@ ; ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4532,9 +4528,8 @@ ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4598,13 +4593,13 @@ ; ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -4741,25 +4736,25 @@ ; ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4946,8 +4941,8 @@ ; ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -5007,11 +5002,11 @@ ; ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 @@ -5134,21 +5129,21 @@ ; ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll +++ llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll @@ -206,9 +206,8 @@ ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -615,20 +614,20 @@ ; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: Index: llvm/test/CodeGen/X86/vector-trunc-packus.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -206,9 +206,8 @@ ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -615,20 +614,20 @@ ; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: Index: llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll +++ llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll @@ -218,9 +218,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -644,19 +643,19 @@ ; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-trunc-ssat.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -218,9 +218,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -644,19 +643,19 @@ ; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll +++ llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll @@ -135,9 +135,8 @@ ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] ; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -409,17 +408,17 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-trunc-usat.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -135,9 +135,8 @@ ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] ; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -409,17 +408,17 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-trunc-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-widen.ll +++ llvm/test/CodeGen/X86/vector-trunc-widen.ll @@ -29,10 +29,10 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; @@ -72,10 +72,10 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; @@ -117,13 +117,13 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: @@ -207,11 +207,11 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i16: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1383,10 +1383,10 @@ ; ; AVX2-SLOW-LABEL: trunc2x4i64_8i32: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-trunc.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc.ll +++ llvm/test/CodeGen/X86/vector-trunc.ll @@ -29,10 +29,10 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; @@ -72,10 +72,10 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; @@ -117,13 +117,13 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: @@ -207,11 +207,11 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i16: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -303,10 +303,10 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i8: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -1393,10 +1393,10 @@ ; ; AVX2-SLOW-LABEL: trunc2x4i64_8i32: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; @@ -1507,10 +1507,10 @@ ; ; AVX2-SLOW-LABEL: trunc2x4i64_8i16: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0