Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13357,18 +13357,19 @@
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
     return Rotate;
 
+  // If the shuffle is from a single input, directly
+  // generate a cross-lane VPERMD instruction.
+  if (V2.isUndef()) {
+    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
+  }
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
     return V;
 
-  // If the shuffle patterns aren't repeated but it is a single input, directly
-  // generate a cross-lane VPERMD instruction.
-  if (V2.isUndef()) {
-    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
-    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
-  }
 
   // Assume that a single SHUFPS is faster than an alternative sequence of
   // multiple instructions (even if the CPU has a domain penalty).
Index: test/CodeGen/X86/avx2-conversions.ll
===================================================================
--- test/CodeGen/X86/avx2-conversions.ll
+++ test/CodeGen/X86/avx2-conversions.ll
@@ -5,16 +5,16 @@
 define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
 ; X32-LABEL: trunc4:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; X32-NEXT:    # kill: %xmm0 %xmm0 %ymm0
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: trunc4:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; X64-NEXT:    # kill: %xmm0 %xmm0 %ymm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
Index: test/CodeGen/X86/avx2-vector-shifts.ll
===================================================================
--- test/CodeGen/X86/avx2-vector-shifts.ll
+++ test/CodeGen/X86/avx2-vector-shifts.ll
@@ -374,8 +374,8 @@
 define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
 ; X32-LABEL: srl_trunc_and_v4i64:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X32-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; X32-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; X32-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
 ; X32-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
@@ -384,8 +384,8 @@
 ;
 ; X64-LABEL: srl_trunc_and_v4i64:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X64-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; X64-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; X64-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
 ; X64-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
Index: test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
===================================================================
--- test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
+++ test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
@@ -460,8 +460,8 @@
 ; CHECK-LABEL: test_2xi32_to_8xi32_mem:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}}
xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -471,10 +471,10 @@ ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermd %ymm2, %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -487,10 +487,10 @@ ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermd %ymm1, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -502,10 +502,10 @@ ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermd %ymm2, %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -518,10 +518,10 @@ ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermd %ymm1, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -533,10 +533,10 @@ ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: 
vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermd %ymm2, %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -549,10 +549,10 @@ ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermd %ymm1, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -564,10 +564,10 @@ ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermd %ymm2, %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -580,10 +580,10 @@ ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3: ; CHECK: # BB#0: ; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,2,0,2,0,2] +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermd %ymm1, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> Index: test/CodeGen/X86/combine-shl.ll =================================================================== --- test/CodeGen/X86/combine-shl.ll +++ test/CodeGen/X86/combine-shl.ll @@ -115,8 +115,8 @@ ; ; AVX-LABEL: combine_vec_shl_trunc_and: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -182,8 +182,8 @@ ; ; AVX-LABEL: combine_vec_ashr_trunc_and: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -215,9 +215,8 @@ ; ; AVX-LABEL: combine_vec_ashr_trunc_lshr: ; AVX: # BB#0: -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,5,7,u,u,u,u> +; AVX-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -249,8 +248,8 @@ ; ; AVX-LABEL: combine_vec_ashr_trunc_ashr: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,5,7,u,u,u,u> +; AVX-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq Index: test/CodeGen/X86/combine-srl.ll =================================================================== --- test/CodeGen/X86/combine-srl.ll +++ test/CodeGen/X86/combine-srl.ll @@ -218,8 +218,8 @@ ; AVX-LABEL: combine_vec_lshr_trunc_lshr1: ; AVX: # BB#0: ; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -273,8 +273,8 @@ ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1: ; AVX: # BB#0: ; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -448,8 +448,8 @@ ; ; AVX-LABEL: combine_vec_lshr_trunc_and: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -639,16 +639,16 @@ ; ; AVX2-LABEL: v12i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6> +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vmovaps %xmm2, 32(%rdi) +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-NEXT: vmovaps %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1417,12 +1417,12 @@ ; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> ; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-NEXT: vpermps %ymm2, 
%ymm5, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm3, (%rsi) ; AVX2-NEXT: vmovups %ymm4, (%rdx) ; AVX2-NEXT: vmovups %ymm0, (%rcx) @@ -1608,25 +1608,25 @@ ; AVX2-NEXT: vmovups (%rsi), %ymm0 ; AVX2-NEXT: vmovups (%rdx), %ymm1 ; AVX2-NEXT: vmovups (%rcx), %ymm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> +; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm1, 64(%rdi) ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm4, 64(%rdi) ; AVX2-NEXT: vmovups %ymm3, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -1918,10 +1918,9 @@ ; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] Index: test/CodeGen/X86/reduce-trunc-shl.ll =================================================================== --- test/CodeGen/X86/reduce-trunc-shl.ll +++ test/CodeGen/X86/reduce-trunc-shl.ll @@ -13,8 +13,8 @@ ; ; AVX2-LABEL: trunc_shl_7_v4i32_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpslld $7, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -292,8 +292,8 @@ ; ; AVX2-LABEL: trunc_v4i64_to_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -565,8 +565,8 @@ ; ; AVX2-LABEL: trunc_v4i64_to_v4i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX2-NEXT: vmovq %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper @@ -695,8 +695,8 @@ ; ; AVX2-LABEL: trunc_v4i64_to_v4i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovd %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -944,8 +944,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00000010: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -961,8 +961,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00000200: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -978,8 +978,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00003000: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = 
shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1096,8 +1096,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00001111: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1221,18 +1221,18 @@ ; ; AVX2-LABEL: shuffle_v8i32_08991abb: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3] -; AVX512VL-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1264,12 +1264,19 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_09ab1def: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_09ab1def: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_09ab1def: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1698,9 +1705,9 @@ ; ; AVX2-LABEL: shuffle_v8i32_6caa87e5: ; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-NEXT: retq ; @@ -1723,8 +1730,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_32103210: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1739,8 +1746,8 @@ ; ; AVX2OR512VL-LABEL: 
shuffle_v8i32_76547654: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1755,8 +1762,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_76543210: ; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2539,8 +2539,8 @@ ; AVX2-LABEL: combine_unneeded_subvector1: ; AVX2: # BB#0: ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> Index: test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- test/CodeGen/X86/vector-trunc-math.ll +++ test/CodeGen/X86/vector-trunc-math.ll @@ -31,8 +31,8 @@ ; AVX2-LABEL: trunc_add_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -94,10 +94,9 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -234,24 +233,21 @@ ; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3 ; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -469,8 +465,8 @@ ; ; AVX2-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -522,10 +518,9 @@ ; ; AVX2-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -636,24 +631,21 @@ ; ; AVX2-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -814,8 +806,8 @@ ; 
AVX2-LABEL: trunc_sub_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -877,10 +869,9 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -1017,24 +1008,21 @@ ; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3 ; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1212,8 +1200,8 @@ ; AVX2-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1280,10 +1268,9 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; 
AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -1421,24 +1408,21 @@ ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 ; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1625,10 +1609,9 @@ ; ; AVX2-LABEL: trunc_mul_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1722,18 +1705,15 @@ ; ; AVX2-LABEL: trunc_mul_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: 
vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -2030,15 +2010,12 @@ ; ; AVX2-LABEL: trunc_mul_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] @@ -2046,15 +2023,11 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -2342,8 +2315,8 @@ ; ; AVX2-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2395,10 +2368,9 @@ ; ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -2615,28 +2587,25 @@ ; ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = 
ymm3[0,2,2,3] +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2832,8 +2801,8 @@ ; AVX2-LABEL: trunc_and_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2891,10 +2860,9 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -3021,24 +2989,21 @@ ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3201,8 +3166,8 @@ ; ; AVX2-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3254,10 +3219,9 @@ ; ; AVX2-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -3368,24 +3332,21 @@ ; ; AVX2-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -3544,8 +3505,8 @@ ; AVX2-LABEL: trunc_xor_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: %xmm0 %xmm0 %ymm0 ; 
AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3603,10 +3564,9 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -3733,24 +3693,21 @@ ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3913,8 +3870,8 @@ ; ; AVX2-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3966,10 +3923,9 @@ ; ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -4080,24 +4036,21 @@ ; ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} 
ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
@@ -4256,8 +4209,8 @@
 ; AVX2-LABEL: trunc_or_v4i64_v4i32:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: # kill: %xmm0 %xmm0 %ymm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -4315,10 +4268,9 @@
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -4445,24 +4397,21 @@
 ; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
 ; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -4625,8 +4574,8 @@
 ;
 ; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -4678,10 +4627,9 @@
 ;
 ; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -4792,24 +4740,21 @@
 ;
 ; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -28,10 +28,9 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i32:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -105,10 +104,9 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i32_ashr:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,7,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -150,13 +148,10 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i32_lshr:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,7,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32_lshr:
@@ -230,10 +225,9 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i16:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -285,10 +279,9 @@
 ;
 ; AVX2-LABEL: trunc8i64_8i8:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -1370,10 +1363,9 @@
 ;
 ; AVX2-LABEL: trunc2x4i64_8i32:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -1476,10 +1468,9 @@
 ;
 ; AVX2-LABEL: trunc2x4i64_8i16:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0