Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -12158,13 +12158,14 @@ // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsV1Zero && !IsV2Zero) { + // With AVX2, use VPERMQ/VPERMPD to allow memory folding. + if (Subtarget.hasAVX2() && V2.isUndef()) + return SDValue(); + // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { - // With AVX2, use VPERMQ/VPERMPD to allow memory folding. - if (Subtarget.hasAVX2() && V2.isUndef()) - return SDValue(); // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -228,10 +228,15 @@ } define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu67uu67: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu67uu67: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu67uu67: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -258,10 +263,15 @@ } define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu674567: -; ALL: # BB#0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu674567: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu674567: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle Index: test/CodeGen/X86/avx512-shuffles/partial_permute.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -2115,10 +2115,10 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; CHECK-NEXT: movb $2, %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %res = select <4 x i1> , <4 x i64> %shuf, <4 x i64> %vec2 @@ -2130,10 +2130,9 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; CHECK-NEXT: movb $2, %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %res = select <4 x i1> , <4 x i64> %shuf, <4 x i64> zeroinitializer @@ -3849,7 +3848,7 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> ret <4 x double> %res @@ -3859,10 +3858,10 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; CHECK-NEXT: movb $4, %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %vec2 @@ -3874,10 +3873,9 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; CHECK-NEXT: movb $4, %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> zeroinitializer Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -722,21 +722,33 @@ } define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_76547654: -; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_76547654: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_76547654: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_76543210: -; ALL: # BB#0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_76543210: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_76543210: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1809,11 +1821,17 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_76547654: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_76547654: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_76547654: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1825,11 +1843,17 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_76543210: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_76543210: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_76543210: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle }