Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -14600,7 +14600,11 @@
       if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
           !is128BitUnpackShuffleMask(HalfMask))
         return SDValue();
-      if (EltWidth == 64)
+      // If this is a unary shuffle (assume that the 2nd operand is
+      // canonicalized to undef), then we can use vpermpd. Otherwise, we
+      // are better off extracting the upper half of 1 operand and using a
+      // narrow shuffle.
+      if (EltWidth == 64 && V2.isUndef())
         return SDValue();
     }
     // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
Index: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -3981,10 +3981,9 @@
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,6,2,6]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -3994,11 +3993,11 @@
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [0,6,2,6]
-; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vcmpeqpd %xmm0, %xmm2, %k1
-; CHECK-NEXT:    vblendmpd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4011,11 +4010,10 @@
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [0,6,2,6]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm3
-; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vcmpeqpd %xmm0, %xmm1, %k1
-; CHECK-NEXT:    vmovapd %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4367,11 +4365,8 @@
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [1,6,3,6]
-; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd (%rdi), %xmm0
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4380,13 +4375,10 @@
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,3,6]
-; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT:    vmovapd %xmm3, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd (%rdi), %xmm2
+; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4398,13 +4390,10 @@
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %ymm1
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [1,6,3,6]
-; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm2
-; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
-; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    vmovapd (%rdi), %xmm1
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
Index: llvm/trunk/test/CodeGen/X86/pr34592.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr34592.ll
+++ llvm/trunk/test/CodeGen/X86/pr34592.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    andq $-32, %rsp
-; CHECK-NEXT:    subq $352, %rsp # imm = 0x160
+; CHECK-NEXT:    subq $320, %rsp # imm = 0x140
 ; CHECK-NEXT:    vmovaps 240(%rbp), %ymm8
 ; CHECK-NEXT:    vmovaps 208(%rbp), %ymm9
 ; CHECK-NEXT:    vmovaps 176(%rbp), %ymm10
@@ -24,8 +24,6 @@
 ; CHECK-NEXT:    vmovdqa %xmm6, %xmm9
 ; CHECK-NEXT:    # kill: def $ymm9 killed $xmm9
 ; CHECK-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    # implicit-def: $ymm0
 ; CHECK-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
@@ -34,9 +32,10 @@
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm6
 ; CHECK-NEXT:    # implicit-def: $ymm2
 ; CHECK-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm2
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
+; CHECK-NEXT:    vextracti128 $1, %ymm7, %xmm6
+; CHECK-NEXT:    vmovq {{.*#+}} xmm6 = xmm6[0],zero
+; CHECK-NEXT:    # implicit-def: $ymm11
+; CHECK-NEXT:    vmovaps %xmm6, %xmm11
 ; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
 ; CHECK-NEXT:    vmovaps %xmm7, %xmm6
 ; CHECK-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
Index: llvm/trunk/test/CodeGen/X86/trunc-subvector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/trunc-subvector.ll
+++ llvm/trunk/test/CodeGen/X86/trunc-subvector.ll
@@ -108,11 +108,9 @@
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -228,11 +226,9 @@
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1904,62 +1904,22 @@
   ret <8 x i32> %b
 }
 
-; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
-
 define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
-; AVX1-LABEL: unpckh_v4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: unpckh_v4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: unpckh_v4i64:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: unpckh_v4i64:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,7,3,7]
-; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
-; AVX512VL-FAST-NEXT:    retq
+; ALL-LABEL: unpckh_v4i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT:    retq
   %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32>
   ret <4 x i64> %unpckh
 }
 
-; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
-
 define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
-; AVX1-LABEL: unpckh_v4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: unpckh_v4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: unpckh_v4f64:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: unpckh_v4f64:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [1,7,3,7]
-; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
-; AVX512VL-FAST-NEXT:    retq
+; ALL-LABEL: unpckh_v4f64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT:    retq
   %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32>
   ret <4 x double> %unpckh
 }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2290,10 +2290,9 @@
 define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
 ; ALL-LABEL: test_v8i64_2_5:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32>
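
Note (illustration only, not part of the patch): the new V2.isUndef() check keeps the single vpermpd path only for unary shuffles; a two-input v4f64/v4i64 shuffle that defines just the low 128 bits of its result is now narrowed instead. A minimal sketch of the affected shape, assuming an AVX2 or AVX512VL target (the function name is hypothetical; it mirrors the committed unpckh_v4f64 test above):

  define <4 x double> @unpckh_low_v4f64(<4 x double> %x, <4 x double> %y) {
    ; only the low 128 bits of the result are used; the high half is undef
    %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
    ret <4 x double> %r
  }

With this change the expected codegen is vextractf128 + a 128-bit vunpckhpd, rather than a 256-bit vunpckhpd + vpermpd (or a vpermt2pd that needs a constant-pool index vector), per the updated checks in vector-shuffle-256-v4.ll.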