Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -8208,10 +8208,27 @@ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); - } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { - // We can't broadcast from a vector register without AVX2, and we can only - // broadcast from the zero-element of a vector register. + } else if (!Subtarget->hasAVX2()) { + // We can't broadcast from a vector register without AVX2. return SDValue(); + } else if (BroadcastIdx != 0) { + // We can only broadcast from the zero-element of a vector register, + // but it can be advantageous to broadcast from the zero-element of a + // subvector. + if (!VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. 
+ if (VT == MVT::v4f64 || VT == MVT::v4i64) + return SDValue(); + + unsigned EltSize = VT.getScalarSizeInBits(); + if (((BroadcastIdx * EltSize) % 128) != 0) + return SDValue(); + + MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize); + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, + DAG.getIntPtrConstant(BroadcastIdx, DL)); } V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -2904,8 +2904,8 @@ ; ; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] @@ -3293,8 +3293,7 @@ ; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2006,8 +2006,7 @@ ; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpbroadcastb 
%xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ret <32 x i8> %shuffle Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -851,8 +851,8 @@ ; ; AVX2-LABEL: shuffle_v8f32_44444444: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ret <8 x float> %shuffle @@ -2021,8 +2021,8 @@ ; ; AVX2-LABEL: shuffle_v8i32_44444444: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ret <8 x i32> %shuffle Index: test/CodeGen/X86/vector-shuffle-512-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v16.ll +++ test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -4,6 +4,25 @@ target triple = "x86_64-unknown-unknown" +define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) { +; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x float> %shuffle +} + +define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) { +; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; ALL: # BB#0: +; 
ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x float> %shuffle +} + define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d: ; ALL: # BB#0: @@ -70,6 +89,25 @@ ret <16 x float> %shuffle } +define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) { +; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i32> %shuffle +} + +define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) { +; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: +; ALL: # BB#0: +; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> + ret <16 x i32> %shuffle +} + define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f: ; ALL: # BB#0: Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -3,6 +3,25 @@ target triple = "x86_64-unknown-unknown" +define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) { +; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; ALL: # 
BB#0: +; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 +; ALL-NEXT: retq + %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer + ret <32 x i16> %c +} + +define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) { +; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; ALL: # BB#0: +; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 +; ALL-NEXT: retq + %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <32 x i16> %c +} + define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) { ; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: ; ALL: # BB#0: Index: test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v8.ll +++ test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -18,6 +18,38 @@ ret <8 x double> %shuffle } +define <8 x double> @shuffle_v8f64_22222222(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_22222222: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm0 +; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_22222222: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vextractf32x4 $1, %zmm0, %xmm0 +; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_44444444: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512F-NEXT: retq +; +; 
AVX512F-32-LABEL: shuffle_v8f64_44444444: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> + ret <8 x double> %shuffle +} + define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00000010: ; AVX512F: # BB#0: @@ -994,6 +1026,38 @@ ret <8 x i64> %shuffle } +define <8 x i64> @shuffle_v8i64_44444444(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_44444444: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_44444444: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_66666666: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_66666666: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> + ret <8 x i64> %shuffle +} + define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { ; ; AVX512F-LABEL: shuffle_v8i64_00000010: @@ -2102,7 +2166,7 @@ ; AVX512F-32-LABEL: test_vshuff64x2_512_maskz: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-32-NEXT: vpsllvq .LCPI122_0, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpsllvq .LCPI126_0, %zmm2, %zmm2 ; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = 
zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-32-NEXT: retl @@ -2123,7 +2187,7 @@ ; AVX512F-32-LABEL: test_vshufi64x2_512_mask: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-32-NEXT: vpsllvq .LCPI123_0, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpsllvq .LCPI127_0, %zmm2, %zmm2 ; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-32-NEXT: retl @@ -2160,7 +2224,7 @@ ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-32-NEXT: vpsllvq .LCPI125_0, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsllvq .LCPI129_0, %zmm1, %zmm1 ; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] @@ -2183,7 +2247,7 @@ ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-32-NEXT: vpsllvq .LCPI126_0, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsllvq .LCPI130_0, %zmm1, %zmm1 ; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -166,8 +166,8 @@ ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2 -; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z} @@ -178,8 +178,8 @@ ; VL_BW_DQ: # BB#0: ; VL_BW_DQ-NEXT: kmovb %edi, %k0 ; 
VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 -; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0 +; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0