Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7895,10 +7895,42 @@
          "a sorted mask where the broadcast "
          "comes from V1.");
 
-  // Check if this is a broadcast of a scalar. We special case lowering for
-  // scalars so that we can more effectively fold with loads.
+  // Go up the chain of (vector) values to try and find a scalar load that
+  // we can combine with the broadcast.
+  for (;;) {
+    switch (V.getOpcode()) {
+    case ISD::CONCAT_VECTORS: {
+      int OperandSize = Mask.size() / V.getNumOperands();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+      continue;
+    }
+
+    case ISD::INSERT_SUBVECTOR: {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+      if (!ConstantIdx)
+        break;
+
+      int BeginIdx = (int)ConstantIdx->getZExtValue();
+      int EndIdx =
+          BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+        BroadcastIdx -= BeginIdx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+      continue;
+    }
+    }
+    break;
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
-      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
 
     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -734,3 +734,29 @@
   %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32>
   ret <4 x i64> %shuffle
 }
+
+define <4 x double> @splat_mem_v4f64_2(double* %p) {
+; ALL-LABEL: splat_mem_v4f64_2:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %1 = load double* %p
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %3
+}
+
+define <4 x double> @splat_v4f64(<2 x double> %r) {
+; AVX1-LABEL: splat_v4f64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4f64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %1
+}
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1579,3 +1579,29 @@
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
   ret <8 x i32> %shuffle
 }
+
+define <8 x float> @splat_mem_v8f32_2(float* %p) {
+; ALL-LABEL: splat_mem_v8f32_2:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastss (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %1 = load float* %p
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @splat_v8f32(<4 x float> %r) {
+; AVX1-LABEL: splat_v8f32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8f32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %1
+}