Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7895,10 +7895,36 @@
                                             "a sorted mask where the broadcast "
                                             "comes from V1.");
 
-  // Check if this is a broadcast of a scalar. We special case lowering for
-  // scalars so that we can more effectively fold with loads.
+  // Go up the chain of (vector) values to try and find a scalar load that
+  // we can combine with the broadcast.
+  while (true) {
+    if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+      int OperandSize = Mask.size() / V.getNumOperands();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+    } else if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      SDValue SDIdx = V.getOperand(2);
+      if (!isa<ConstantSDNode>(SDIdx))
+        break;
+      int Idx = (int)cast<ConstantSDNode>(SDIdx)->getZExtValue();
+      if (BroadcastIdx >= Idx &&
+          BroadcastIdx <
+              Idx + (int)VInner.getValueType().getVectorNumElements()) {
+        BroadcastIdx -= Idx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+    } else {
+      break;
+    }
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
-      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
 
     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
Index: test/CodeGen/X86/vec_shuf-concat.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec_shuf-concat.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2
+
+; These tests check that a vbroadcast instruction is used for a shufflevector
+; splat. The first two functions check that a memory-to-register vbroadcast
+; is used for a load/splat pair (single and double precision). This form of
+; the instruction is available on both AVX and AVX2. The register-to-register
+; vbroadcast, however, is not available with AVX. The last two functions
+; check that a splat of a register is lowered to a vbroadcast only when AVX2
+; is supported.
+
+define <8 x float> @loadSplat4x(float* %p) {
+  %1 = load float* %p
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+
+; AVX: loadSplat4x
+; AVX: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: ret
+; AVX2: loadSplat4x
+; AVX2: vbroadcastss (%rdi), %ymm0
+; AVX2-NEXT: ret
+}
+
+define <4 x double> @loadSplat8x(double* %p) {
+  %1 = load double* %p
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %3
+
+; AVX: loadSplat8x
+; AVX: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: ret
+; AVX2: loadSplat8x
+; AVX2: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: ret
+}
+
+define <8 x float> @splat4x(<4 x float> %r) {
+  %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %1
+
+; AVX: splat4x
+; AVX-NOT: vbroadcast
+; AVX: ret
+; AVX2: splat4x
+; AVX2: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: ret
+}
+
+define <4 x double> @splat8x(<2 x double> %r) {
+  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %1
+
+; AVX: splat8x
+; AVX-NOT: vbroadcast
+; AVX: ret
+; AVX2: splat8x
+; AVX2: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: ret
+}