Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7895,10 +7895,36 @@
                                             "a sorted mask where the broadcast "
                                             "comes from V1.");
 
-  // Check if this is a broadcast of a scalar. We special case lowering for
-  // scalars so that we can more effectively fold with loads.
+  // Go up the chain of (vector) values to try and find a scalar load that
+  // we can combine with the broadcast.
+  while (true) {
+    if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+      int OperandSize = Mask.size() / V.getNumOperands();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+    } else if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      SDValue SDIdx = V.getOperand(2);
+      if (!isa<ConstantSDNode>(SDIdx))
+        break;
+      int Idx = (int)cast<ConstantSDNode>(SDIdx)->getZExtValue();
+      if (BroadcastIdx >= Idx &&
+          BroadcastIdx <
+              Idx + (int)VInner.getValueType().getVectorNumElements()) {
+        BroadcastIdx -= Idx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+    } else {
+      break;
+    }
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
-      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
 
     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
Index: test/CodeGen/X86/vec_shuf-concat.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vec_shuf-concat.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2
+
+; These tests check that a vbroadcast instruction is used for a shufflevector
+; splat. The first two functions check that a memory-to-register vbroadcast
+; is used for a load/splat pair (single and double precision). This form of
+; the instruction is available on both AVX and AVX2. The register-to-register
+; vbroadcast, however, is not available with AVX. The last two functions
+; check that a splat of a register is lowered to a vbroadcast only when AVX2
+; is supported.
+
+define <8 x float> @loadSplat4x(float* %p) {
+  %1 = load float* %p
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+
+; AVX: loadSplat4x
+; AVX: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: ret
+; AVX2: loadSplat4x
+; AVX2: vbroadcastss (%rdi), %ymm0
+; AVX2-NEXT: ret
+}
+
+define <4 x double> @loadSplat8x(double* %p) {
+  %1 = load double* %p
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %3
+
+; AVX: loadSplat8x
+; AVX: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: ret
+; AVX2: loadSplat8x
+; AVX2: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: ret
+}
+
+define <8 x float> @splat4x(<4 x float> %r) {
+  %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %1
+
+; AVX: splat4x
+; AVX-NOT: vbroadcast
+; AVX: ret
+; AVX2: splat4x
+; AVX2: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: ret
+}
+
+define <4 x double> @splat8x(<2 x double> %r) {
+  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %1
+
+; AVX: splat8x
+; AVX-NOT: vbroadcast
+; AVX: ret
+; AVX2: splat8x
+; AVX2: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: ret
+}