Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -5394,8 +5394,18 @@
     IsUnary = true;
     break;
   case X86ISD::VBROADCAST: {
-    // We only decode broadcasts of same-sized vectors at the moment.
-    if (N->getOperand(0).getValueType() == VT) {
+    SDValue N0 = N->getOperand(0);
+    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+    // add the pre-extracted value to the Ops vector.
+    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N0.getOperand(0).getValueType() == VT &&
+        N0.getConstantOperandVal(1) == 0)
+      Ops.push_back(N0.getOperand(0));
+
+    // We only decode broadcasts of same-sized vectors, unless the broadcast
+    // came from an extract from the original width. If we found one, we
+    // pushed it the Ops vector above.
+    if (N0.getValueType() == VT || !Ops.empty()) {
       DecodeVectorBroadcast(VT, Mask);
       IsUnary = true;
       break;
@@ -9729,6 +9739,12 @@
     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
   }
 
+  // We only support broadcasting from 128-bit vectors to minimize the
+  // number of patterns we need to deal with in isel. So extract down to
+  // 128-bits.
+  if (SrcVT.getSizeInBits() > 128)
+    V = extract128BitVector(V, 0, DAG, DL);
+
   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
 }
 
Index: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
@@ -714,8 +714,7 @@
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ; KNL_64-LABEL: test14:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
 ; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; KNL_64-NEXT:    vmovd %esi, %xmm1
 ; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
@@ -731,8 +730,7 @@
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32:       # BB#0:
-; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
@@ -742,8 +740,7 @@
 ;
 ; SKX-LABEL: test14:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
 ; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; SKX-NEXT:    vpbroadcastd %esi, %ymm1
 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
@@ -758,8 +755,7 @@
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32:       # BB#0:
-; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -126,7 +126,6 @@
 ;
 ; KNL64-LABEL: expand3:
 ; KNL64:       # BB#0:
-; KNL64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL64-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; KNL64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL64-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
@@ -142,7 +141,6 @@
 ;
 ; KNL32-LABEL: expand3:
 ; KNL32:       # BB#0:
-; KNL32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL32-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; KNL32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL32-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
Index: llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
+++ llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
@@ -51,14 +51,12 @@
 ;
 ; AVX2-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -131,14 +129,12 @@
 ;
 ; AVX2-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
@@ -242,14 +238,12 @@
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -272,14 +266,12 @@
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -442,14 +434,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX512-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -472,14 +462,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -502,14 +490,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr