Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7287,6 +7287,23 @@
   // Check if this is a broadcast of a scalar. We special case lowering
   // for scalars so that we can more effectively fold with loads.
+  // First, look through bitcast: if the original value has a larger element
+  // type than the shuffle, the broadcast element is in essence truncated.
+  // Make that explicit to ease folding.
+  if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue V0 = V.getOperand(0);
+    EVT V0VT = V0.getValueType();
+
+    if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) &&
+        (V0.getOpcode() == ISD::BUILD_VECTOR ||
+         (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
+      V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx));
+      BroadcastIdx = 0;
+    }
+  }
+
+  // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8337,6 +8337,13 @@
   def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
             (VBROADCASTSDYrr VR128:$src)>;
 
+  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+  // This means we'll encounter truncated i32 loads; match that here.
+  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWYrm addr:$src)>;
+
   // Provide aliases for broadcast from the same register class that
   // automatically does the extract.
   def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
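For illustration, here is the 128-bit word case from the test below as a standalone reproducer. The first two instructions are taken verbatim from the test's context lines; the function signature and the bitcast/shufflevector tail are assumptions reconstructed from the CHECK lines. With the patch applied, the whole sequence should fold to a single broadcast load:

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s
; CHECK: vpbroadcastw (%rdi), %xmm0
define <8 x i16> @test_128_vpbroadcastw_i32(i32* %arg) {
  ; Broadcast the low 16 bits of a loaded i32. The insertelement/bitcast
  ; pair is where the new bitcast look-through kicks in: the lowering
  ; rewrites it as an explicit i16 truncate of the loaded value, which
  ; the new VPBROADCASTWrm pattern can then match.
  %tmp = load i32, i32* %arg, align 4
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
  %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp3
}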
Index: test/CodeGen/X86/vector-shuffle-pbroadcast-load.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-pbroadcast-load.ll
+++ test/CodeGen/X86/vector-shuffle-pbroadcast-load.ll
@@ -67,8 +67,7 @@
 ;
 ; AVX2-LABEL: test_256_vpbroadcastw_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -147,8 +146,7 @@
 ;
 ; AVX2-LABEL: test_256_vpbroadcastb_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -189,9 +187,7 @@
 ;
 ; AVX2-LABEL: test_256_vpbroadcastb:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    movsbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i8, i8* %arg, align 1
   %tmp1 = sext i8 %tmp to i32
@@ -264,8 +260,7 @@
 ;
 ; AVX2-LABEL: test_128_vpbroadcastw_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -338,8 +333,7 @@
 ;
 ; AVX2-LABEL: test_128_vpbroadcastb_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -377,9 +371,7 @@
 ;
 ; AVX2-LABEL: test_128_vpbroadcastb:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    movsbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i8, i8* %arg, align 1
   %tmp1 = sext i8 %tmp to i32
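Note that the byte-broadcast improvements in test_256_vpbroadcastb and test_128_vpbroadcastb need no new .td pattern: truncating (sext i8 x to i32) back to i8 yields x itself, so once the lowering makes the truncate explicit, the broadcast presumably folds into the plain i8 load via the existing VPBROADCASTB load patterns. A standalone sketch of the 128-bit case (again, only the first two instructions appear verbatim in the context lines above; the rest is a reconstruction):

define <16 x i8> @test_128_vpbroadcastb(i8* %arg) {
  ; The sign extension is dead once the broadcast is known to keep only
  ; the low 8 bits of each element, so per the updated CHECK lines this
  ; should now compile to:
  ;   vpbroadcastb (%rdi), %xmm0
  %tmp = load i8, i8* %arg, align 1
  %tmp1 = sext i8 %tmp to i32
  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp4
}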