diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39978,6 +39978,12 @@
     }
   }
 
+  // For broadcasts, unless we *only* demand the 0'th element,
+  // stop attempts at simplification here, we aren't going to improve things,
+  // this is better than any potential shuffle.
+  if (isTargetShuffleSplat(Op) && !DemandedElts.isOneValue())
+    return false;
+
   // Get target/faux shuffle mask.
   APInt OpUndef, OpZero;
   SmallVector<int, 64> OpMask;
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -284,7 +284,7 @@
 define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -752,9 +752,9 @@
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm4
-; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %xmm5
-; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm5
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -645,8 +645,7 @@
 ; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
 ; X86-AVX1:       ## %bb.0: ## %entry
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vpermilps $0, (%eax), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x08,0x00]
-; X86-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08]
 ; X86-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -669,8 +668,7 @@
 ;
 ; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
 ; X64-AVX1:       ## %bb.0: ## %entry
-; X64-AVX1-NEXT:    vpermilps $0, (%rdi), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x00]
-; X64-AVX1-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f]
 ; X64-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]