Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -2237,6 +2237,12 @@ return Mask; } + /// Determine if the shuffle mask is a splat, possibly with undefined mask + /// indices as well. Returns true if the same shuffle index is found in all + /// defined elements and optionally returns the splat index. Returns false + /// if the mask is not a splat or all mask indices are undefined. + bool isSplat(int *SplatIndex = nullptr) const; + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ShuffleVector; Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -516,6 +516,10 @@ SmallVector<int, 16> Mask = Shuffle->getShuffleMask(); if (NumVecElems == Mask.size()) { + int BroadcastIndex = -1; + if (Shuffle->isSplat(&BroadcastIndex) && BroadcastIndex == 0) + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, + VecTypOp0, 0, nullptr); if (isReverseVectorMask(Mask)) return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0, nullptr); Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -4885,18 +4885,6 @@ return true; } -static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { - SmallVector<int, 16> Mask(SVI->getShuffleMask()); - int SplatElem = -1; - for (unsigned i = 0; i < Mask.size(); ++i) { - if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem) - return false; - SplatElem = Mask[i]; - } - - return true; -} - /// Some targets have expensive vector shifts if the lanes aren't all the same /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). 
In these cases /// it's often worth sinking a shufflevector splat down to its use so that @@ -4910,7 +4898,7 @@ // We only expect better codegen by sinking a shuffle if we can recognise a // constant splat. - if (!isBroadcastShuffle(SVI)) + if (!SVI->isSplat()) return false; // InsertedShuffles - Only insert a shuffle in each block once. Index: lib/IR/Instructions.cpp =================================================================== --- lib/IR/Instructions.cpp +++ lib/IR/Instructions.cpp @@ -1894,6 +1894,23 @@ } } +bool ShuffleVectorInst::isSplat(int *SplatIndex /* = nullptr */) const { + SmallVector<int, 16> Mask; + getShuffleMask(Mask); + + int SplatElem = -1; + for (int M : Mask) { + if (M < 0) + continue; + if (0 <= SplatElem && SplatElem != M) + return false; + SplatElem = M; + } + + if (0 <= SplatElem && SplatIndex) + *SplatIndex = SplatElem; + return (0 <= SplatElem); +} //===----------------------------------------------------------------------===// // InsertValueInst Class Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -598,11 +598,18 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) { + if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate || + Kind == TTI::SK_Broadcast) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + // For Broadcasts we are splatting the first element from the first input + // register, so only need to reference that input and all the output + // registers are the same. 
+ if (Kind == TTI::SK_Broadcast) + LT.first = 1; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb @@ -614,10 +621,13 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 - // + 2*pshufb + vinserti64x4 + { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 + // + 2*pshufb + vinserti64x4 }; if (ST->hasBWI()) @@ -626,10 +636,15 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX512ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd + { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd + + { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v16i32, 1 } // vpermd }; if (ST->hasAVX512()) @@ -638,6 +653,13 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, 1 }, // 
vpbroadcastb + { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq @@ -654,6 +676,13 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 + { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 + { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd @@ -689,6 +718,9 @@ return LT.first * Entry->Cost; static const CostTblEntry SSSE3ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb + { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb @@ -701,6 +733,12 @@ return LT.first * Entry->Cost; static const CostTblEntry SSE2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd + { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd @@ -720,6 +758,7 @@ return LT.first * Entry->Cost; static const CostTblEntry SSE1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps }; Index: test/Analysis/CostModel/X86/shuffle-broadcast.ll 
=================================================================== --- test/Analysis/CostModel/X86/shuffle-broadcast.ll +++ test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -18,14 +18,150 @@ %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V256 = shufflevector - ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V512 = shufflevector - ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer ret void } + +; CHECK-LABEL: 'test_vXi64' +define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x 
float> %src256, <16 x float> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer + + ; 
SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { + ; SSE2: cost of 2 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 3 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 3 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SSE2: cost of 3 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> 
%src128, <16 x i8> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer + + ret void +}