Index: include/llvm/IR/Instructions.h
===================================================================
--- include/llvm/IR/Instructions.h
+++ include/llvm/IR/Instructions.h
@@ -2237,6 +2237,12 @@
     return Mask;
   }
 
+  /// Determine if the shuffle mask is a splat, possibly with undefined mask
+  /// indices as well. Returns true if the same shuffle index is found in all
+  /// defined elements and optionally returns the splat index. Returns false
+  /// if the mask is not a splat or all mask indices are undefined.
+  bool isSplat(int *SplatIndex = nullptr) const;
+
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static inline bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::ShuffleVector;
Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -495,6 +495,10 @@
       SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
 
       if (NumVecElems == Mask.size()) {
+        int BroadcastIndex = -1;
+        if (Shuffle->isSplat(&BroadcastIndex) && BroadcastIndex == 0)
+          return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                     VecTypOp0, 0, nullptr);
         if (isReverseVectorMask(Mask))
           return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse,
                                      VecTypOp0, 0, nullptr);
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -4389,7 +4389,7 @@
   I->insertAfter(LI);
   // CGP does not check if the zext would be speculatively executed when moved
   // to the same basic block as the load. Preserving its original location would
-  // pessimize the debugging experience, as well as negatively impact the 
+  // pessimize the debugging experience, as well as negatively impact the
   // quality of sample pgo. We don't want to use "line 0" as that has a
   // size cost in the line-table section and logically the zext can be seen as
   // part of the load. Therefore we conservatively reuse the same debug location
@@ -4881,18 +4881,6 @@
   return true;
 }
 
-static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
-  SmallVector<int, 16> Mask(SVI->getShuffleMask());
-  int SplatElem = -1;
-  for (unsigned i = 0; i < Mask.size(); ++i) {
-    if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
-      return false;
-    SplatElem = Mask[i];
-  }
-
-  return true;
-}
-
 /// Some targets have expensive vector shifts if the lanes aren't all the same
 /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
 /// it's often worth sinking a shufflevector splat down to its use so that
@@ -4906,7 +4894,7 @@
 
   // We only expect better codegen by sinking a shuffle if we can recognise a
   // constant splat.
-  if (!isBroadcastShuffle(SVI))
+  if (!SVI->isSplat())
     return false;
 
   // InsertedShuffles - Only insert a shuffle in each block once.
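The new accessor is meant to subsume ad-hoc splat checks like the isBroadcastShuffle() helper removed above. As a sketch of the intended call pattern, mirroring the CostModel.cpp hunk (the wrapper function below is hypothetical and not part of the patch):

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: true only for a splat of element 0, the one shuffle
// kind the CostModel.cpp change maps onto TTI::SK_Broadcast. Note that
// isSplat() skips undef mask indices, so <0, undef, 0, 0> still counts as a
// splat of element 0, while an all-undef mask is rejected.
static bool isZeroEltBroadcast(const ShuffleVectorInst *SVI) {
  int SplatIndex = -1;
  return SVI->isSplat(&SplatIndex) && SplatIndex == 0;
}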
Index: lib/IR/Instructions.cpp
===================================================================
--- lib/IR/Instructions.cpp
+++ lib/IR/Instructions.cpp
@@ -258,7 +258,7 @@
          "Calling a function with bad signature!");
 
   for (unsigned i = 0; i != Args.size(); ++i)
-    assert((i >= FTy->getNumParams() || 
+    assert((i >= FTy->getNumParams() ||
             FTy->getParamType(i) == Args[i]->getType()) &&
            "Calling a function with a bad signature!");
 #endif
@@ -340,7 +340,7 @@
   if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
       Index)
     return getArgOperand(Index-1);
-      
+
   return nullptr;
 }
@@ -424,7 +424,7 @@
   assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
          "createMalloc needs either InsertBefore or InsertAtEnd");
 
-  // malloc(type) becomes: 
+  // malloc(type) becomes:
   //       bitcast (i8* malloc(typeSize)) to type*
   // malloc(type, arraySize) becomes:
   //       bitcast (i8* malloc(typeSize*arraySize)) to type*
@@ -531,7 +531,7 @@
 /// responsibility of the caller.
 Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
                                     Type *IntPtrTy, Type *AllocTy,
-                                    Value *AllocSize, Value *ArraySize, 
+                                    Value *AllocSize, Value *ArraySize,
                                     Function *MallocF, const Twine &Name) {
   return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
                       ArraySize, None, MallocF, Name);
@@ -627,7 +627,7 @@
          "Invoking a function with bad signature");
 
   for (unsigned i = 0, e = Args.size(); i != e; i++)
-    assert((i >= FTy->getNumParams() || 
+    assert((i >= FTy->getNumParams() ||
             FTy->getParamType(i) == Args[i]->getType()) &&
            "Invoking a function with a bad signature!");
 #endif
@@ -687,7 +687,7 @@
   if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
       Index)
     return getArgOperand(Index-1);
-      
+
   return nullptr;
 }
@@ -1075,7 +1075,7 @@
 //                      UnreachableInst Implementation
 //===----------------------------------------------------------------------===//
 
-UnreachableInst::UnreachableInst(LLVMContext &Context, 
+UnreachableInst::UnreachableInst(LLVMContext &Context,
                                  Instruction *InsertBefore)
   : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
                    nullptr, 0, InsertBefore) {
@@ -1258,7 +1258,7 @@
 bool AllocaInst::isStaticAlloca() const {
   // Must be constant size.
   if (!isa<ConstantInt>(getArraySize())) return false;
-  
+
   // Must be in the entry block.
   const BasicBlock *Parent = getParent();
   return Parent == &Parent->getParent()->front() && !isUsedWithInAlloca();
@@ -1311,7 +1311,7 @@
   setName(Name);
 }
 
-LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile, 
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
                    unsigned Align, AtomicOrdering Order,
                    SynchronizationScope SynchScope,
                    BasicBlock *InsertAE)
@@ -1568,7 +1568,7 @@
 //                       FenceInst Implementation
 //===----------------------------------------------------------------------===//
 
-FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, 
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SynchronizationScope SynchScope,
                      Instruction *InsertBefore)
   : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
@@ -1576,7 +1576,7 @@
   setSynchScope(SynchScope);
 }
 
-FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, 
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SynchronizationScope SynchScope,
                      BasicBlock *InsertAtEnd)
   : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertAtEnd) {
@@ -1767,14 +1767,14 @@
   setName(Name);
 }
 
-bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt, 
+bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
                                         const Value *Index) {
   if (!Vec->getType()->isVectorTy())
     return false;   // First operand of insertelement must be vector type.
-  
+
   if (Elt->getType() != cast<VectorType>(Vec->getType())->getElementType())
     return false;// Second operand of insertelement must be vector element type.
-    
+
   if (!Index->getType()->isIntegerTy())
     return false;  // Third operand of insertelement must be i32.
   return true;
@@ -1825,7 +1825,7 @@
   // V1 and V2 must be vectors of the same type.
   if (!V1->getType()->isVectorTy() || V1->getType() != V2->getType())
     return false;
-  
+
   // Mask must be vector of i32.
   VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
   if (!MaskTy || !MaskTy->getElementType()->isIntegerTy(32))
@@ -1847,7 +1847,7 @@
     }
     return true;
   }
-  
+
   if (const ConstantDataSequential *CDS =
         dyn_cast<ConstantDataSequential>(Mask)) {
     unsigned V1Size = cast<VectorType>(V1->getType())->getNumElements();
@@ -1856,7 +1856,7 @@
         return false;
     return true;
   }
-  
+
   // The bitcode reader can create a place holder for a forward reference
   // used as the shuffle mask. When this occurs, the shuffle mask will
   // fall into this case and fail. To avoid this error, do this bit of
@@ -1881,12 +1881,12 @@
 void ShuffleVectorInst::getShuffleMask(Constant *Mask,
                                        SmallVectorImpl<int> &Result) {
   unsigned NumElts = Mask->getType()->getVectorNumElements();
-  
+
   if (ConstantDataSequential *CDS=dyn_cast<ConstantDataSequential>(Mask)) {
     for (unsigned i = 0; i != NumElts; ++i)
       Result.push_back(CDS->getElementAsInteger(i));
     return;
-  } 
+  }
   for (unsigned i = 0; i != NumElts; ++i) {
     Constant *C = Mask->getAggregateElement(i);
     Result.push_back(isa<UndefValue>(C) ? -1 :
@@ -1894,12 +1894,29 @@
   }
 }
 
+bool ShuffleVectorInst::isSplat(int *SplatIndex /* = nullptr */) const {
+  SmallVector<int, 16> Mask;
+  getShuffleMask(Mask);
+
+  int SplatElem = -1;
+  for (int M : Mask) {
+    if (M < 0)
+      continue;
+    if (0 <= SplatElem && SplatElem != M)
+      return false;
+    SplatElem = M;
+  }
+
+  if (0 <= SplatElem && SplatIndex)
+    *SplatIndex = SplatElem;
+  return (0 <= SplatElem);
+}
+
 //===----------------------------------------------------------------------===//
 //                             InsertValueInst Class
 //===----------------------------------------------------------------------===//
 
-void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, 
+void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
                            const Twine &Name) {
   assert(getNumOperands() == 2 && "NumOperands not initialized?");
@@ -1996,7 +2013,7 @@
   setName(Name);
 }
 
-BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, 
+BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
                                Type *Ty, const Twine &Name,
                                BasicBlock *InsertAtEnd)
   : Instruction(Ty, iType,
@@ -2032,11 +2049,11 @@
            "Tried to create a floating-point operation on a "
            "non-floating-point type!");
     break;
-  case UDiv: 
-  case SDiv: 
+  case UDiv:
+  case SDiv:
    assert(getType() == LHS->getType() &&
           "Arithmetic operation should return same type as operands!");
-    assert((getType()->isIntegerTy() || (getType()->isVectorTy() && 
+    assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
           "Incorrect operand type (not integer) for S/UDIV");
     break;
@@ -2046,11 +2063,11 @@
     assert(getType()->isFPOrFPVectorTy() &&
            "Incorrect operand type (not floating point) for FDIV");
     break;
-  case URem: 
-  case SRem: 
+  case URem:
+  case SRem:
    assert(getType() == LHS->getType() &&
           "Arithmetic operation should return same type as operands!");
-    assert((getType()->isIntegerTy() || (getType()->isVectorTy() && 
+    assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
           "Incorrect operand type (not integer) for S/UREM");
     break;
@@ -2066,7 +2083,7 @@
     assert(getType() == LHS->getType() &&
            "Shift operation should return same type as operands!");
     assert((getType()->isIntegerTy() ||
-            (getType()->isVectorTy() && 
+            (getType()->isVectorTy() &&
              cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
            "Tried to create a shift operation on a non-integral type!");
     break;
@@ -2075,7 +2092,7 @@
     assert(getType() == LHS->getType() &&
            "Logical operation should return same type as operands!");
     assert((getType()->isIntegerTy() ||
-            (getType()->isVectorTy() && 
+            (getType()->isVectorTy() &&
              cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
            "Tried to create a logical operation on a non-integral type!");
     break;
@@ -2292,7 +2309,7 @@
   Type *DstTy = getType();
   if (SrcTy == DstTy)
     return true;
-  
+
   // Pointer to pointer is always lossless.
   if (SrcTy->isPointerTy())
     return DstTy->isPointerTy();
@@ -2301,10 +2318,10 @@
 
 /// This function determines if the CastInst does not require any bits to be
 /// changed in order to effect the cast. Essentially, it identifies cases where
-/// no code gen is necessary for the cast, hence the name no-op cast. For 
+/// no code gen is necessary for the cast, hence the name no-op cast. For
 /// example, the following are all no-op casts:
 /// # bitcast i32* %x to i8*
-/// # bitcast <2 x i32> %x to <4 x i16> 
+/// # bitcast <2 x i32> %x to <4 x i16>
 /// # ptrtoint i32* %x to i32     ; on 32-bit plaforms only
 /// @brief Determine if the described cast is a no-op.
 bool CastInst::isNoopCast(Instruction::CastOps Opcode,
@@ -2315,7 +2332,7 @@
     default: llvm_unreachable("Invalid CastOp");
     case Instruction::Trunc:
     case Instruction::ZExt:
-    case Instruction::SExt: 
+    case Instruction::SExt:
     case Instruction::FPTrunc:
     case Instruction::FPExt:
     case Instruction::UIToFP:
@@ -2368,7 +2385,7 @@
                                         Type *DstIntPtrTy) {
   // Define the 144 possibilities for these two cast instructions. The values
   // in this matrix determine what to do in a given situation and select the
-  // case in the switch below.  The rows correspond to firstOp, the columns 
+  // case in the switch below.  The rows correspond to firstOp, the columns
   // correspond to secondOp.  In looking at the table below, keep in mind
   // the following cast properties:
   //
@@ -2436,16 +2453,16 @@
   int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin]
                             [secondOp-Instruction::CastOpsBegin];
   switch (ElimCase) {
-    case 0: 
+    case 0:
       // Categorically disallowed.
       return 0;
-    case 1: 
+    case 1:
       // Allowed, use first cast's opcode.
       return firstOp;
-    case 2: 
+    case 2:
       // Allowed, use second cast's opcode.
       return secondOp;
-    case 3: 
+    case 3:
       // No-op cast in second op implies firstOp as long as the DestTy
       // is integer and we are not converting between a vector and a
      // non-vector type.
@@ -2458,7 +2475,7 @@
      if (DstTy->isFloatingPointTy())
        return firstOp;
      return 0;
-    case 5: 
+    case 5:
      // No-op cast in first op implies secondOp as long as the SrcTy
      // is an integer.
      if (SrcTy->isIntegerTy())
@@ -2578,7 +2595,7 @@
     case 17:
       // (sitofp (zext x)) -> (uitofp x)
       return Instruction::UIToFP;
-    case 99: 
+    case 99:
       // Cast combination can't happen (error in input). This is for all cases
       // where the MidTy is not the same for the two cast instructions.
       llvm_unreachable("Invalid Cast Combination");
@@ -2587,7 +2604,7 @@
   }
 }
 
-CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, 
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
                            const Twine &Name, Instruction *InsertBefore) {
   assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
@@ -2631,7 +2648,7 @@
   }
 }
 
-CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         Instruction *InsertBefore) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2639,7 +2656,7 @@
   return Create(Instruction::ZExt, S, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         BasicBlock *InsertAtEnd) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2647,7 +2664,7 @@
   return Create(Instruction::ZExt, S, Ty, Name, InsertAtEnd);
 }
 
-CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         Instruction *InsertBefore) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2655,7 +2672,7 @@
   return Create(Instruction::SExt, S, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, 
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty,
                                         const Twine &Name,
                                         BasicBlock *InsertAtEnd) {
   if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2672,7 +2689,7 @@
 }
 
 CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
-                                         const Twine &Name, 
+                                         const Twine &Name,
                                          BasicBlock *InsertAtEnd) {
   if (S->getType()->getScalarSizeInBits() ==
       Ty->getScalarSizeInBits())
     return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
@@ -2765,7 +2782,7 @@
   return Create(opcode, C, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, 
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty,
                                       bool isSigned, const Twine &Name,
                                       BasicBlock *InsertAtEnd) {
   assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
@@ -2779,8 +2796,8 @@
   return Create(opcode, C, Ty, Name, InsertAtEnd);
 }
 
-CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, 
-                                 const Twine &Name, 
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
+                                 const Twine &Name,
                                  Instruction *InsertBefore) {
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
          "Invalid cast");
@@ -2792,8 +2809,8 @@
   return Create(opcode, C, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, 
-                                 const Twine &Name, 
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
+                                 const Twine &Name,
                                  BasicBlock *InsertAtEnd) {
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
          "Invalid cast");
@@ -2836,7 +2853,7 @@
       return DestBits == SrcBits;
       // Casting from something else
     return SrcTy->isPointerTy();
-  } 
+  }
   if (DestTy->isFloatingPointTy()) {  // Casting to floating pt
     if (SrcTy->isIntegerTy())         // Casting from integral
       return true;
@@ -2853,7 +2870,7 @@
     if (SrcTy->isPointerTy())         // Casting from pointer
       return true;
     return SrcTy->isIntegerTy();      // Casting from integral
-  } 
+  }
   if (DestTy->isX86_MMXTy()) {
     if (SrcTy->isVectorTy())
       return DestBits == SrcBits;     // 64-bit vector to MMX
@@ -2960,10 +2977,10 @@
       return BitCast;                 // Same size, No-op cast
     }
   } else if (SrcTy->isFloatingPointTy()) { // Casting from floating pt
-    if (DestIsSigned) 
+    if (DestIsSigned)
       return FPToSI;                  // FP -> sint
     else
-      return FPToUI;                  // FP -> uint 
+      return FPToUI;                  // FP -> uint
   } else if (SrcTy->isVectorTy()) {
     assert(DestBits == SrcBits &&
            "Casting vector to integer of different width");
@@ -3024,7 +3041,7 @@
 /// could be broken out into the separate constructors but it is useful to have
 /// it in one place and to eliminate the redundant code for getting the sizes
 /// of the types involved.
-bool 
+bool
 CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
 
   // Check for type sanity on the arguments
@@ -3055,7 +3072,7 @@
   case Instruction::ZExt:
     return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
       SrcLength == DstLength && SrcBitSize < DstBitSize;
-  case Instruction::SExt: 
+  case Instruction::SExt:
     return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
       SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::FPTrunc:
@@ -3148,138 +3165,138 @@
 TruncInst::TruncInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
 }
 
 ZExtInst::ZExtInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, ZExt, S, Name, InsertBefore) { 
+) : CastInst(Ty, ZExt, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
 }
 
 ZExtInst::ZExtInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
 }
 
 SExtInst::SExtInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, SExt, S, Name, InsertBefore) { 
+) : CastInst(Ty, SExt, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
 }
 
 SExtInst::SExtInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, SExt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, SExt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
 }
 
 FPTruncInst::FPTruncInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
 }
 
 FPTruncInst::FPTruncInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
 }
 
 FPExtInst::FPExtInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPExt, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPExt, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
 }
 
 FPExtInst::FPExtInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
 }
 
 UIToFPInst::UIToFPInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, UIToFP, S, Name, InsertBefore) { 
+) : CastInst(Ty, UIToFP, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
 }
 
 UIToFPInst::UIToFPInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
 }
 
 SIToFPInst::SIToFPInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, SIToFP, S, Name, InsertBefore) { 
+) : CastInst(Ty, SIToFP, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
 }
 
 SIToFPInst::SIToFPInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
 }
 
 FPToUIInst::FPToUIInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPToUI, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPToUI, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
 }
 
 FPToUIInst::FPToUIInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
 }
 
 FPToSIInst::FPToSIInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, FPToSI, S, Name, InsertBefore) { 
+) : CastInst(Ty, FPToSI, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
 }
 
 FPToSIInst::FPToSIInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
 }
 
 PtrToIntInst::PtrToIntInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) { 
+) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) {
  assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
 }
 
 PtrToIntInst::PtrToIntInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
 }
 
 IntToPtrInst::IntToPtrInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { 
+) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
 }
 
 IntToPtrInst::IntToPtrInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
 }
 
 BitCastInst::BitCastInst(
   Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
-) : CastInst(Ty, BitCast, S, Name, InsertBefore) { 
+) : CastInst(Ty, BitCast, S, Name, InsertBefore) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
 }
 
 BitCastInst::BitCastInst(
   Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
-) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) { 
+) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) {
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
 }
 
@@ -3336,7 +3353,7 @@
     return new ICmpInst(CmpInst::Predicate(predicate), S1, S2, Name);
   }
-  
+
   if (InsertBefore)
     return new FCmpInst(InsertBefore, CmpInst::Predicate(predicate),
                         S1, S2, Name);
@@ -3446,8 +3463,8 @@
 ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
   switch (pred) {
     default: llvm_unreachable("Unknown icmp predicate!");
-    case ICMP_EQ: case ICMP_NE: 
-    case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE: 
+    case ICMP_EQ: case ICMP_NE:
+    case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE:
       return pred;
     case ICMP_UGT: return ICMP_SGT;
     case ICMP_ULT: return ICMP_SLT;
@@ -3459,8 +3476,8 @@
 ICmpInst::Predicate ICmpInst::getUnsignedPredicate(Predicate pred) {
   switch (pred) {
     default: llvm_unreachable("Unknown icmp predicate!");
-    case ICMP_EQ: case ICMP_NE: 
-    case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE: 
+    case ICMP_EQ: case ICMP_NE:
+    case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE:
       return pred;
     case ICMP_SGT: return ICMP_UGT;
     case ICMP_SLT: return ICMP_ULT;
@@ -3482,7 +3499,7 @@
     case ICMP_ULT: return ICMP_UGT;
     case ICMP_UGE: return ICMP_ULE;
     case ICMP_ULE: return ICMP_UGE;
-  
+
     case FCMP_FALSE: case FCMP_TRUE:
     case FCMP_OEQ: case FCMP_ONE: case FCMP_UEQ: case FCMP_UNE:
@@ -3519,7 +3536,7 @@
 bool CmpInst::isUnsigned(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT: 
+    case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_UGE: return true;
   }
 }
@@ -3527,7 +3544,7 @@
 bool CmpInst::isSigned(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT: 
+    case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT:
    case ICmpInst::ICMP_SGE: return true;
   }
 }
@@ -3535,17 +3552,17 @@
 bool CmpInst::isOrdered(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT: 
-    case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_OLE: 
+    case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT:
+    case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_OLE:
     case FCmpInst::FCMP_ORD: return true;
   }
 }
-      
+
 bool CmpInst::isUnordered(Predicate predicate) {
   switch (predicate) {
     default: return false;
-    case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT: 
-    case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_ULE: 
+    case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT:
+    case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_ULE:
     case FCmpInst::FCMP_UNO: return true;
   }
 }
@@ -3664,7 +3681,7 @@
 /// from the switch instruction.
 void SwitchInst::removeCase(CaseIt i) {
   unsigned idx = i.getCaseIndex();
-  
+
   assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
 
   unsigned NumOps = getNumOperands();
@@ -3725,7 +3742,7 @@
 void IndirectBrInst::growOperands() {
   unsigned e = getNumOperands();
   unsigned NumOps = e*2;
-  
+
   ReservedSpace = NumOps;
   growHungoffUses(ReservedSpace);
 }
@@ -3771,13 +3788,13 @@
 /// indirectbr instruction.
 void IndirectBrInst::removeDestination(unsigned idx) {
   assert(idx < getNumOperands()-1 && "Successor index out of range!");
-  
+
   unsigned NumOps = getNumOperands();
   Use *OL = getOperandList();
 
   // Replace this value with the last one.
   OL[idx+1] = OL[NumOps-1];
-  
+
   // Nuke the last value.
   OL[NumOps-1].set(nullptr);
   setNumHungOffUseOperands(NumOps-1);
@@ -3840,7 +3857,7 @@
 StoreInst *StoreInst::cloneImpl() const {
   return new StoreInst(getOperand(0), getOperand(1), isVolatile(),
                        getAlignment(), getOrdering(), getSynchScope());
-  
 }
 
 AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,93 @@
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind == TTI::SK_Broadcast) {
+    // Broadcast is a special case: once we've legalized to multiple
+    // registers, we can just repeatedly reuse the first register.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpbroadcastw
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  1 }  // vpbroadcastb
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8f64,  1 }, // vbroadcastpd
+      { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vbroadcastps
+      { ISD::VECTOR_SHUFFLE, MVT::v8i64,  1 }, // vpbroadcastq
+      { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }  // vpbroadcastd
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  1 }, // vbroadcastpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  1 }, // vbroadcastps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  1 }, // vpbroadcastq
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  1 }, // vpbroadcastd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpbroadcastw
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  1 }  // vpbroadcastb
+    };
+
+    if (ST->hasAVX2())
+      if (const auto *Entry =
+              CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 3 }, // pshuflw + pshufd + vinsertf128
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  2 }  // pshufb + vinsertf128
+    };
+
+    if (ST->hasAVX())
+      if (const auto *Entry =
+              CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSSE3ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 }  // pshufb
+    };
+
+    if (ST->hasSSSE3())
+      if (const auto *Entry =
+              CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSE2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
+      { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 }, // pshuflw + pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 3 }  // unpck + pshuflw + pshufd
+    };
+
+    if (ST->hasSSE2())
+      if (const auto *Entry =
+              CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSE1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
+    };
+
+    if (ST->hasSSE1())
+      if (const auto *Entry =
+              CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+  }
 
   if (Kind == TTI::SK_Reverse) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +876,6 @@
     if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
                                             ISD::VECTOR_SHUFFLE, LT.second))
       return LT.first * Entry->Cost;
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
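Note that the broadcast path above returns Entry->Cost directly, without the LT.first multiplier used by the reverse/alternate paths: once a wide vector has been legalized into several registers, every further copy can reuse the first broadcasted register. A caller-side sketch of querying this cost (the wrapper function is illustrative only; the getShuffleCost signature matches the CostModel.cpp call above):

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Illustrative only: on AVX2 an <8 x double> broadcast legalizes to two
// 256-bit registers, yet the reported cost stays 1 (see the tests below)
// because the first broadcast result is simply reused.
static int broadcastShuffleCost(const TargetTransformInfo &TTI, Type *VecTy) {
  return TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
                            /*Index=*/0, /*SubTp=*/nullptr);
}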
Index: test/Analysis/CostModel/X86/shuffle-broadcast.ll
===================================================================
--- test/Analysis/CostModel/X86/shuffle-broadcast.ll
+++ test/Analysis/CostModel/X86/shuffle-broadcast.ll
@@ -9,23 +9,161 @@
 ;
 ; Verify the cost model for broadcast shuffles.
 ;
+; Broadcast is special in that after legalization we can reuse the first broadcasted register.
+;
 ; CHECK-LABEL: 'test_vXf64'
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
-  ; SSE: Unknown cost {{.*}} %V128 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V128 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V128 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
 
-  ; SSE: Unknown cost {{.*}} %V256 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V256 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V256 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
 
-  ; SSE: Unknown cost {{.*}} %V512 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V512 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V512 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
 
   ret void
 }
+
+; CHECK-LABEL: 'test_vXi64'
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXf32'
+define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi32'
+define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi16'
+define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
+  ; SSE2: cost of 2 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi8'
+define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
+  ; SSE2: cost of 3 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+
+  ret void
+}
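As an end-to-end illustration of the new predicate's semantics, here is a small standalone program that builds a partially-undef splat shuffle with IRBuilder and queries it. This is a sketch assuming the patch is applied; the module and function names are made up:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("splat-demo", Ctx);

  // Build: void @demo(<4 x float> %v)
  auto *VecTy = VectorType::get(Type::getFloatTy(Ctx), 4);
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx), {VecTy}, false);
  auto *Fn = Function::Create(FnTy, Function::ExternalLinkage, "demo", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", Fn));

  // %splat = shufflevector <4 x float> %v, <4 x float> undef,
  //                        <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
  Value *V = &*Fn->arg_begin();
  Constant *Mask = ConstantVector::get({B.getInt32(0),
                                        UndefValue::get(B.getInt32Ty()),
                                        B.getInt32(0), B.getInt32(0)});
  auto *Splat = cast<ShuffleVectorInst>(
      B.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, "splat"));
  B.CreateRetVoid();

  // The undef lane is ignored; an all-undef mask would return false.
  int SplatIndex = -1;
  bool IsSplat = Splat->isSplat(&SplatIndex); // expect true, SplatIndex == 0
  return (IsSplat && SplatIndex == 0) ? 0 : 1;
}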