diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -935,31 +935,28 @@
                                               ArrayRef<int> Mask,
                                               VectorType *Ty, int &Index,
                                               VectorType *&SubTy) const {
-    int Limit = Mask.size() * 2;
-    if (Mask.empty() ||
-        // Extra check required by isSingleSourceMaskImpl function (called by
-        // ShuffleVectorInst::isSingleSourceMask).
-        any_of(Mask, [Limit](int I) { return I >= Limit; }))
+    if (Mask.empty())
       return Kind;
+    int NumSrcElts = Ty->getElementCount().getKnownMinValue();
     switch (Kind) {
     case TTI::SK_PermuteSingleSrc:
-      if (ShuffleVectorInst::isReverseMask(Mask))
+      if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
         return TTI::SK_Reverse;
-      if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
+      if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
         return TTI::SK_Broadcast;
       break;
     case TTI::SK_PermuteTwoSrc: {
       int NumSubElts;
       if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
-                                 Mask, Mask.size(), NumSubElts, Index)) {
+                                 Mask, NumSrcElts, NumSubElts, Index)) {
         SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
         return TTI::SK_InsertSubvector;
       }
-      if (ShuffleVectorInst::isSelectMask(Mask))
+      if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
         return TTI::SK_Select;
-      if (ShuffleVectorInst::isTransposeMask(Mask))
+      if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
         return TTI::SK_Transpose;
-      if (ShuffleVectorInst::isSpliceMask(Mask, Index))
+      if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
         return TTI::SK_Splice;
       break;
     }
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -2104,13 +2104,14 @@
   /// Return true if this shuffle mask chooses elements from exactly one source
   /// vector.
   /// Example: <7,5,undef,7>
-  /// This assumes that vector operands are the same length as the mask.
-  static bool isSingleSourceMask(ArrayRef<int> Mask);
-  static bool isSingleSourceMask(const Constant *Mask) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isSingleSourceMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isSingleSourceMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isSingleSourceMask(MaskAsInts);
+    return isSingleSourceMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle chooses elements from exactly one source
@@ -2118,7 +2119,8 @@
   /// Example: shufflevector <4 x n> A, <4 x n> B, <3,0,undef,3>
   /// TODO: Optionally allow length-changing shuffles.
   bool isSingleSource() const {
-    return !changesLength() && isSingleSourceMask(ShuffleMask);
+    return !changesLength() &&
+           isSingleSourceMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask chooses elements from exactly one source
@@ -2126,8 +2128,8 @@
   /// necessarily a no-op because it may change the number of elements from its
   /// input vectors or it may provide demanded bits knowledge via undef lanes.
   /// Example: <undef,undef,2,3>
-  static bool isIdentityMask(ArrayRef<int> Mask);
-  static bool isIdentityMask(const Constant *Mask) {
+  static bool isIdentityMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isIdentityMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
 
     // Not possible to express a shuffle mask for a scalable vector for this
@@ -2137,7 +2139,7 @@
 
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isIdentityMask(MaskAsInts);
+    return isIdentityMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle chooses elements from exactly one source
@@ -2150,7 +2152,7 @@
     if (isa<ScalableVectorType>(getType()))
       return false;
 
-    return !changesLength() && isIdentityMask(ShuffleMask);
+    return !changesLength() && isIdentityMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle lengthens exactly one source vector with
@@ -2174,12 +2176,12 @@
   /// In that case, the shuffle is better classified as an identity shuffle.
   /// This assumes that vector operands are the same length as the mask
   /// (a length-changing shuffle can never be equivalent to a vector select).
-  static bool isSelectMask(ArrayRef<int> Mask);
-  static bool isSelectMask(const Constant *Mask) {
+  static bool isSelectMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isSelectMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isSelectMask(MaskAsInts);
+    return isSelectMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle chooses elements from its source vectors
@@ -2191,19 +2193,20 @@
   /// In that case, the shuffle is better classified as an identity shuffle.
   /// TODO: Optionally allow length-changing shuffles.
   bool isSelect() const {
-    return !changesLength() && isSelectMask(ShuffleMask);
+    return !changesLength() && isSelectMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask swaps the order of elements from exactly
   /// one source vector.
   /// Example: <7,6,undef,4>
-  /// This assumes that vector operands are the same length as the mask.
-  static bool isReverseMask(ArrayRef<int> Mask);
-  static bool isReverseMask(const Constant *Mask) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isReverseMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isReverseMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isReverseMask(MaskAsInts);
+    return isReverseMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle swaps the order of elements from exactly
@@ -2211,19 +2214,20 @@
   /// Example: shufflevector <4 x n> A, <4 x n> B, <3,undef,1,undef>
   /// TODO: Optionally allow length-changing shuffles.
   bool isReverse() const {
-    return !changesLength() && isReverseMask(ShuffleMask);
+    return !changesLength() && isReverseMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask chooses all elements with the same value
   /// as the first element of exactly one source vector.
   /// Example: <4,undef,undef,4>
-  /// This assumes that vector operands are the same length as the mask.
-  static bool isZeroEltSplatMask(ArrayRef<int> Mask);
-  static bool isZeroEltSplatMask(const Constant *Mask) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isZeroEltSplatMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isZeroEltSplatMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isZeroEltSplatMask(MaskAsInts);
+    return isZeroEltSplatMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if all elements of this shuffle are the same value as the
@@ -2233,7 +2237,8 @@
   /// TODO: Optionally allow length-changing shuffles.
   /// TODO: Optionally allow splats from other elements.
   bool isZeroEltSplat() const {
-    return !changesLength() && isZeroEltSplatMask(ShuffleMask);
+    return !changesLength() &&
+           isZeroEltSplatMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask is a transpose mask.
@@ -2268,12 +2273,12 @@
   ///   ; Transposed matrix
   ///   t0 = < a, e, c, g > = shufflevector m0, m1 < 0, 4, 2, 6 >
   ///   t1 = < b, f, d, h > = shufflevector m0, m1 < 1, 5, 3, 7 >
-  static bool isTransposeMask(ArrayRef<int> Mask);
-  static bool isTransposeMask(const Constant *Mask) {
+  static bool isTransposeMask(ArrayRef<int> Mask, int NumSrcElts);
+  static bool isTransposeMask(const Constant *Mask, int NumSrcElts) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isTransposeMask(MaskAsInts);
+    return isTransposeMask(MaskAsInts, NumSrcElts);
   }
 
   /// Return true if this shuffle transposes the elements of its inputs without
@@ -2282,19 +2287,21 @@
   /// exact specification.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <0,4,2,6>
   bool isTranspose() const {
-    return !changesLength() && isTransposeMask(ShuffleMask);
+    return !changesLength() && isTransposeMask(ShuffleMask, ShuffleMask.size());
   }
 
   /// Return true if this shuffle mask is a splice mask, concatenating the two
   /// inputs together and then extracts an original width vector starting from
   /// the splice index.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4>
-  static bool isSpliceMask(ArrayRef<int> Mask, int &Index);
-  static bool isSpliceMask(const Constant *Mask, int &Index) {
+  /// This assumes that vector operands (of length \p NumSrcElts) are the same
+  /// length as the mask.
+  static bool isSpliceMask(ArrayRef<int> Mask, int NumSrcElts, int &Index);
+  static bool isSpliceMask(const Constant *Mask, int NumSrcElts, int &Index) {
     assert(Mask->getType()->isVectorTy() && "Shuffle needs vector constant.");
     SmallVector<int, 16> MaskAsInts;
     getShuffleMask(Mask, MaskAsInts);
-    return isSpliceMask(MaskAsInts, Index);
+    return isSpliceMask(MaskAsInts, NumSrcElts, Index);
   }
 
   /// Return true if this shuffle splices two inputs without changing the length
@@ -2302,7 +2309,8 @@
   /// then extracts an original width vector starting from the splice index.
   /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4>
   bool isSplice(int &Index) const {
-    return !changesLength() && isSpliceMask(ShuffleMask, Index);
+    return !changesLength() &&
+           isSpliceMask(ShuffleMask, ShuffleMask.size(), Index);
   }
 
   /// Return true if this shuffle mask is an extract subvector mask.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23843,7 +23843,7 @@
 
   // Profitability check: only deal with extractions from the first subvector
   // unless the mask becomes an identity mask.
-  if (!ShuffleVectorInst::isIdentityMask(NewMask) ||
+  if (!ShuffleVectorInst::isIdentityMask(NewMask, NewMask.size()) ||
       any_of(NewMask, [](int M) { return M < 0; }))
     for (auto &DemandedSubvector : DemandedSubvectors)
       if (DemandedSubvector.second != 0)
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2328,10 +2328,10 @@
   return UsesLHS || UsesRHS;
 }
 
-bool ShuffleVectorInst::isSingleSourceMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isSingleSourceMask(ArrayRef<int> Mask, int NumSrcElts) {
   // We don't have vector operand size information, so assume operands are the
   // same size as the mask.
-  return isSingleSourceMaskImpl(Mask, Mask.size());
+  return isSingleSourceMaskImpl(Mask, NumSrcElts);
 }
 
 static bool isIdentityMaskImpl(ArrayRef<int> Mask, int NumOpElts) {
@@ -2346,65 +2346,75 @@
   return true;
 }
 
-bool ShuffleVectorInst::isIdentityMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isIdentityMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // We don't have vector operand size information, so assume operands are the
   // same size as the mask.
-  return isIdentityMaskImpl(Mask, Mask.size());
+  return isIdentityMaskImpl(Mask, NumSrcElts);
 }
 
-bool ShuffleVectorInst::isReverseMask(ArrayRef<int> Mask) {
-  if (!isSingleSourceMask(Mask))
+bool ShuffleVectorInst::isReverseMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
+  if (!isSingleSourceMask(Mask, NumSrcElts))
     return false;
 
   // The number of elements in the mask must be at least 2.
-  int NumElts = Mask.size();
-  if (NumElts < 2)
+  if (NumSrcElts < 2)
     return false;
 
-  for (int i = 0; i < NumElts; ++i) {
-    if (Mask[i] == -1)
+  for (int I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] == -1)
       continue;
-    if (Mask[i] != (NumElts - 1 - i) && Mask[i] != (NumElts + NumElts - 1 - i))
+    if (Mask[I] != (NumSrcElts - 1 - I) &&
+        Mask[I] != (NumSrcElts + NumSrcElts - 1 - I))
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isZeroEltSplatMask(ArrayRef<int> Mask) {
-  if (!isSingleSourceMask(Mask))
+bool ShuffleVectorInst::isZeroEltSplatMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
     return false;
-  for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
-    if (Mask[i] == -1)
+  if (!isSingleSourceMask(Mask, NumSrcElts))
+    return false;
+  for (int I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] == -1)
       continue;
-    if (Mask[i] != 0 && Mask[i] != NumElts)
+    if (Mask[I] != 0 && Mask[I] != NumSrcElts)
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isSelectMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isSelectMask(ArrayRef<int> Mask, int NumSrcElts) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // Select is differentiated from identity. It requires using both sources.
-  if (isSingleSourceMask(Mask))
+  if (isSingleSourceMask(Mask, NumSrcElts))
     return false;
-  for (int i = 0, NumElts = Mask.size(); i < NumElts; ++i) {
-    if (Mask[i] == -1)
+  for (int I = 0, E = Mask.size(); I < E; ++I) {
+    if (Mask[I] == -1)
       continue;
-    if (Mask[i] != i && Mask[i] != (NumElts + i))
+    if (Mask[I] != I && Mask[I] != (NumSrcElts + I))
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask) {
+bool ShuffleVectorInst::isTransposeMask(ArrayRef<int> Mask, int NumSrcElts) {
   // Example masks that will return true:
   // v1 = <a, b, c, d>
   // v2 = <e, f, g, h>
   // trn1 = shufflevector v1, v2 <0, 4, 2, 6> = <a, e, c, g>
   // trn2 = shufflevector v1, v2 <1, 5, 3, 7> = <b, f, d, h>
 
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // 1. The number of elements in the mask must be a power-of-2 and at least 2.
-  int NumElts = Mask.size();
-  if (NumElts < 2 || !isPowerOf2_32(NumElts))
+  int Sz = Mask.size();
+  if (Sz < 2 || !isPowerOf2_32(Sz))
     return false;
 
   // 2. The first element of the mask must be either a 0 or a 1.
@@ -2413,23 +2423,26 @@
 
   // 3. The difference between the first 2 elements must be equal to the
   // number of elements in the mask.
-  if ((Mask[1] - Mask[0]) != NumElts)
+  if ((Mask[1] - Mask[0]) != NumSrcElts)
     return false;
 
   // 4. The difference between consecutive even-numbered and odd-numbered
   // elements must be equal to 2.
-  for (int i = 2; i < NumElts; ++i) {
-    int MaskEltVal = Mask[i];
+  for (int I = 2; I < Sz; ++I) {
+    int MaskEltVal = Mask[I];
     if (MaskEltVal == -1)
       return false;
-    int MaskEltPrevVal = Mask[i - 2];
+    int MaskEltPrevVal = Mask[I - 2];
     if (MaskEltVal - MaskEltPrevVal != 2)
       return false;
   }
   return true;
 }
 
-bool ShuffleVectorInst::isSpliceMask(ArrayRef<int> Mask, int &Index) {
+bool ShuffleVectorInst::isSpliceMask(ArrayRef<int> Mask, int NumSrcElts,
+                                     int &Index) {
+  if (Mask.size() != static_cast<unsigned>(NumSrcElts))
+    return false;
   // Example: shufflevector <4 x n> A, <4 x n> B, <1,2,3,4>
   int StartIndex = -1;
   for (int I = 0, E = Mask.size(); I != E; ++I) {
@@ -2440,7 +2453,7 @@
     if (StartIndex == -1) {
       // Don't support a StartIndex that begins in the second input, or if the
       // first non-undef index would access below the StartIndex.
-      if (MaskEltVal < I || E <= (MaskEltVal - I))
+      if (MaskEltVal < I || NumSrcElts <= (MaskEltVal - I))
         return false;
 
       StartIndex = MaskEltVal - I;
@@ -2735,7 +2748,7 @@
   // case.
   if (isa<ScalableVectorType>(getType()))
     return false;
-  if (!isSingleSourceMask(ShuffleMask))
+  if (!isSingleSourceMask(ShuffleMask, VF))
     return false;
 
   return isOneUseSingleSourceMask(ShuffleMask, VF);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11747,7 +11747,7 @@
 
   if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
        (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
-      ShuffleVectorInst::isReverseMask(ShuffleMask)) {
+      ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
     SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
     return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
                        DAG.getConstant(8, dl, MVT::i32));
@@ -25786,7 +25786,8 @@
   unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
   unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
   if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
-    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
+    if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
+        Op2.isUndef()) {
       Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
       return convertFromScalableVector(DAG, VT, Op);
     }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -8370,7 +8370,7 @@
   unsigned EltSize = VT.getScalarSizeInBits();
   if (EltSize >= 32 ||
       ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
-      ShuffleVectorInst::isIdentityMask(M) ||
+      ShuffleVectorInst::isIdentityMask(M, M.size()) ||
       isVREVMask(M, VT, 64) ||
       isVREVMask(M, VT, 32) ||
       isVREVMask(M, VT, 16))
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4198,7 +4198,8 @@
 
   assert(VT.getVectorElementType() == MVT::i1);
 
-  if (!ShuffleVectorInst::isReverseMask(SVN->getMask()) ||
+  if (!ShuffleVectorInst::isReverseMask(SVN->getMask(),
+                                        SVN->getMask().size()) ||
       !SVN->getOperand(1).isUndef())
     return SDValue();
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -366,7 +366,10 @@
           LT.second.getVectorElementType().getSizeInBits() ==
               Tp->getElementType()->getPrimitiveSizeInBits() &&
           LT.second.getVectorNumElements() <
-              cast<FixedVectorType>(Tp)->getNumElements()) {
+              cast<FixedVectorType>(Tp)->getNumElements() &&
+          divideCeil(Mask.size(),
+                     cast<FixedVectorType>(Tp)->getNumElements()) ==
+              static_cast<unsigned>(*LT.first.getValue())) {
         unsigned NumRegs = *LT.first.getValue();
         unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
         unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
@@ -542,7 +545,7 @@
     InstructionCost Cost = MemCost;
     for (unsigned Index : Indices) {
       FixedVectorType *SubVecTy =
-          FixedVectorType::get(FVTy->getElementType(), VF);
+          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
       auto Mask = createStrideMask(Index, Factor, VF);
       InstructionCost ShuffleCost =
           getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1635,7 +1635,7 @@
             NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
             [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
              &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
-              if (!ShuffleVectorInst::isIdentityMask(RegMask)) {
+              if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
                 // Check if the previous register can be just copied to the next
                 // one.
                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2122,8 +2122,8 @@
     NewMask[i] = Mask[i] < (signed)NumElts ? Mask[i] : Mask1[i];
 
   // A select mask with undef elements might look like an identity mask.
-  assert((ShuffleVectorInst::isSelectMask(NewMask) ||
-          ShuffleVectorInst::isIdentityMask(NewMask)) &&
+  assert((ShuffleVectorInst::isSelectMask(NewMask, NumElts) ||
+          ShuffleVectorInst::isIdentityMask(NewMask, NumElts)) &&
          "Unexpected shuffle mask");
   return new ShuffleVectorInst(X, Y, NewMask);
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3769,7 +3769,7 @@
     inversePermutation(Order, MaskOrder);
   }
   reorderReuses(MaskOrder, Mask);
-  if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {
+  if (ShuffleVectorInst::isIdentityMask(MaskOrder, MaskOrder.size())) {
     Order.clear();
     return;
   }
@@ -4288,7 +4288,7 @@
 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                                unsigned Sz) {
   ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
-  if (ShuffleVectorInst::isIdentityMask(FirstCluster))
+  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
     return false;
   for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
     ArrayRef<int> Cluster = Mask.slice(I, Sz);
@@ -6597,9 +6597,17 @@
                              bool IsStrict) {
     int Limit = Mask.size();
     int VF = VecTy->getNumElements();
-    return (VF == Limit || !IsStrict) &&
-           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-           ShuffleVectorInst::isIdentityMask(Mask);
+    int Index = -1;
+    return (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit)) ||
+           (!IsStrict &&
+            ((ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
+              Index == 0) ||
+             (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
+                ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
+                return all_of(Slice,
+                              [](int I) { return I == PoisonMaskElem; }) ||
+                       ShuffleVectorInst::isIdentityMask(Slice, VF);
+              }))));
   }
 
   /// Tries to combine 2 different masks into single one.
@@ -6669,7 +6677,8 @@
       if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
         if (!IdentityOp || !SinglePermute ||
             (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
-             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
+                                                    IdentityMask.size()))) {
           IdentityOp = SV;
           // Store current mask in the IdentityMask so later we did not lost
           // this info if IdentityOp is selected as the best candidate for the
@@ -6739,7 +6748,7 @@
     }
     if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
         !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
-        ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
+        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
       if (IdentityOp) {
         V = IdentityOp;
         assert(Mask.size() == IdentityMask.size() &&
@@ -6755,7 +6764,7 @@
                                /*IsStrict=*/true) ||
                 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                  Shuffle->isZeroEltSplat() &&
-                 ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
       }
       V = Op;
       return false;
@@ -6860,11 +6869,9 @@
           CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
         }
       }
-      const int Limit = CombinedMask1.size() * 2;
-      if (Op1 == Op2 && Limit == 2 * VF &&
-          all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
-          (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
-           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+      if (Op1 == Op2 &&
+          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
+           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
             isa<ShuffleVectorInst>(Op1) &&
             cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                 ArrayRef(CombinedMask1))))
@@ -7035,7 +7042,10 @@
   /// extracted values from \p VL.
   InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                                      TTI::ShuffleKind ShuffleKind) {
-    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+    auto *VecTy = cast<FixedVectorType>(
+        cast<ExtractElementInst>(*find_if(VL, [](Value *V) {
+          return isa<ExtractElementInst>(V);
+        }))->getVectorOperandType());
     unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
 
     if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
@@ -7097,11 +7107,9 @@
     const TargetTransformInfo &TTI;
 
     static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
-      int Limit = 2 * VF;
       return Mask.empty() ||
              (VF == Mask.size() &&
-              all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-              ShuffleVectorInst::isIdentityMask(Mask));
+              ShuffleVectorInst::isIdentityMask(Mask, VF));
     }
 
   public:
@@ -7350,9 +7358,7 @@
     ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
     if (CommonMask.empty())
       return Cost;
-    int Limit = CommonMask.size() * 2;
-    if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) &&
-        ShuffleVectorInst::isIdentityMask(CommonMask))
+    if (ShuffleVectorInst::isIdentityMask(CommonMask, CommonMask.size()))
       return Cost;
     return Cost +
            createShuffle(InVectors.front(),
@@ -7532,7 +7538,7 @@
   }
   if (NeedToShuffleReuses)
     ::addMask(Mask, E->ReuseShuffleIndices);
-  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
     CommonCost =
         TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
@@ -7872,7 +7878,7 @@
         } else {
           SmallVector<int> Mask;
           inversePermutation(OpTE->ReorderIndices, Mask);
-          if (ShuffleVectorInst::isReverseMask(Mask))
+          if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
             CCH = TTI::CastContextHint::Reversed;
         }
       }
@@ -8729,9 +8735,7 @@
     unsigned VecVF = TE->getVectorFactor();
     if (VF != VecVF &&
         (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
-         (all_of(Mask,
-                 [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
-          !ShuffleVectorInst::isIdentityMask(Mask)))) {
+         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
       SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
       std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                 OrigMask.begin());
@@ -8760,9 +8764,7 @@
       assert((TEs.size() == 1 || TEs.size() == 2) &&
              "Expected exactly 1 or 2 tree entries.");
       if (TEs.size() == 1) {
-        int Limit = 2 * Mask.size();
-        if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
-            !ShuffleVectorInst::isIdentityMask(Mask)) {
+        if (!ShuffleVectorInst::isIdentityMask(Mask, Mask.size())) {
           InstructionCost C =
               TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
           LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
@@ -9531,7 +9533,7 @@
         return V1;
       unsigned VF = Mask.size();
       unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
-      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask))
+      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
         return V1;
       Value *Vec = Builder.CreateShuffleVector(V1, Mask);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
@@ -9928,9 +9930,7 @@
       return false;
     unsigned I =
         *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
-    int Sz = Mask.size();
-    if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) &&
-        ShuffleVectorInst::isIdentityMask(Mask))
+    if (ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
       std::iota(Mask.begin(), Mask.end(), 0);
     else
       std::fill(Mask.begin(), Mask.end(), I);
@@ -10174,11 +10174,11 @@
         (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
              TTI::SK_PermuteSingleSrc &&
          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
-         ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
+         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
         (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
              TTI::SK_PermuteSingleSrc &&
          none_of(Mask, [&](int I) { return I >= MSz; }) &&
-         ShuffleVectorInst::isIdentityMask(Mask));
+         ShuffleVectorInst::isIdentityMask(Mask, MSz));
     bool EnoughConstsForShuffle =
         IsSingleShuffle &&
         (none_of(GatheredScalars,
@@ -10469,7 +10469,7 @@
       if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
           NumElts != NumScalars) {
         if (IsFirstUndef.all()) {
-          if (!ShuffleVectorInst::isIdentityMask(InsertMask)) {
+          if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
             SmallBitVector IsFirstPoison =
                 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
             if (!IsFirstPoison.all()) {
@@ -11278,7 +11278,7 @@
             // non-resizing mask.
             if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                    ->getNumElements() ||
-                !ShuffleVectorInst::isIdentityMask(Mask))
+                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
               return CreateShuffle(Vals.front(), nullptr, Mask);
             return Vals.front();
           }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -464,45 +464,21 @@
 define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
 ; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]]
 ; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -291,14 +291,19 @@
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
+; GFX8-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
+; GFX8-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
+; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
+; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -291,14 +291,19 @@
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
+; GFX8-NEXT:    [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
+; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
+; GFX8-NEXT:    [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
+; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
+; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
@@ -4,10 +4,15 @@
 define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
 ; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <9 x i16> [[ARG0:%.*]], <9 x i16> poison, <2 x i32> <i32 poison, i32 8>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; CHECK-NEXT:    ret <2 x i16> [[TMP2]]
+; CHECK-NEXT:    [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
+; CHECK-NEXT:    [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
+; CHECK-NEXT:    [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
+; CHECK-NEXT:    [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
+; CHECK-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
+; CHECK-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
+; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
+; CHECK-NEXT:    ret <2 x i16> [[INS_2]]
 ;
 bb:
   %arg0.1 = extractelement <9 x i16> undef, i64 7
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -4,20 +4,23 @@
 define <4 x half> @phis(i1 %cmp1, <4 x half> %in1, <4 x half> %in2)  {
 ; CHECK-LABEL: @phis(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x half> [[TMP8]]
+; CHECK-NEXT:    [[C2:%.*]] = phi half [ [[A2]], [[ENTRY:%.*]] ], [ [[B2]], [[BB0]] ]
+; CHECK-NEXT:    [[C3:%.*]] = phi half [ [[A3]], [[ENTRY]] ], [ [[B3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
+; CHECK-NEXT:    [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
+; CHECK-NEXT:    ret <4 x half> [[O3]]
 ;
 entry:
   %a0 = extractelement <4 x half> %in1, i64 0
@@ -49,20 +52,23 @@
 define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2)  {
 ; CHECK-LABEL: @phis_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x half> [[TMP8]]
+; CHECK-NEXT:    [[C3:%.*]] = phi half [ [[A3]], [[ENTRY:%.*]] ], [ [[B3]], [[BB0]] ]
+; CHECK-NEXT:    [[C2:%.*]] = phi half [ [[A2]], [[ENTRY]] ], [ [[B2]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
+; CHECK-NEXT:    [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
+; CHECK-NEXT:    ret <4 x half> [[O3]]
 ;
 entry:
   %a0 = extractelement <4 x half> %in1, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,11 +155,13 @@
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @exp_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -171,11 +173,13 @@
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -208,11 +212,13 @@
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_exp_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -224,11 +230,13 @@
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -261,11 +269,13 @@
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @log_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -277,11 +287,13 @@
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -314,11 +326,13 @@
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_log_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -330,11 +344,13 @@
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -367,11 +383,13 @@
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @sin_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -383,11 +401,13 @@
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -420,11 +440,13 @@
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_sin_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -436,11 +458,13 @@
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -2,9 +2,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
@@ -51,24 +51,47 @@
 ;
 ; AVX-LABEL: @ceil_floor(
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
+; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; AVX-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
+; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX-NEXT:    ret <8 x float> [[R71]]
+;
+; AVX2-LABEL: @ceil_floor(
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
+; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    ret <8 x float> [[R71]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -2,9 +2,9 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
 
 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
@@ -51,24 +51,47 @@
 ;
 ; AVX-LABEL: @ceil_floor(
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
+; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; AVX-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
+; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX-NEXT:    ret <8 x float> [[R71]]
+;
+; AVX2-LABEL: @ceil_floor(
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
+; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i64 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    ret <8 x float> [[R71]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -167,24 +167,42 @@
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SSE-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
+; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -213,16 +231,29 @@
 
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @test_v4f64_partial_swizzle(
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; CHECK-NEXT:    ret <4 x double> [[R03]]
+; SSE-LABEL: @test_v4f64_partial_swizzle(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SSE-NEXT:    ret <4 x double> [[R031]]
+;
+; SLM-LABEL: @test_v4f64_partial_swizzle(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
   %a0 = extractelement <4 x double> %a, i64 0
   %a1 = extractelement <4 x double> %a, i64 1
@@ -247,13 +278,23 @@
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SLM-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[TMP8]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SLM-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -367,14 +408,48 @@
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
-; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV151]]
+; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
+; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
+; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
+; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
+; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
+; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
+; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
+; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R4:%.*]] = add i16 [[B0]], [[B1]]
+; SSE-NEXT:    [[R5:%.*]] = add i16 [[B2]], [[B3]]
+; SSE-NEXT:    [[R6:%.*]] = add i16 [[B4]], [[B5]]
+; SSE-NEXT:    [[R7:%.*]] = add i16 [[B6]], [[B7]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R12:%.*]] = add i16 [[B8]], [[B9]]
+; SSE-NEXT:    [[R13:%.*]] = add i16 [[B10]], [[B11]]
+; SSE-NEXT:    [[R14:%.*]] = add i16 [[B12]], [[B13]]
+; SSE-NEXT:    [[R15:%.*]] = add i16 [[B14]], [[B15]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
+; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
+; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
+; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
+; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
+; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
+; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
+; SSE-NEXT:    ret <16 x i16> [[RV15]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -167,24 +167,42 @@
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SSE-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
+; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -213,16 +231,29 @@
 
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @test_v4f64_partial_swizzle(
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> undef, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; CHECK-NEXT:    ret <4 x double> [[R03]]
+; SSE-LABEL: @test_v4f64_partial_swizzle(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SSE-NEXT:    ret <4 x double> [[R031]]
+;
+; SLM-LABEL: @test_v4f64_partial_swizzle(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
   %a0 = extractelement <4 x double> %a, i64 0
   %a1 = extractelement <4 x double> %a, i64 1
@@ -247,13 +278,23 @@
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 0, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SLM-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP7]], [[TMP8]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SLM-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -367,14 +408,48 @@
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
-; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV151]]
+; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
+; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
+; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
+; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
+; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
+; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
+; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
+; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R4:%.*]] = add i16 [[B0]], [[B1]]
+; SSE-NEXT:    [[R5:%.*]] = add i16 [[B2]], [[B3]]
+; SSE-NEXT:    [[R6:%.*]] = add i16 [[B4]], [[B5]]
+; SSE-NEXT:    [[R7:%.*]] = add i16 [[B6]], [[B7]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R12:%.*]] = add i16 [[B8]], [[B9]]
+; SSE-NEXT:    [[R13:%.*]] = add i16 [[B10]], [[B11]]
+; SSE-NEXT:    [[R14:%.*]] = add i16 [[B12]], [[B13]]
+; SSE-NEXT:    [[R15:%.*]] = add i16 [[B14]], [[B15]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
+; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
+; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
+; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
+; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
+; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
+; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
+; SSE-NEXT:    ret <16 x i16> [[RV15]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -146,24 +146,42 @@
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SSE-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SSE-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
+; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -198,13 +216,23 @@
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SLM-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP7]], [[TMP8]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SLM-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -318,14 +346,48 @@
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV151]]
+; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
+; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
+; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
+; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
+; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
+; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
+; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
+; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R4:%.*]] = sub i16 [[B0]], [[B1]]
+; SSE-NEXT:    [[R5:%.*]] = sub i16 [[B2]], [[B3]]
+; SSE-NEXT:    [[R6:%.*]] = sub i16 [[B4]], [[B5]]
+; SSE-NEXT:    [[R7:%.*]] = sub i16 [[B6]], [[B7]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R12:%.*]] = sub i16 [[B8]], [[B9]]
+; SSE-NEXT:    [[R13:%.*]] = sub i16 [[B10]], [[B11]]
+; SSE-NEXT:    [[R14:%.*]] = sub i16 [[B12]], [[B13]]
+; SSE-NEXT:    [[R15:%.*]] = sub i16 [[B14]], [[B15]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
+; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
+; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
+; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
+; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
+; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
+; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
+; SSE-NEXT:    ret <16 x i16> [[RV15]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -146,24 +146,42 @@
 
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SSE-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SSE-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SSE-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SSE-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
+; SSE-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SSE-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i64 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i64 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i64 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i64 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i64 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i64 3
+; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -198,13 +216,23 @@
 ; SSE-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; SLM-LABEL: @test_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 0, i32 2>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 1, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SLM-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP7]], [[TMP8]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SLM-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[R033:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R052:%.*]] = shufflevector <8 x float> [[R033]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R071:%.*]] = shufflevector <8 x float> [[R052]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SLM-NEXT:    ret <8 x float> [[R071]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -318,14 +346,48 @@
 
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
-; SSE-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV151]]
+; SSE-NEXT:    [[B0:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <16 x i16> [[B]], i64 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <16 x i16> [[B]], i64 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <16 x i16> [[B]], i64 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <16 x i16> [[B]], i64 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <16 x i16> [[B]], i64 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <16 x i16> [[B]], i64 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <16 x i16> [[B]], i64 7
+; SSE-NEXT:    [[B8:%.*]] = extractelement <16 x i16> [[B]], i64 8
+; SSE-NEXT:    [[B9:%.*]] = extractelement <16 x i16> [[B]], i64 9
+; SSE-NEXT:    [[B10:%.*]] = extractelement <16 x i16> [[B]], i64 10
+; SSE-NEXT:    [[B11:%.*]] = extractelement <16 x i16> [[B]], i64 11
+; SSE-NEXT:    [[B12:%.*]] = extractelement <16 x i16> [[B]], i64 12
+; SSE-NEXT:    [[B13:%.*]] = extractelement <16 x i16> [[B]], i64 13
+; SSE-NEXT:    [[B14:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE-NEXT:    [[B15:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    [[R4:%.*]] = sub i16 [[B0]], [[B1]]
+; SSE-NEXT:    [[R5:%.*]] = sub i16 [[B2]], [[B3]]
+; SSE-NEXT:    [[R6:%.*]] = sub i16 [[B4]], [[B5]]
+; SSE-NEXT:    [[R7:%.*]] = sub i16 [[B6]], [[B7]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT:    [[R12:%.*]] = sub i16 [[B8]], [[B9]]
+; SSE-NEXT:    [[R13:%.*]] = sub i16 [[B10]], [[B11]]
+; SSE-NEXT:    [[R14:%.*]] = sub i16 [[B12]], [[B13]]
+; SSE-NEXT:    [[R15:%.*]] = sub i16 [[B14]], [[B15]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV4:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[R4]], i64 4
+; SSE-NEXT:    [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[R5]], i64 5
+; SSE-NEXT:    [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[R6]], i64 6
+; SSE-NEXT:    [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[R7]], i64 7
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV111:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[RV12:%.*]] = insertelement <16 x i16> [[RV111]], i16 [[R12]], i64 12
+; SSE-NEXT:    [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i64 13
+; SSE-NEXT:    [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i64 14
+; SSE-NEXT:    [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i64 15
+; SSE-NEXT:    ret <16 x i16> [[RV15]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -28,15 +28,13 @@
 ;
 ; SSE42-LABEL: @reduce_and4(
 ; SSE42-NEXT:  entry:
-; SSE42-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]])
-; SSE42-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]])
-; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0]], [[TMP1]]
-; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]])
-; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP2]]
-; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]])
-; SSE42-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP3]]
-; SSE42-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]]
-; SSE42-NEXT:    ret i32 [[OP_RDX3]]
+; SSE42-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
+; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
+; SSE42-NEXT:    ret i32 [[OP_RDX1]]
 ;
 ; AVX-LABEL: @reduce_and4(
 ; AVX-NEXT:  entry:
@@ -103,15 +101,13 @@
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
 ; SSE42-LABEL: @reduce_and4_transpose(
-; SSE42-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]])
-; SSE42-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]])
-; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP2]]
-; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]])
-; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]]
-; SSE42-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]])
-; SSE42-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP4]]
-; SSE42-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]]
-; SSE42-NEXT:    ret i32 [[OP_RDX3]]
+; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE42-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE42-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; SSE42-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
+; SSE42-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
+; SSE42-NEXT:    ret i32 [[OP_RDX1]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -1024,71 +1024,141 @@
   Constant *C7 = ConstantInt::get(Int32Ty, 7);
 
   Constant *Identity = ConstantVector::get({C0, CU, C2, C3, C4});
-  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(Identity));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(Identity)); // identity is distinguished from select
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(Identity));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(Identity)); // identity is always single source
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Identity));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(Identity));
+  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      Identity,
+      cast<FixedVectorType>(Identity->getType())
+          ->getNumElements())); // identity is distinguished from select
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      Identity, cast<FixedVectorType>(Identity->getType())
+                    ->getNumElements())); // identity is always single source
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      Identity, cast<FixedVectorType>(Identity->getType())->getNumElements()));
 
   Constant *Select = ConstantVector::get({CU, C1, C5});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(Select));
-  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Select));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(Select));
-  
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      Select, cast<FixedVectorType>(Select->getType())->getNumElements()));
+
   Constant *Reverse = ConstantVector::get({C3, C2, C1, CU});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(Reverse));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(Reverse));
-  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(Reverse));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(Reverse)); // reverse is always single source
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Reverse));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(Reverse));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())
+                   ->getNumElements())); // reverse is always single source
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      Reverse, cast<FixedVectorType>(Reverse->getType())->getNumElements()));
 
   Constant *SingleSource = ConstantVector::get({C2, C2, C0, CU});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(SingleSource));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(SingleSource));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(SingleSource));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      SingleSource,
+      cast<FixedVectorType>(SingleSource->getType())->getNumElements()));
 
   Constant *ZeroEltSplat = ConstantVector::get({C0, C0, CU, C0});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(ZeroEltSplat));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(ZeroEltSplat));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(ZeroEltSplat));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(ZeroEltSplat)); // 0-splat is always single source
-  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(ZeroEltSplat));
-  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(ZeroEltSplat));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      ZeroEltSplat, cast<FixedVectorType>(ZeroEltSplat->getType())
+                        ->getNumElements())); // 0-splat is always single source
+  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isTransposeMask(
+      ZeroEltSplat,
+      cast<FixedVectorType>(ZeroEltSplat->getType())->getNumElements()));
 
   Constant *Transpose = ConstantVector::get({C0, C4, C2, C6});
-  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(Transpose));
-  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(Transpose));
-  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(Transpose));
+  EXPECT_FALSE(ShuffleVectorInst::isIdentityMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSelectMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isReverseMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isSingleSourceMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_FALSE(ShuffleVectorInst::isZeroEltSplatMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
+  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(
+      Transpose,
+      cast<FixedVectorType>(Transpose->getType())->getNumElements()));
 
   // More tests to make sure the logic is/stays correct...
-  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(ConstantVector::get({CU, C1, CU, C3})));
-  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(ConstantVector::get({C4, CU, C6, CU})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(ConstantVector::get({C4, C1, C6, CU})));
-  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(ConstantVector::get({CU, C1, C6, C3})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(ConstantVector::get({C7, C6, CU, C4})));
-  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(ConstantVector::get({C3, CU, C1, CU})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(ConstantVector::get({C7, C5, CU, C7})));
-  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(ConstantVector::get({C3, C0, CU, C3})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(ConstantVector::get({C4, CU, CU, C4})));
-  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(ConstantVector::get({CU, C0, CU, C0})));
-
-  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(ConstantVector::get({C1, C5, C3, C7})));
-  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(ConstantVector::get({C1, C3})));
+  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(
+      ConstantVector::get({CU, C1, CU, C3}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isIdentityMask(
+      ConstantVector::get({C4, CU, C6, CU}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(
+      ConstantVector::get({C4, C1, C6, CU}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isSelectMask(
+      ConstantVector::get({CU, C1, C6, C3}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(
+      ConstantVector::get({C7, C6, CU, C4}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isReverseMask(
+      ConstantVector::get({C3, CU, C1, CU}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      ConstantVector::get({C7, C5, CU, C7}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isSingleSourceMask(
+      ConstantVector::get({C3, C0, CU, C3}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(
+      ConstantVector::get({C4, CU, CU, C4}), 4));
+  EXPECT_TRUE(ShuffleVectorInst::isZeroEltSplatMask(
+      ConstantVector::get({CU, C0, CU, C0}), 4));
+
+  EXPECT_TRUE(ShuffleVectorInst::isTransposeMask(
+      ConstantVector::get({C1, C5, C3, C7}), 4));
+  EXPECT_TRUE(
+      ShuffleVectorInst::isTransposeMask(ConstantVector::get({C1, C3}), 2));
 
   // Nothing special about the values here - just re-using inputs to reduce code. 
   Constant *V0 = ConstantVector::get({C0, C1, C2, C3});
diff --git a/llvm/unittests/IR/ShuffleVectorInstTest.cpp b/llvm/unittests/IR/ShuffleVectorInstTest.cpp
--- a/llvm/unittests/IR/ShuffleVectorInstTest.cpp
+++ b/llvm/unittests/IR/ShuffleVectorInstTest.cpp
@@ -14,51 +14,84 @@
 namespace {
 
 TEST(ShuffleVectorInst, isIdentityMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}));
-  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}));
-  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}));
+  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}, 5));
+  ASSERT_TRUE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}, 4));
 
-  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 4}));
-  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, -1, 2, 4}));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}, 4));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}, 3));
+
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 3, -1}, 6));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, -1, 3}, 5));
+
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, 1, 2, 4}, 4));
+  ASSERT_FALSE(ShuffleVectorInst::isIdentityMask({0, -1, 2, 4}, 4));
 }
 
 TEST(ShuffleVectorInst, isSelectMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}));
+  ASSERT_TRUE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}, 4));
+
+  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 5, 6, 3}, 5));
 
-  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 1, 2, 3}));
+  ASSERT_FALSE(ShuffleVectorInst::isSelectMask({0, 1, 2, 3}, 4));
 }
 
 TEST(ShuffleVectorInst, isReverseMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}));
-  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}));
+  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}, 4));
 
-  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({4, 3, 2, 1}));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({-1, -1, 1, 0}, 5));
+
+  ASSERT_FALSE(ShuffleVectorInst::isReverseMask({4, 3, 2, 1}, 4));
 }
 
 TEST(ShuffleVectorInst, isZeroEltSplatMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}));
-  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}));
+  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}, 4));
+
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({0, -1, 0, -1}, 5));
 
-  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({1, 1, 1, 1}));
+  ASSERT_FALSE(ShuffleVectorInst::isZeroEltSplatMask({1, 1, 1, 1}, 4));
 }
 
 TEST(ShuffleVectorInst, isTransposeMask) {
-  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}));
-  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}));
+  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}, 4));
+  ASSERT_TRUE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}, 4));
 
-  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({2, 6, 4, 8}));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}, 3));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({0, 4, 2, 6}, 5));
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({1, 5, 3, 7}, 5));
+
+  ASSERT_FALSE(ShuffleVectorInst::isTransposeMask({2, 6, 4, 8}, 4));
 }
 
 TEST(ShuffleVectorInst, isSpliceMask) {
   int Index;
 
-  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, Index));
+  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, 4, Index));
   ASSERT_EQ(0, Index);
 
-  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, Index));
+  ASSERT_TRUE(ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, 7, Index));
   ASSERT_EQ(1, Index);
 
-  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({4, 5, 6, 7}, Index));
+  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, 3, Index));
+  ASSERT_FALSE(
+      ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, 6, Index));
+  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({0, 1, 2, 3}, 5, Index));
+  ASSERT_FALSE(
+      ShuffleVectorInst::isSpliceMask({1, 2, 3, 4, 5, 6, 7}, 8, Index));
+
+  ASSERT_FALSE(ShuffleVectorInst::isSpliceMask({4, 5, 6, 7}, 4, Index));
 }
 
 TEST(ShuffleVectorInst, isExtractSubvectorMask) {