diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" @@ -195,20 +196,14 @@ /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. -static bool allSameBlock(ArrayRef VL) { - Instruction *I0 = dyn_cast(VL[0]); - if (!I0) - return false; +template static bool allSameBlock(T &&VL) { + if (empty(VL)) + return true; + auto *I0 = cast(*VL.begin()); BasicBlock *BB = I0->getParent(); - for (int I = 1, E = VL.size(); I < E; I++) { - auto *II = dyn_cast(VL[I]); - if (!II) - return false; - - if (BB != II->getParent()) - return false; - } - return true; + return all_of(drop_begin(VL, 1), [BB](Value *V) { + return BB == cast(V)->getParent(); + }); } /// \returns True if all of the values in \p VL are constants (but not @@ -397,9 +392,16 @@ /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { - // Make sure these are all Instructions. - if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) + // Make sure these are all Instructions or UndefValues. + if (llvm::any_of(VL, + [](Value *V) { + return !isa(V) && !isa(V); + }) || + llvm::all_of(VL, [](Value *V) { return isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); + BaseIndex = + std::distance(VL.begin(), llvm::find_if(llvm::drop_begin(VL, BaseIndex), + Instruction::classof)); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); @@ -410,6 +412,8 @@ // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + if (isa(VL[Cnt])) + continue; unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); if (IsBinOp && isa(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) @@ -527,9 +531,10 @@ SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E, E + 1); + Mask.resize(E, UndefMaskElem); for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; + if (Indices[I] != E + 1) + Mask[Indices[I]] = I; } namespace slpvectorizer { @@ -851,6 +856,7 @@ /// accessing a consecutive address. These strategies are summarized in the /// 'ReorderingMode' enumerator. enum class ReorderingMode { + Unknown, ///< Mode is not defined yet Load, ///< Matching loads to consecutive memory addresses Opcode, ///< Matching instructions based on opcode (same or alternate) Constant, ///< Matching constants @@ -866,6 +872,7 @@ const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + Instruction &VL0; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -1175,6 +1182,8 @@ break; case ReorderingMode::Failed: return None; + case ReorderingMode::Unknown: + llvm_unreachable("Unknown mode is not expected here."); } } @@ -1215,10 +1224,17 @@ // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. 
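[Editorial aside, not part of the patch] The inversePermutation hunk above now initializes the mask to UndefMaskElem and skips the out-of-bounds marker E + 1 that this patch uses for padded (undef) lanes. A minimal standalone sketch of that behavior, using plain std::vector and hypothetical index values instead of LLVM's SmallVectorImpl:

// Sketch only: mirrors the updated inversePermutation logic.
// UndefMaskElem is LLVM's "undefined shuffle lane" marker (-1).
#include <cstdio>
#include <vector>

constexpr int UndefMaskElem = -1;

static void inversePermutationSketch(const std::vector<unsigned> &Indices,
                                     std::vector<int> &Mask) {
  const unsigned E = Indices.size();
  Mask.assign(E, UndefMaskElem);
  for (unsigned I = 0; I < E; ++I)
    if (Indices[I] != E + 1) // E + 1 marks a padded (undef) lane.
      Mask[Indices[I]] = I;
}

int main() {
  std::vector<unsigned> Indices = {2, 0, 5, 1}; // lane 2 is padding (5 == E + 1)
  std::vector<int> Mask;
  inversePermutationSketch(Indices, Mask);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 1 3 0 -1
  return 0;
}

The -1 element stays an undefined shuffle lane, so the padded position never constrains the generated shufflevector.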
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + unsigned UndefsCnt = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (isa(OpData.V)) { + ++UndefsCnt; + continue; + } + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; + } + unsigned CntFalse = NumOperands - CntTrue - UndefsCnt; return std::max(CntTrue, CntFalse); } @@ -1227,13 +1243,18 @@ assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); - assert(isa(VL[0]) && "Expected instruction"); - unsigned NumOperands = cast(VL[0])->getNumOperands(); + unsigned NumOperands = VL0.getNumOperands(); OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { OpsVec[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(VL[Lane])) { + OpsVec[OpIdx][Lane] = { + UndefValue::get(VL0.getOperand(OpIdx)->getType()), false, + false}; + continue; + } assert(isa(VL[Lane]) && "Expected instruction"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the @@ -1298,9 +1319,9 @@ public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, const DataLayout &DL, + VLOperands(Instruction &VL0, ArrayRef RootVL, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) - : DL(DL), SE(SE), R(R) { + : DL(DL), SE(SE), R(R), VL0(VL0) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1325,7 +1346,8 @@ // Each operand has its own mode. We are using this mode to help us select // the instructions for each lane, so that they match best with the ones // we have selected so far. - SmallVector ReorderingModes(NumOperands); + SmallVector ReorderingModes(NumOperands, + ReorderingMode::Unknown); // This is a greedy single-pass algorithm. We are going over each lane // once and deciding on the best order right away with no back-tracking. @@ -1419,6 +1441,8 @@ #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { switch (RMode) { + case ReorderingMode::Unknown: + return "Unknown"; case ReorderingMode::Load: return "Load"; case ReorderingMode::Opcode: @@ -1506,7 +1530,8 @@ /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const; + const DenseSet &ShuffledIndices, + const SparseBitVector<> &IgnoredIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -1526,12 +1551,10 @@ /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. 
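[Editorial aside, not part of the patch] As a hypothetical illustration of the reordering described in the comment above: for a two-lane bundle {%l0 = add i32 %x, 1; %l1 = add i32 2, %y}, swapping the commutative operands of %l1 lets Left become {%x, %y} and Right become {1, 2}, i.e. one operand vector of same-opcode values and one of constants, both easier to vectorize than the mixed originals.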
- static void reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R); + static void reorderInputsAccordingToOpcode( + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, + ScalarEvolution &SE, const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1599,15 +1622,19 @@ } /// Set the operands of this bundle in their original order. - void setOperandsInOrder() { + void setOperandsInOrder(Instruction *I0) { assert(Operands.empty() && "Already initialized?"); - auto *I0 = cast(Scalars[0]); Operands.resize(I0->getNumOperands()); unsigned NumLanes = Scalars.size(); for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); OpIdx != NumOperands; ++OpIdx) { Operands[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(Scalars[Lane])) { + Operands[OpIdx][Lane] = + UndefValue::get(I0->getOperand(OpIdx)->getType()); + continue; + } auto *I = cast(Scalars[Lane]); assert(I->getNumOperands() == NumOperands && "Expected same number of operands"); @@ -1755,8 +1782,9 @@ ReuseShuffleIndices.end()); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); if (Vectorized) { - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { assert(!getTreeEntry(V) && "Scalar already in tree!"); ScalarToTreeEntry[V] = Last; } @@ -1768,10 +1796,12 @@ BundleMember->Lane = Lane; ++Lane; } - assert((!Bundle.getValue() || Lane == VL.size()) && + assert((!Bundle.getValue() || + Lane == std::distance(InstructionsOnly.begin(), + InstructionsOnly.end())) && "Bundle and VL out of sync"); } else { - MustGather.insert(VL.begin(), VL.end()); + MustGather.insert(InstructionsOnly.begin(), InstructionsOnly.end()); } if (UserTreeIdx.UserTE) @@ -2496,6 +2526,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = @@ -2519,16 +2551,19 @@ // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; - // Some in-tree scalars will remain as scalar in vectorized - // instructions. If that is the case, the one in Lane 0 will - // be used. - if (UseScalar != U || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { - LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U - << ".\n"); - assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); - continue; + auto *It = llvm::find_if(UseEntry->Scalars, Instruction::classof); + if (It != UseEntry->Scalars.end()) { + Value *UseScalar = *It; + // Some in-tree scalars will remain as scalar in vectorized + // instructions. If that is the case, the one in Lane 0 will + // be used. + if (UseScalar != U || + !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U + << ".\n"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); + continue; + } } } @@ -2569,8 +2604,10 @@ return; } + auto InitialInstructionsOnly = make_filter_range(VL, Instruction::classof); // If all of the operands are identical or constant we have a simple solution. 
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + if (allConstant(VL) || isSplat(VL) || + !allSameBlock(InitialInstructionsOnly) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; @@ -2580,7 +2617,7 @@ // the same block. // Don't vectorize ephemeral values. - for (Value *V : VL) { + for (Value *V : InitialInstructionsOnly) { if (EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); @@ -2606,11 +2643,8 @@ } // Check that none of the instructions in the bundle are already in the tree. - for (Value *V : VL) { - auto *I = dyn_cast(V); - if (!I) - continue; - if (getTreeEntry(I)) { + for (Value *V : InitialInstructionsOnly) { + if (getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -2621,7 +2655,7 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (Value *V : VL) { + for (Value *V : InitialInstructionsOnly) { if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -2647,6 +2681,11 @@ SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { + if (isa(V)) { + ReuseShuffleIndicies.emplace_back(UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) @@ -2657,14 +2696,21 @@ ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL0->getType())); VL = UniqueValues; } + const unsigned NumberOfInstructions = + llvm::count_if(VL, Instruction::classof); + if (NumberOfInstructions <= 1) { + LLVM_DEBUG( + dbgs() + << "SLP: Gathering due to vectorization of single instruction.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -2691,10 +2737,10 @@ auto *PH = cast(VL0); // Check for terminator values (e.g. invoke). - for (Value *V : VL) + for (Value *V : InstructionsOnly) for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { - Instruction *Term = dyn_cast( - cast(V)->getIncomingValueForBlock( + auto *Term = + dyn_cast(cast(V)->getIncomingValueForBlock( PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() @@ -2716,8 +2762,10 @@ ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + Operands.emplace_back( + isa(V) ? 
UndefValue::get(V->getType()) + : cast(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } @@ -2788,9 +2836,11 @@ // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. - SmallVector PointerOps(VL.size()); - auto POIter = PointerOps.begin(); + SmallVector PointerOps(NumberOfInstructions); + auto *POIter = PointerOps.begin(); for (Value *V : VL) { + if (isa(V)) + continue; auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -2821,20 +2871,22 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (NumberOfInstructions - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { + CurrentOrder.append(VL.size() - NumberOfInstructions, + VL.size() + 1); // Need to reorder. TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); findRootOrder(CurrentOrder); ++NumOpsWantToKeepOrder[CurrentOrder]; @@ -2862,7 +2914,7 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Type *Ty = cast(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); @@ -2877,12 +2929,14 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back(isa(V) + ? UndefValue::get(SrcTy) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2894,8 +2948,8 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (Value *V : VL) { - CmpInst *Cmp = cast(V); + for (Value *V : InstructionsOnly) { + auto *Cmp = cast(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); @@ -2916,10 +2970,15 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. 
for (Value *V : VL) { + if (isa(V)) { + Left.push_back(UndefValue::get(VL0->getOperand(0)->getType())); + Right.push_back(UndefValue::get(VL0->getOperand(1)->getType())); + continue; + } auto *Cmp = cast(V); Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); @@ -2963,7 +3022,7 @@ // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2971,20 +3030,46 @@ return; } - TE->setOperandsInOrder(); + SmallVector OperandsVec; for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + Value *LastDefined = nullptr; + for (Value *V : VL) { + Value *OpV; + if (isa(V)) { + if (BinaryOperator::isIntDivRem(ShuffleOrOp)) { + if (LastDefined) + OpV = LastDefined; + else + OpV = ConstantInt::get(VL0->getOperand(i)->getType(), 1); + } else { + OpV = UndefValue::get(VL0->getOperand(i)->getType()); + } + } else { + OpV = cast(V)->getOperand(i); + if (isa(OpV) && + BinaryOperator::isIntDivRem(ShuffleOrOp)) { + if (LastDefined) + OpV = LastDefined; + else + OpV = ConstantInt::get(VL0->getOperand(i)->getType(), 1); + } else { + LastDefined = OpV; + } + } + Operands.push_back(OpV); + } + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (cast(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -2997,7 +3082,7 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Type *CurTy = cast(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() @@ -3011,8 +3096,8 @@ // We don't combine GEPs with non-constant indexes. Type *Ty1 = VL0->getOperand(1)->getType(); - for (Value *V : VL) { - auto Op = cast(V)->getOperand(1); + for (Value *V : InstructionsOnly) { + auto *Op = cast(V)->getOperand(1); if (!isa(Op) || (Op->getType() != Ty1 && Op->getType()->getScalarSizeInBits() > @@ -3030,12 +3115,15 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3046,11 +3134,16 @@ llvm::Type *ScalarTy = cast(VL0)->getValueOperand()->getType(); // Make sure all stores in the bundle are simple - we can't vectorize // atomic or volatile stores. 
- SmallVector PointerOps(VL.size()); + SmallVector PointerOps(NumberOfInstructions); ValueList Operands(VL.size()); auto POIter = PointerOps.begin(); auto OIter = Operands.begin(); for (Value *V : VL) { + if (isa(V)) { + *OIter = UndefValue::get(VL0->getOperand(0)->getType()); + ++OIter; + continue; + } auto *SI = cast(V); if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -3066,46 +3159,50 @@ } OrdersType CurrentOrder; + if (!llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + return; + } // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (NumberOfInstructions - 1) * Size) { if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(VL0); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - const SCEV *Scev0 = SE->getSCEV(Ptr0); - const SCEV *ScevN = SE->getSCEV(PtrN); - const auto *Diff = - dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); - uint64_t Size = DL->getTypeAllocSize(ScalarTy); - // Check that the sorted pointer operands are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { - if (CurrentOrder.empty()) { - // Original stores are consecutive and does not require reordering. 
- ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); - } else { - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; - } - return; + CurrentOrder.append(VL.size() - NumberOfInstructions, VL.size() + 1); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + TE->setOperandsInOrder(VL0); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } + return; } - BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3137,6 +3234,13 @@ if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { + if (isa(V)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return; + } CallInst *CI2 = dyn_cast(V); if (!CI2 || CI2->getCalledFunction() != F || getVectorIntrinsicIDForCall(CI2, TLI) != ID || @@ -3146,8 +3250,8 @@ BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V - << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } // Some intrinsics have scalar arguments and should be same in order for @@ -3182,7 +3286,7 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -3211,7 +3315,7 @@ // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -3219,12 +3323,15 @@ return; } - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3297,9 +3404,6 @@ NElts = cast(Vec->getType())->getNumElements(); } - if (NElts != VL.size()) - return false; - // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; unsigned E = VL.size(); @@ -3309,8 +3413,10 @@ // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. 
CurrentOrder.assign(E, E + 1); - unsigned I = 0; - for (; I < E; ++I) { + unsigned I = 0, End = std::min(NElts, E); + for (; I < End; ++I) { + if (isa(VL[I])) + continue; auto *Inst = cast(VL[I]); if (Inst->getOperand(0) != Vec) break; @@ -3329,10 +3435,15 @@ CurrentOrder[I] = I; } } - if (I < E) { + // Gather if not all extracts are from the same vector/aggrgate. + if (I < End || (NElts < E && !all_of(drop_begin(VL, NElts), [](Value *V) { + return isa(V); + }))) { CurrentOrder.clear(); return false; } + if (NElts != E) + return false; return ShouldKeepOrder; } @@ -3376,20 +3487,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); + const unsigned NumOfInstructions = + std::distance(InstructionsOnly.begin(), InstructionsOnly.end()); + Value *V0; Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + FixedVectorType *VecTy; + if (!llvm::empty(InstructionsOnly)) { + V0 = *InstructionsOnly.begin(); + if (StoreInst *SI = dyn_cast(V0)) + ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast(V0)) + ScalarTy = CI->getOperand(0)->getType(); + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + if (MinBWs.count(V0)) { + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), MinBWs[V0].first), VL.size()); + } + } else { + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + } TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); int ReuseShuffleCost = 0; @@ -3404,21 +3526,27 @@ return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (E->getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); - if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && + allSameBlock(InstructionsOnly)) { + Optional ShuffleKind = + NumOfInstructions > 1 + ? isShuffle(llvm::to_vector<4>(InstructionsOnly)) + : None; + if (NumOfInstructions == 1 || ShuffleKind) { + int Cost = NumOfInstructions > 1 + ? TTI->getShuffleCost(*ShuffleKind, VecTy) + : 0; + for (Value *V : InstructionsOnly) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
if (areAllUsersVectorized(cast(V)) && !ScalarToTreeEntry.count(V)) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + auto *EE = cast(V); + auto *IO = cast(EE->getIndexOperand()); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), IO->getZExtValue()); } } @@ -3428,7 +3556,8 @@ return ReuseShuffleCost + getGatherCost(VL); } assert(E->State == TreeEntry::Vectorize && "Unhandled state"); - assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(InstructionsOnly) && + "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -3441,12 +3570,14 @@ if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { + if (isa(VL[I])) + continue; if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(VL[I])->getIndexOperand()); + auto *EE = cast(VL[I]); + auto *IO = cast(EE->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + Instruction::ExtractElement, EE->getVectorOperandType(), Idx); } else { ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -3454,18 +3585,30 @@ } } Idx = ReuseShuffleNumbers; - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(V)->getIndexOperand()); + auto *EE = cast(V); + auto *IO = cast(EE->getIndexOperand()); Idx = IO->getZExtValue(); + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, EE->getVectorOperandType(), Idx); } else { --Idx; + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); } - ReuseShuffleCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } +#ifndef NDEBUG + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + assert(Reuse && E->ReorderIndices.empty() || + (!Reuse && CurrentOrder.size() == E->ReorderIndices.size() && + std::equal(CurrentOrder.begin(), CurrentOrder.end(), + E->ReorderIndices.begin())) && + "The sequence of extract elements must be reused or shuffled " + "with the same mask."); +#endif int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. @@ -3473,7 +3616,9 @@ TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast(VL[I]); + if (isa(VL[I])) + continue; + auto *EI = cast(VL[I]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. 
@@ -3495,8 +3640,16 @@ continue; } } - DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(EI); + auto *IO = cast(EE->getIndexOperand()); + unsigned Idx = IO->getZExtValue(); + DeadCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, EE->getVectorOperandType(), Idx); + } else { + DeadCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + } } } return DeadCost; @@ -3518,11 +3671,12 @@ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); int VecCost = 0; @@ -3543,10 +3697,11 @@ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; // Check if all entries in VL are either compares or selects with compares // as condition that have the same predicates. @@ -3620,24 +3775,27 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; + Constant *C0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (isa(VL[i])) + continue; const Instruction *I = cast(VL[i]); unsigned OpIdx = isa(I) ? 1 : 0; ConstantInt *CInt = dyn_cast(I->getOperand(OpIdx)); - if (!CInt) { + Constant *UV = dyn_cast(I->getOperand(OpIdx)); + if (!CInt && !UV) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; break; } if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) + (UV || !cast(CInt)->getValue().isPowerOf2())) Op2VP = TargetTransformInfo::OP_None; if (i == 0) { - CInt0 = CInt; + C0 = CInt ? CInt : UV; continue; } - if (CInt0 != CInt) + if (C0 != (CInt ? 
CInt : UV)) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } @@ -3646,9 +3804,10 @@ E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost( E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); @@ -3664,9 +3823,10 @@ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); @@ -3679,13 +3839,34 @@ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; + } + int ScalarLdCost = NumOfInstructions * ScalarEltCost; + int VecLdCost; + const auto *FirstInstr = llvm::find_if(VL, Instruction::classof); + const auto LastInstr = + llvm::find_if(llvm::reverse(VL), Instruction::classof); + unsigned InstrDist = std::distance(FirstInstr, LastInstr.base()); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(InstrDist); + if (!IsPowOf2NumOfInstructions) { + IntrinsicCostAttributes Attrs( + Intrinsic::masked_load, VecTy, + {VecTy->getPointerTo(), Builder.getInt32Ty(), + FixedVectorType::get(Builder.getInt1Ty(), VecTy->getNumElements()), + VecTy}); + VecLdCost = TTI->getIntrinsicInstrCost(Attrs, CostKind); + } else if (InstrDist == VL.size()) { + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); + } else { + auto *LoadVecTy = FixedVectorType::get(ScalarTy, InstrDist); + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, LoadVecTy, + alignment, 0, CostKind, VL0); } - int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, - CostKind, VL0); - if (!E->ReorderIndices.empty()) { + if (!NeedToShuffleReuses && + (!E->ReorderIndices.empty() || + (IsPowOf2NumOfInstructions && InstrDist != VL.size()))) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
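[Editorial aside, not part of the patch] The load cost above now depends on the span between the first and the last real (non-undef) lane of the bundle. A small sketch of which cost path is chosen, under the assumption that this span (InstrDist) is at least 1:

// Sketch: which vector-load cost the patch uses for a bundle with undef lanes.
// InstrDist = distance from the first to the last non-undef lane (>= 1 assumed).
#include <cstdio>

static bool isPow2(unsigned X) { return X != 0 && (X & (X - 1)) == 0; }

static const char *loadCostKind(unsigned InstrDist, unsigned VLSize) {
  if (!isPow2(InstrDist))
    return "masked load (llvm.masked.load intrinsic cost)";
  if (InstrDist == VLSize)
    return "plain wide load";
  return "narrower load, widened by a shuffle";
}

int main() {
  std::printf("%s\n", loadCostKind(3, 4)); // masked load
  std::printf("%s\n", loadCostKind(4, 4)); // plain wide load
  std::printf("%s\n", loadCostKind(2, 4)); // narrower load, widened by a shuffle
  return 0;
}

The store cost further below follows the same three-way split with llvm.masked.store.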
VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3702,10 +3883,30 @@ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) - ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, Alignment, 0, CostKind, VL0); + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; + int ScalarStCost = NumOfInstructions * ScalarEltCost; + int VecStCost; + const auto *FirstInstr = llvm::find_if(VL, Instruction::classof); + const auto LastInstr = + llvm::find_if(llvm::reverse(VL), Instruction::classof); + unsigned InstrDist = std::distance(FirstInstr, LastInstr.base()); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(InstrDist); + if (!IsPowOf2NumOfInstructions) { + IntrinsicCostAttributes Attrs( + Intrinsic::masked_store, Builder.getVoidTy(), + {VecTy, VecTy->getPointerTo(), Builder.getInt32Ty(), + FixedVectorType::get(Builder.getInt1Ty(), + VecTy->getNumElements())}); + VecStCost = TTI->getIntrinsicInstrCost(Attrs, CostKind); + } else if (InstrDist == VL.size()) { + VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, + 0, CostKind, VL0); + } else { + auto *StoreVecTy = FixedVectorType::get(ScalarTy, InstrDist); + VecStCost = TTI->getMemoryOpCost(Instruction::Store, StoreVecTy, + Alignment, 0, CostKind, VL0); + } if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( @@ -3721,9 +3922,10 @@ IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1); int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCallCost = NumOfInstructions * ScalarEltCost; auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second); @@ -3744,16 +3946,18 @@ int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); + if (isa(VL[Idx])) + continue; + auto *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind); } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Instruction *I = cast(V); ReuseShuffleCost += TTI->getInstructionCost(I, CostKind); } } - for (Value *V : VL) { - Instruction *I = cast(V); + for (Value *V : InstructionsOnly) { + auto *I = cast(V); assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost(I, CostKind); } @@ -4055,12 +4259,16 @@ } int BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const { + const DenseSet &ShuffledIndices, + const SparseBitVector<> &IgnoredIndices) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getNullValue(NumElts); - for (unsigned I = 0; I < NumElts; ++I) + for (unsigned I = 0; I < NumElts; ++I) { + if (IgnoredIndices.test(I)) + continue; if (!ShuffledIndices.count(I)) DemandedElts.setBit(I); + } int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); if (!ShuffledIndices.empty()) @@ -4078,27 +4286,30 @@ // Check if the same elements are inserted several times and count them as // 
shuffle candidates. DenseSet ShuffledElements; + SparseBitVector<> IgnoredElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; + if (isa(VL[Idx])) { + IgnoredElements.set(Idx); + continue; + } if (!UniqueElements.insert(VL[Idx]).second) ShuffledElements.insert(Idx); } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, IgnoredElements); } // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R) { +void BoUpSLP::reorderInputsAccordingToOpcode( + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE, R); + VLOperands Ops(VL0, VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -4106,15 +4317,17 @@ } void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { + auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof); + if (llvm::empty(InstructionsOnly)) + return; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), - [=](Value *V) -> bool { - auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(InstructionsOnly, [=](Value *V) -> bool { + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -4124,8 +4337,8 @@ // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + auto *Bundle = BlocksSchedules[BB]->getScheduleData( + E->isOneOf(*llvm::reverse(InstructionsOnly).begin())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -4151,7 +4364,8 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(E->Scalars.begin(), E->Scalars.end()); + SmallPtrSet Bundle(InstructionsOnly.begin(), + InstructionsOnly.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; @@ -4174,6 +4388,10 @@ Value *Vec = UndefValue::get(VecTy); unsigned InsIndex = 0; for (Value *Val : VL) { + if (isa(Val)) { + ++InsIndex; + continue; + } Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); auto *InsElt = dyn_cast(Vec); if (!InsElt) @@ -4200,26 +4418,27 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. 
- if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for (int Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UniqueIdxs); - } - } - return V; + // Check that every instruction appears once in this bundle. + SmallVector UniqueValues; + DenseMap UniquePositions; + UniqueValues.reserve(VL.size()); + for (Value *V : VL) { + if (isa(V)) { + UniqueValues.emplace_back(V); + continue; } + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + if (Res.second) + UniqueValues.emplace_back(V); + } + if (UniqueValues.size() != VL.size()) { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL[0]->getType())); } + if (TreeEntry *E = getTreeEntry(S.OpValue)) + if (E->isSame(UniqueValues)) + return vectorizeTree(E); } // Check that every instruction appears once in this bundle. @@ -4261,6 +4480,14 @@ return E->VectorizedValue; } + Instruction *VL0 = E->getMainOp(); + Type *ScalarTy = VL0->getType(); + if (auto *Store = dyn_cast(VL0)) + ScalarTy = Store->getValueOperand()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); + if (isa(VL0)) + return UndefValue::get(VecTy); + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); @@ -4277,13 +4504,9 @@ } assert(E->State == TreeEntry::Vectorize && "Unhandled state"); + auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); - Instruction *VL0 = E->getMainOp(); - Type *ScalarTy = VL0->getType(); - if (auto *Store = dyn_cast(VL0)) - ScalarTy = Store->getValueOperand()->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { auto *PH = cast(VL0); @@ -4338,12 +4561,12 @@ return V; } case Instruction::ExtractValue: { - auto *LI = cast(E->getSingleOperand(0)); + auto *LI = cast(VL0->getOperand(0)); Builder.SetInsertPoint(LI); auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); - Value *NewV = propagateMetadata(V, E->Scalars); + Value *NewV = propagateMetadata(V, llvm::to_vector<4>(InstructionsOnly)); if (!E->ReorderIndices.empty()) { SmallVector Mask; inversePermutation(E->ReorderIndices, Mask); @@ -4486,7 +4709,7 @@ RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); @@ -4507,9 +4730,35 @@ LoadInst *LI = cast(VL0); unsigned AS = LI->getPointerAddressSpace(); - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); - + auto *FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto *LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + Value *VecPtr; + Instruction *VecLI; + if (IsPowOf2NumOfInstructions) { + 
VecPtr = Builder.CreateBitCast( + LI->getPointerOperand(), + FixedVectorType::get(ScalarTy, NumOfInstructions) + ->getPointerTo(AS)); + VecLI = Builder.CreateAlignedLoad( + FixedVectorType::get(ScalarTy, NumOfInstructions), VecPtr, + LI->getAlign()); + } else { + VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo(AS)); + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) + Mask.emplace_back(Builder.getInt1(!isa(V))); + SmallVector Passthrough(E->Scalars.size(), + UndefValue::get(LI->getType())); + VecLI = Builder.CreateMaskedLoad(VecPtr, LI->getAlign(), + ConstantVector::get(Mask), + ConstantVector::get(Passthrough)); + } + Value *V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly)); // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. @@ -4517,8 +4766,18 @@ if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); - LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); - Value *V = propagateMetadata(LI, E->Scalars); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(E->Scalars.size(), + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto *LI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++LI) { + if (!isa(*LI)) + ExtendedIndices[I] = Dist + I; + } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + ExtendedIndices, "load.extend"); + } if (IsReorder) { SmallVector Mask; inversePermutation(E->ReorderIndices, Mask); @@ -4547,10 +4806,42 @@ VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast( - ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, - SI->getAlign()); + + auto *FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto *LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(NumOfInstructions, + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto *VI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++VI) { + if (!isa(*VI)) + ExtendedIndices[I] = Dist + I; + } + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), ExtendedIndices, + "values.extend"); + } + Value *VecPtr; + Instruction *VecSI; + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + if (IsPowOf2NumOfInstructions) { + VecPtr = Builder.CreateBitCast( + ScalarPtr, FixedVectorType::get(ScalarTy, NumOfInstructions) + ->getPointerTo(AS)); + VecSI = Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); + } else { + VecPtr = Builder.CreateBitCast(ScalarPtr, + VecValue->getType()->getPointerTo(AS)); + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) + Mask.emplace_back(Builder.getInt1(!isa(V))); + VecSI = Builder.CreateMaskedStore(VecValue, VecPtr, SI->getAlign(), + ConstantVector::get(Mask)); + } // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -4558,7 +4849,7 @@ if 
(getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); - Value *V = propagateMetadata(ST, E->Scalars); + Value *V = propagateMetadata(VecSI, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); @@ -4585,6 +4876,8 @@ ->getPointerOperandType() ->getScalarType()); for (Value *&V : VL) { + if (isa(V)) + continue; auto *CI = cast(V); V = ConstantExpr::getIntegerCast(CI, Ty, CI->getValue().isSignBitSet()); @@ -4596,7 +4889,7 @@ Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); @@ -4711,6 +5004,11 @@ unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { + if (isa(E->Scalars[i])) { + Mask[i] = i; + OpScalars.push_back(E->Scalars[i]); + continue; + } auto *OpInst = cast(E->Scalars[i]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { @@ -4727,7 +5025,7 @@ Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); @@ -4876,6 +5174,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); @@ -5001,14 +5301,15 @@ bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); // Make sure that the scheduling region contains all // instructions of the bundle. - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (!extendSchedulingRegion(V, S)) return None; } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -5087,7 +5388,9 @@ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || + llvm::count_if(VL, Instruction::classof) == 1) && "tried to unbundle something which is not a bundle"); // Un-bundle: make single instructions out of the bundle. @@ -5392,7 +5695,9 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->Inst) != nullptr && + llvm::count_if(getTreeEntry(SD->Inst)->Scalars, + Instruction::classof) > 1) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -5855,6 +6160,37 @@ return Changed; } +/// Order may have elements assigned special value (size + 1) which is out of +/// bounds. 
Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and used in order to avoid undef values +/// have effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indeces for undef values positions. +/// As an example below Order has two undef positions and they have assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +static void fixupOrderingIndices(SmallVectorImpl &Order) { + const unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + const unsigned BoundVal = Sz + 1; + for (unsigned I : Order) + if (I != BoundVal) + UsedIndices[I] = true; + unsigned Idx = 0; + for (unsigned &I : Order) { + if (I == BoundVal) { + // Find first non-used index. + for (; Idx != Sz; ++Idx) + if (!UsedIndices[Idx]) + break; + // Set correct index. + I = Idx; + ++Idx; + } + } +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -5863,9 +6199,19 @@ const unsigned MinVF = R.getMinVecRegSize() / Sz; unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) + unsigned NewSize = PowerOf2Ceil(VF); + if (!isPowerOf2_32(Sz) || VF < 2 || NewSize < MinVF) return false; + SmallVector FixedChain; + if (NewSize != VF) { + FixedChain.reserve(NewSize); + FixedChain.append(Chain.begin(), Chain.end()); + FixedChain.append(NewSize - Chain.size(), + UndefValue::get(Chain[0]->getType())); + Chain = FixedChain; + VF = NewSize; + } LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx << "\n"); @@ -5873,9 +6219,11 @@ Optional> Order = R.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == Chain.size()) { + SmallVector NewOrder(Order->begin(), Order->end()); + fixupOrderingIndices(NewOrder); // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(Chain.rbegin(), Chain.rend()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [Chain](const unsigned Idx) { return Chain[Idx]; }); R.buildTree(ReorderedOps); } @@ -5972,8 +6320,9 @@ // register size is a power-of-2? 
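[Editorial aside, not part of the patch] fixupOrderingIndices above replaces the size + 1 markers with whichever indices are left unused by real elements. A standalone sketch reproducing the before/after example from its comment, using std::vector in place of SmallVectorImpl/SmallBitVector:

// Sketch only: same two-pass fixup as fixupOrderingIndices.
#include <cstdio>
#include <vector>

static void fixupOrderingIndicesSketch(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  const unsigned BoundVal = Sz + 1; // marker for undef positions
  std::vector<bool> UsedIndices(Sz, false);
  for (unsigned I : Order)
    if (I != BoundVal)
      UsedIndices[I] = true;
  unsigned Idx = 0;
  for (unsigned &I : Order)
    if (I == BoundVal) {
      while (Idx != Sz && UsedIndices[Idx])
        ++Idx;      // first index not claimed by a real element
      I = Idx++;
    }
}

int main() {
  // Example from the comment: 9 == Sz + 1 marks the two undef lanes.
  std::vector<unsigned> Order = {6, 9, 5, 4, 9, 2, 1, 0};
  fixupOrderingIndicesSketch(Order);
  for (unsigned I : Order)
    std::printf("%u ", I); // prints: 6 3 5 4 7 2 1 0
  return 0;
}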
unsigned StartIdx = 0; for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = makeArrayRef(Operands).slice(Cnt, Size); + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + 2 <= E;) { + ArrayRef Slice = + makeArrayRef(Operands).slice(Cnt, std::min(Size, E - Cnt)); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && vectorizeStoreChain(Slice, R, Cnt)) { @@ -6075,9 +6424,9 @@ } } - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); + unsigned Pow2VL = PowerOf2Ceil(VL.size()); + unsigned MinVF = 2; + unsigned MaxVF = Pow2VL; if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -6114,7 +6463,7 @@ else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (OpsWidth < 2 || (VF > MinVF && OpsWidth <= VF / 2)) break; ArrayRef Ops = VL.slice(I, OpsWidth); @@ -6127,6 +6476,17 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); + SmallVector FixedChain; + if (OpsWidth != VF && (CompensateUseCost || !AllowReorder)) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } + assert(Ops.size() == VF && + "Operations must have same size as vectorization factor."); R.buildTree(Ops); Optional> Order = R.bestOrder(); @@ -6950,9 +7310,11 @@ assert(Order->size() == VL.size() && "Order size must be the same as number of vectorized " "instructions."); + SmallVector NewOrder(Order->begin(), Order->end()); + fixupOrderingIndices(NewOrder); // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(VL.size()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } @@ -7054,6 +7416,14 @@ return VectorizedTree != nullptr; } + SmallVector getCopyOfExtraArgValues() const { + SmallVector Args(ExtraArgs.size()); + std::transform( + ExtraArgs.begin(), ExtraArgs.end(), Args.begin(), + [](const std::pair &P) { return P.second; }); + return Args; + } + unsigned numReductionValues() const { return ReducedVals.size(); } @@ -7399,6 +7769,15 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; + // Try to vectorize ExtraArgs. + // Continue analysis for the instruction from the same basic block + // only to save compile time. 
+ if (++Level < RecursionMaxDepth) + for (auto *Op : HorRdx.getCopyOfExtraArgValues()) + if (VisitedInstrs.insert(Op).second) + if (auto *I = dyn_cast(Op)) + if (!isa(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); continue; } } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,15 +3,17 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A:%.*]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: ret void ; %t2 = extractelement <2 x i16> %x, i32 0 @@ -35,15 +37,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] @@ -82,16 +86,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[A]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -72,14 +72,18 @@ ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]] ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]] -; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] ; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 -; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 -; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[E2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E3]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[C2:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C3:%.*]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP6]] +; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]] ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -203,12 +203,10 @@ ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; 
MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2) to <4 x i8>*), i32 2, <4 x i1> , <4 x i8> undef) +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -217,22 +215,23 @@ ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[P1]], i32 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP4]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP7]], i1 [[TMP3]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P27]] +; MAX-COST-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add 
i32 [[TMP12]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -215,16 +215,16 @@ ; ; GFX8-LABEL: @uadd_sat_v3i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <3 x i16> [[ARG0:%.*]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 +; GFX8-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <3 x i16> [[ARG1:%.*]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 +; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[REORDER_SHUFFLE]], <2 x i16> [[REORDER_SHUFFLE1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP3]], i64 0 -; GFX8-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 -; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1 +; GFX8-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP1]], i64 0 +; GFX8-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP2]], i64 1 ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -6,7 +6,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 @@ -35,13 +36,14 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[REORDER_SHUFFLE]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 @@ -65,7 +67,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,8 +7,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <8 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) @@ -47,17 +47,17 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP15]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; 
FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 @@ -96,13 +96,13 @@ ; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]] ; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_40]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 14910, i32 1 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_41]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,11 +4,11 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <16 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 @@ -16,8 +16,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 
x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> undef, i32 [[V44]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <16 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,26 +8,24 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -81,26 +79,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement 
<8 x float> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,26 +131,24 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; AVX-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 
+; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -110,22 +110,22 @@ ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32( @@ -177,19 +177,19 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x 
i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32_const( @@ -293,25 +293,18 @@ ; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2 +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4 +; 
AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5 ; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX2-NEXT: ret <8 x i32> [[R7]] @@ -321,25 +314,18 @@ ; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2 +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5 ; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX512-NEXT: ret <8 x i32> [[R7]] @@ -412,26 +398,56 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: 
[[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], +; AVX2-NEXT: ret <8 x i32> [[TMP1]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], +; AVX512-NEXT: ret <8 x i32> [[TMP1]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = 
extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -237,23 +237,22 @@ define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( ; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 -; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 ; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] -; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] ; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 -; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 -; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[TMP6]], i32 2 ; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 ; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,35 +55,32 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> 
[[TMP0]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll 
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -131,10 +131,9 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,25 +4,13 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]] -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef -; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef -; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef -; CHECK-NEXT: ret i32 [[ADD11]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> ) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP0]], undef +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], undef +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA2]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], undef +; CHECK-NEXT: ret i32 [[OP_EXTRA4]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -31,15 +31,14 @@ ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> 
[[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,21 +18,16 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[LOAD_EXTEND]], <4 x double> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[SHUFFLE]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x 
-; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0
-; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> undef, <2 x i32>
; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]],
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -11,36 +11,33 @@
; CHECK-NEXT: [[TOBOOL_NOT19:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]]
; CHECK: while.body:
-; CHECK-NEXT: [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64>
-; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [
+; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32*> [ [[TMP16:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32*> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint i32* [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64>
+; CHECK-NEXT: switch i32 [[TMP4]], label [[WHILE_BODY_BACKEDGE]] [
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1
-; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint i32* [[TMP6]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 2
+; CHECK-NEXT: store i32 [[TMP8]], i32* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64>
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
; CHECK: sw.bb6:
-; CHECK-NEXT: [[INCDEC_PTR8:%.*]] =
getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint i32* [[TMP11]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 1 +; CHECK-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP16]] = phi <4 x i32*> [ [[TMP5]], [[WHILE_BODY]] ], [ [[TMP14]], [[SW_BB6]] ], [ [[TMP10]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -218,42 +218,33 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <2 x i32> ; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 ; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = 
insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[REORDER_SHUFFLE1]], <2 x float> [[REORDER_SHUFFLE2]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP14]], i32 3 ; CHECK-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -19,22 +19,24 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: 
[[LOAD_EXTEND1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[LOAD_EXTEND]], [[LOAD_EXTEND1]] +; CHECK-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> , [[TMP7]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 @@ -42,16 +44,17 @@ ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float -1.000000e+00, i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 
x float> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = fptosi <4 x float> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP24]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,15 +54,16 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 @@ -84,7 +85,8 @@ ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[LOAD_EXTEND]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll 
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll @@ -15,21 +15,21 @@ define i32 @foo(i32* nocapture %A, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[N:%.*]], 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP1]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw i32 [[TMP11]], [[TMP13]] -; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4 ; CHECK-NEXT: ret i32 undef ; %1 = mul nsw i32 %n, 5 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -264,32 +264,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = 
getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] -; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <8 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP5]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX5]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP10]], i32 4, <8 x i1> ) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP11]], 31995 +; CHECK-NEXT: [[TMP12]] = extractelement <8 x float> [[TMP6]], i32 4 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -18,26 +18,23 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef -; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x 
i64> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP16]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -140,49 +140,54 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP0]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[REORDER_SHUFFLE]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: br 
label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = phi <8 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP32:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], -; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX14]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float 8.000000e+00, i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float 7.000000e+00, i32 4 +; 
CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> , float [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <8 x float> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x float> [[TMP2]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x float> undef, float [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x float> [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP29]], i32 4 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP8]], i32 5 +; CHECK-NEXT: [[TMP32]] = insertelement <8 x float> [[TMP31]], float [[TMP10]], i32 6 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[TMP29]], [[TMP27]] +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP25]] +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll @@ -13,9 +13,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE]], i32 0 ; CHECK-NEXT: [[VECIN0:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE]], i32 1 ; CHECK-NEXT: [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 ; CHECK-NEXT: [[VECIN2:%.*]] = insertelement <2 x float> undef, 
float [[TMP6]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -11,31 +11,30 @@ ; CHECK-NEXT: ret void ; CHECK: bb2: ; CHECK-NEXT: [[T:%.*]] = select i1 undef, i16 undef, i16 15 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef -; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[T]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], undef +; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE5]], +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef -; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef -; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA1]], i32 undef -; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA2]], undef -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA2]], i32 undef -; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA3]], i32 undef +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[TMP9]], undef +; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 undef +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 undef +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef +; CHECK-NEXT: 
[[OP_EXTRA2:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA1]], i32 undef +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA2]], undef +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA2]], i32 undef +; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA3]], i32 undef ; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA4]] ; CHECK-NEXT: unreachable ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -19,63 +19,52 @@ ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 -; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 -; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: 
[[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = lshr <16 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP11]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP13]], i32 5 +; CHECK-NEXT: [[TMP15:%.*]] = 
extractelement <16 x i32> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP15]], i32 6 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP17]], i32 7 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP19]], i32 8 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP21]], i32 9 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP23]], i32 10 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP25]], i32 11 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP27]], i32 12 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP29]], i32 13 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP31]], i32 14 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP3]], i32 15 +; CHECK-NEXT: [[TMP34:%.*]] = trunc <16 x i32> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i8> [[TMP34]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP35]], <16 x i8>* [[TMP36]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -13,11 +13,12 @@ ; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 1, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[T14]] to <2 x float>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> ; CHECK-NEXT: br label [[T37:%.*]] ; CHECK: t37: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[LOAD_EXTEND]], [[TMP3:%.*]] ], [ [[REORDER_SHUFFLE:%.*]], [[T37]] ] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <4 x float> , [[TMP6]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds 
[[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 1 ; CHECK-NEXT: [[T31:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 2 @@ -25,7 +26,8 @@ ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T21]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[T88:%.*]] = bitcast float* [[T9]] to <2 x float>* -; CHECK-NEXT: [[T89]] = load <2 x float>, <2 x float>* [[T88]], align 4 +; CHECK-NEXT: [[T89:%.*]] = load <2 x float>, <2 x float>* [[T88]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE]] = shufflevector <2 x float> [[T89]], <2 x float> undef, <4 x i32> ; CHECK-NEXT: br i1 undef, label [[T37]], label [[T55:%.*]] ; CHECK: t55: ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -25,39 +25,37 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <2 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 ; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 -; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; 
CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE1]],
+; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121
+; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
 ; CHECK: for.body.for.body_crit_edge:
 ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
 ; CHECK-NEXT: br label [[FOR_BODY]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]]
-; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
+; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]]
 ; CHECK-NEXT: ret float [[ADD17]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -10,18 +10,10 @@
 define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
 ; CHECK-LABEL: @slp_schedule_bundle(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
-; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]],
-; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]],
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
-; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
-; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
-; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
-; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
-; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
-; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast ([1 x i32]* @b to <8 x i32>*), i32 4, <8 x i1> , <8 x i32> undef)
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[TMP0]],
+; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]],
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP2]], <8 x i32>* bitcast ([1 x i32]* @a to <8 x i32>*), i32 4, <8 x i1> )
 ; CHECK-NEXT: ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -27,7 +27,6 @@
 ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
 ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
@@ -42,13 +41,19 @@
 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0
-; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[T15]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[T9]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1
 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
-; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
+; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP6]], i32 4
+; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP7]], i32 5
 ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
 ; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]],
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -7,15 +7,17 @@
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A7:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7
+; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A2:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT: ret i32 [[TMP11]]
@@ -57,15 +59,17 @@
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A6:%.*]], i32 0
+; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A3:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT: ret i32 [[TMP11]]
@@ -111,15 +115,17 @@
 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
+; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A3:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A4:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A1:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT: ret i32 [[TMP11]]
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -126,12 +126,12 @@
 ; MAX256-NEXT: bb:
 ; MAX256-NEXT: br label [[BB1:%.*]]
 ; MAX256: bb1:
-; MAX256-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0
-; MAX256-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
-; MAX256-NEXT: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2
-; MAX256-NEXT: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3
-; MAX256-NEXT: [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float>
-; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <8 x i32>
+; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x half> undef, half [[HVAL:%.*]], i32 0
+; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[HVAL]], i32 1
+; MAX256-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[HVAL]], i32 2
+; MAX256-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[HVAL]], i32 3
+; MAX256-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[TMP3]] to <8 x float>
+; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> undef, <8 x i32>
 ; MAX256-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[FVAL:%.*]], i32 0
 ; MAX256-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[FVAL]], i32 1
 ; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[FVAL]], i32 2
@@ -302,12 +302,12 @@
 ; MAX1024-NEXT: bb:
 ; MAX1024-NEXT: br label [[BB1:%.*]]
 ; MAX1024: bb1:
-; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0
-; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
-; MAX1024-NEXT: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2
-; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3
-; MAX1024-NEXT: [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float>
-; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <32 x i32>
+; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <32 x half> undef, half [[HVAL:%.*]], i32 0
+; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <32 x half> [[TMP0]], half [[HVAL]], i32 1
+; MAX1024-NEXT: [[TMP2:%.*]] = insertelement <32 x half> [[TMP1]], half [[HVAL]], i32 2
+; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <32 x half> [[TMP2]], half [[HVAL]], i32 3
+; MAX1024-NEXT: [[TMP4:%.*]] = fpext <32 x half> [[TMP3]] to <32 x float>
+; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <32 x float> [[TMP4]], <32 x float> undef, <32 x i32>
 ; MAX1024-NEXT: [[TMP5:%.*]] = insertelement <32 x float> undef, float [[FVAL:%.*]], i32 0
 ; MAX1024-NEXT: [[TMP6:%.*]] = insertelement <32 x float> [[TMP5]], float [[FVAL]], i32 1
 ; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <32 x float> [[TMP6]], float [[FVAL]], i32 2